Commit 3ef1bb7

Merge branch 'master' into v10_neo_patches
2 parents c1556ec + c578e6f commit 3ef1bb7

25 files changed: +904 −1560 lines changed


.github/workflows/client-test.yml

Lines changed: 2 additions & 2 deletions
@@ -62,7 +62,7 @@ jobs:
           cd tests
           djl-serving -m test::Python=file://$PWD/python &> output.log &
           sleep 15
-          python test_client.py
+          python integration/test_client.py
           jobs
           kill %1
       - name: On failure step
@@ -109,7 +109,7 @@ jobs:
           ./gradlew --stop
           ./gradlew :serving:run --args="-m test::Python=file:$(pwd -W)/tests/python" &> output.log &
           sleep 30
-          cd tests/ && python test_client.py
+          cd tests/ && python integration/test_client.py
       - name: On failure step
         if: ${{ failure() }}
         shell: bash

.github/workflows/llm_inf2_integration.yml

Lines changed: 0 additions & 463 deletions
This file was deleted.

.github/workflows/llm_integration.yml

Lines changed: 96 additions & 896 deletions
Large diffs are not rendered by default.

engines/python/setup/djl_python/huggingface.py

Lines changed: 40 additions & 60 deletions
@@ -22,16 +22,17 @@
     AutoModelForQuestionAnswering, StoppingCriteria, StoppingCriteriaList)
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from peft import PeftConfig, PeftModel, PeftModelForCausalLM
-from typing import Tuple, List
+from typing import Tuple, List, Callable, Dict

 from djl_python.encode_decode import encode
 from djl_python.inputs import Input
 from djl_python.outputs import Output
+from djl_python.rolling_batch.rolling_batch import RollingBatch
 from djl_python.streaming_utils import StreamingUtils

 from djl_python.properties_manager.properties import StreamingEnum, is_rolling_batch_enabled, is_streaming_enabled
 from djl_python.properties_manager.hf_properties import HuggingFaceProperties
-from djl_python.utils import parse_input_with_formatter, InputFormatConfigs
+from djl_python.utils import parse_input_with_formatter, InputFormatConfigs, ParsedInput, rolling_batch_inference

 ARCHITECTURES_2_TASK = {
     "TapasForQuestionAnswering": "table-question-answering",
@@ -140,6 +141,7 @@ def __init__(self):
         self.adapters = None
         self.hf_configs = None
         self.input_format_configs = None
+        self.parsed_input = None

     def initialize(self, properties: dict):
         self.hf_configs = HuggingFaceProperties(**properties)
@@ -230,13 +232,14 @@ def parse_input(
        :return batch (list): a list of Input objects contained in inputs (each one corresponds to a request)
        """

-        parsed_input = parse_input_with_formatter(inputs,
-                                                  self.input_format_configs,
-                                                  self.adapter_registry)
-        self.adapters = parsed_input.adapters if parsed_input.found_adapters else None
-        return parsed_input.input_data, parsed_input.input_size, parsed_input.parameters, parsed_input.errors, parsed_input.batch
+        self.parsed_input = parse_input_with_formatter(
+            inputs, self.input_format_configs, self.adapter_registry)
+        self.adapters = self.parsed_input.adapters
+        return (self.parsed_input.input_data, self.parsed_input.input_size,
+                self.parsed_input.parameters, self.parsed_input.errors,
+                self.parsed_input.batch)

-    def inference(self, inputs):
+    def inference(self, inputs: Input) -> Output:
         outputs = Output()

         input_data, input_size, parameters, errors, batch = self.parse_input(
@@ -254,70 +257,28 @@ def inference(self, inputs):
             return outputs

         if is_rolling_batch_enabled(self.hf_configs.rolling_batch):
-            if inputs.get_property("reset_rollingbatch"):
-                self.rolling_batch.reset()
-            if self.adapters is not None:
-                adapter_data = [
-                    self.adapter_registry.get(adapter, None)
-                    for adapter in self.adapters
-                ]
-            else:
-                adapter_data = None
-            result = self.rolling_batch.inference(input_data,
-                                                  parameters,
-                                                  adapters=adapter_data)
-            idx = 0
-            for i in range(len(batch)):
-                err = errors.get(i)
-                if err:
-                    err = {"data": "", "last": True, "code": 424, "error": err}
-                    outputs.add(Output.binary_encode(err),
-                                key="data",
-                                batch_index=i)
-                    outputs.add_property(f"batch_{i}_Content-Type",
-                                         "application/json")
-                else:
-                    content_type = result[idx].pop("content_type")
-                    outputs.add(Output.binary_encode(result[idx]),
-                                key="data",
-                                batch_index=i)
-                    if content_type is not None:
-                        outputs.add_property(f"batch_{i}_Content-Type",
-                                             content_type)
-                    idx += 1
-
-            return outputs
+            return rolling_batch_inference(self.parsed_input, inputs, outputs,
+                                           self.rolling_batch)
         elif is_streaming_enabled(self.hf_configs.enable_streaming):
-            if len(batch) > 1:
-                raise NotImplementedError(
-                    "Dynamic batch not supported for generic streaming")
-            outputs.add_property("content-type", "application/jsonlines")
-            if self.hf_configs.enable_streaming.value == StreamingEnum.huggingface.value:
-                outputs.add_stream_content(
-                    StreamingUtils.use_hf_default_streamer(
-                        self.model, self.tokenizer, input_data,
-                        self.hf_configs.device, **parameters[0]))
-            else:
-                stream_generator = StreamingUtils.get_stream_generator(
-                    "Accelerate")
-                outputs.add_stream_content(
-                    stream_generator(self.model, self.tokenizer, input_data,
-                                     self.hf_configs.device, **parameters[0]))
-            return outputs
+            return self._streaming_inference(batch, input_data, outputs,
+                                             parameters)
+        else:
+            return self._dynamic_batch_inference(batch, errors, input_data,
+                                                 input_size, inputs, outputs,
+                                                 parameters)

+    def _dynamic_batch_inference(self, batch, errors, input_data, input_size,
+                                 inputs, outputs, parameters):
         if not all(p == parameters[0] for p in parameters):
             raise ValueError(
                 "In order to enable dynamic batching, all input batches must have the same parameters"
             )
-
         if isinstance(self.model, PeftModelForCausalLM):
             if self.adapters is None:
                 # Inference with only base model
                 self.adapters = [""] * len(input_data)
             parameters[0]["adapters"] = self.adapters
-
         prediction = self.hf_pipeline(input_data, **parameters[0])
-
         offset = 0
         for i, item in enumerate(batch):
             content_type = item.get_property("Content-Type")
@@ -341,7 +302,26 @@ def inference(self, inputs):
                         accept,
                         key=inputs.get_content().key_at(i))
             offset += input_size[i]
+        return outputs

+    def _streaming_inference(self, batch, input_data, outputs, parameters):
+        if len(batch) > 1:
+            raise NotImplementedError(
+                "Dynamic batch not supported for generic streaming")
+        outputs.add_property("content-type", "application/jsonlines")
+        if self.hf_configs.enable_streaming.value == StreamingEnum.huggingface.value:
+            outputs.add_stream_content(
+                StreamingUtils.use_hf_default_streamer(self.model,
+                                                       self.tokenizer,
+                                                       input_data,
+                                                       self.hf_configs.device,
+                                                       **parameters[0]))
+        else:
+            stream_generator = StreamingUtils.get_stream_generator(
+                "Accelerate")
+            outputs.add_stream_content(
+                stream_generator(self.model, self.tokenizer, input_data,
+                                 self.hf_configs.device, **parameters[0]))
         return outputs

     def get_pipeline(self, task: str, model_id_or_path: str, kwargs):
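
In this refactor, the per-request packing of rolling-batch results into the Output object moves out of huggingface.py into a shared rolling_batch_inference helper in djl_python.utils. That helper's real implementation is not part of this commit; the sketch below is inferred from the call site and from the inline code deleted above, with adapter resolution omitted, so the exact signature and ParsedInput fields should be treated as assumptions.

# Hypothetical sketch of djl_python.utils.rolling_batch_inference, inferred
# from the deleted inline code above; the actual helper may differ.
from djl_python.outputs import Output


def rolling_batch_inference(parsed_input, inputs, outputs, rolling_batch):
    if inputs.get_property("reset_rollingbatch"):
        rolling_batch.reset()
    # Run one rolling-batch step for every request in the batch.
    result = rolling_batch.inference(parsed_input.input_data,
                                     parsed_input.parameters)
    idx = 0
    for i in range(len(parsed_input.batch)):
        err = parsed_input.errors.get(i)
        if err:
            # Requests that failed parsing get a per-item 424 error payload.
            err = {"data": "", "last": True, "code": 424, "error": err}
            outputs.add(Output.binary_encode(err), key="data", batch_index=i)
            outputs.add_property(f"batch_{i}_Content-Type", "application/json")
        else:
            content_type = result[idx].pop("content_type")
            outputs.add(Output.binary_encode(result[idx]),
                        key="data",
                        batch_index=i)
            if content_type is not None:
                outputs.add_property(f"batch_{i}_Content-Type", content_type)
            idx += 1
    return outputs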

engines/python/setup/djl_python/neuron_utils/model_loader.py

Lines changed: 5 additions & 0 deletions
@@ -28,6 +28,11 @@
 from djl_python.neuron_utils.utils import NeuronXModelAdapter, get_neuronxcc_version
 from huggingface_hub import hf_hub_download

+# Temporary Fix: These loggers are disabled during vLLM import.
+# Remove when fixed in vLLM
+logging.getLogger("NEURON_CC_WRAPPER").disabled = False
+logging.getLogger("NEURON_CACHE").disabled = False
+

 class ModelLoader(ABC):
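
The two added lines undo the side effect called out in the comment: something in the vLLM import path sets disabled = True on these named loggers, which silently drops every record they emit. Re-enabling them is plain standard-library behavior; a minimal sketch (logger name taken from the diff):

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("NEURON_CC_WRAPPER")

log.disabled = True
log.info("compiling model...")   # dropped: a disabled logger emits nothing

log.disabled = False             # what the patch does after the vLLM import
log.info("compiling model...")   # emitted again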

engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ class LmiDistRbProperties(Properties):
     speculative_length: int = 5
     draft_model_tp_size: int = 1
     record_acceptance_rate: Optional[bool] = False
+    speculative_telemetry: Optional[bool] = True
     enable_lora: Optional[bool] = False
     max_loras: Optional[int] = 4
     max_lora_rank: Optional[int] = 16

engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py

Lines changed: 17 additions & 8 deletions
@@ -11,6 +11,7 @@
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
 import logging
+import os
 from typing import List
 from collections import OrderedDict, defaultdict

@@ -26,6 +27,7 @@
     get_speculative_decoding_metrics_record, update_request_cache_with_output,
     supports_speculative_decoding, get_lora_request_params, DTYPE_MAPPER,
     FINISH_REASON_MAPPER)
+from djl_python.telemetry import telemetry_manager
 from djl_python.properties_manager.lmi_dist_rb_properties import LmiDistRbProperties

 _WARMUP_PREFILL_TOKENS = 4096
@@ -187,14 +189,21 @@ def inference(self,
                 self.request_cache, request_output, self.get_tokenizer())
             # Record SD metrics
             completion_output = request_output.outputs[0]
-            if self.lmi_dist_config.record_acceptance_rate and request_output.finished:
-                if self.supports_speculative_decoding and completion_output.acceptance_history:
-                    record = get_speculative_decoding_metrics_record(
-                        completion_output, request_output)
-                    logging.info(f"Speculative Decoding {record}")
-                else:
-                    logging.warning(
-                        f"Ignoring logging speculative decoding metrics")
+            if (
+                    self.lmi_dist_config.record_acceptance_rate
+                    or self.lmi_dist_config.speculative_telemetry
+            ) and self.lmi_dist_config.speculative_draft_model and request_output.finished:
+                try:
+                    if self.supports_speculative_decoding and completion_output.acceptance_history:
+                        record = get_speculative_decoding_metrics_record(
+                            completion_output, request_output)
+                        if self.lmi_dist_config.record_acceptance_rate:
+                            logging.info(f"Speculative Decoding {record}")
+                        if self.lmi_dist_config.speculative_telemetry and os.environ.get(
+                                "SAGEMAKER_SECURE_MODE") == "true":
+                            telemetry_manager.record_speculative(record)
+                except:
+                    logging.debug("SD telemetry collection failed, ignore")

         for request in self.active_requests:
             request_output = request.request_output
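
The rewritten condition records metrics only for finished requests when a draft model is actually configured, and it sends telemetry only when SAGEMAKER_SECURE_MODE is set; any failure in the telemetry path is swallowed by the bare except so it cannot break inference. A small sketch of that gating as standalone predicates (config field names from the diff, function names hypothetical):

import os


def should_record_speculative(config, request_output) -> bool:
    # Metrics are considered only when a draft model is configured and the
    # request has finished generating.
    wants_metrics = (config.record_acceptance_rate
                     or config.speculative_telemetry)
    return (wants_metrics and bool(config.speculative_draft_model)
            and request_output.finished)


def should_send_telemetry(config) -> bool:
    # Telemetry is emitted only inside SageMaker secure mode.
    return (config.speculative_telemetry
            and os.environ.get("SAGEMAKER_SECURE_MODE") == "true")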

engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py

Lines changed: 2 additions & 0 deletions
@@ -166,6 +166,8 @@ def get_speculative_decoding_metrics_record(
             completion_output.acceptance_history)
     else:
         record["mean_acceptance"] = 0
+    record["acceptance_history_len"] = len(
+        completion_output.acceptance_history)
     record["prompt_size"] = len(request_output.prompt_token_ids)
     record["output_size"] = len(completion_output.token_ids)
     return record
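
With acceptance_history_len added, the record now carries both the mean acceptance and the number of observations behind it, which is what the weighted aggregation in the new telemetry module needs. An illustrative record (field names from this function, values made up):

# Illustrative metrics record; the numbers are invented, not from a real run.
record = {
    "mean_acceptance": 3.2,        # presumably the mean over completion_output.acceptance_history
    "acceptance_history_len": 17,  # new: how many entries that mean covers
    "prompt_size": 412,            # len(request_output.prompt_token_ids)
    "output_size": 256,            # len(completion_output.token_ids)
}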

engines/python/setup/djl_python/sm_log_filter.py

Lines changed: 4 additions & 1 deletion
@@ -19,7 +19,10 @@

 # https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/logging-and-monitoring.html
 class SMLogFilter(logging.Filter):
-    sm_log_markers = ['ModelServerError', 'UserScriptError', 'SysHealth']
+    sm_log_markers = [
+        'ModelServerError', 'UserScriptError', 'SysHealth',
+        'ModelServerTelemetry'
+    ]
     counter = defaultdict(int)

     def filter(self, record):
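
Adding 'ModelServerTelemetry' to sm_log_markers lets the SageMaker log filter recognize the telemetry line emitted by the new TelemetryManager below. The filter body is not part of this diff, but attaching a logging.Filter to a handler is the standard mechanism; a minimal sketch, assuming the filter is wired to whatever handler serves the SageMaker log stream:

import logging

from djl_python.sm_log_filter import SMLogFilter

handler = logging.StreamHandler()
handler.addFilter(SMLogFilter())          # the filter sees every record on this handler
logging.getLogger().addHandler(handler)

# This message contains one of SMLogFilter.sm_log_markers, so the filter will
# recognize it (exact handling depends on filter(), which is not shown here).
logging.getLogger().info(
    "ModelServerTelemetry: Speculative Decoding Mean Acceptance: 2.5 rate")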
engines/python/setup/djl_python/telemetry.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+#
+# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file
+# except in compliance with the License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
+# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
+# the specific language governing permissions and limitations under the License.
+
+import logging
+import time
+
+SPECULATIVE_FREQUENCY_SEC = 30.0
+
+
+class TelemetryManager:
+
+    def __init__(self):
+        self.reset_speculative()
+
+    def record_speculative(self, data):
+        self.speculative_acceptance_rate_count = self.speculative_acceptance_rate_count + data[
+            "acceptance_history_len"]
+        self.speculative_acceptance_rate_total = self.speculative_acceptance_rate_total + data[
+            "mean_acceptance"] * data["acceptance_history_len"]
+        if time.time(
+        ) - self.speculative_sent_time > SPECULATIVE_FREQUENCY_SEC:
+            mean_acceptance = 1.0 * self.speculative_acceptance_rate_total / self.speculative_acceptance_rate_count
+            logging.info(
+                f"ModelServerTelemetry: Speculative Decoding Mean Acceptance: {mean_acceptance} rate"
+            )
+            self.reset_speculative()
+
+    def reset_speculative(self):
+        self.speculative_sent_time = time.time()
+        self.speculative_acceptance_rate_count = 0
+        self.speculative_acceptance_rate_total = 0.0
+
+
+telemetry_manager = TelemetryManager()
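
A short worked example of how the manager aggregates the records produced above into a weighted running mean and flushes it at most every SPECULATIVE_FREQUENCY_SEC seconds (numbers made up):

from djl_python.telemetry import TelemetryManager

tm = TelemetryManager()
# Two records arriving within one 30-second window:
tm.record_speculative({"mean_acceptance": 4.0, "acceptance_history_len": 10})
tm.record_speculative({"mean_acceptance": 2.0, "acceptance_history_len": 30})
# Running totals: count = 10 + 30 = 40, total = 4.0*10 + 2.0*30 = 100.0
# Once more than SPECULATIVE_FREQUENCY_SEC (30 s) has elapsed since the last
# flush, the next record_speculative() call logs
#   "ModelServerTelemetry: Speculative Decoding Mean Acceptance: 2.5 rate"
# and resets the counters.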
