2 changes: 2 additions & 0 deletions .github/workflows/integration.yml
@@ -122,6 +122,8 @@ jobs:
           instance: inf2
         - test: TestNeuronxRollingBatch
           instance: inf2
+        - test: TestMultiModal
+          instance: g6
     steps:
       - uses: actions/checkout@v4
       - name: Clean env
50 changes: 50 additions & 0 deletions tests/integration/llm/client.py
@@ -760,6 +760,18 @@ def get_model_name():
     }
 }
 
+multi_modal_spec = {
+    "llava_v1.6-mistral": {
+        "batch_size": [1, 4]
+    },
+    "paligemma-3b-mix-448": {
+        "batch_size": [1, 4],
+    },
+    "phi-3-vision-128k-instruct": {
+        "batch_size": [1, 4],
+    }
+}
+
 
 def add_file_handler_to_logger(file_path: str):
     handler = logging.FileHandler(file_path, mode='w')
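Note on the spec shape above: test_multimodal() (added further down in this file) also honors an optional "tokenizer" key via spec.get("tokenizer", None), which awscurl_run can use for token counting. A minimal sketch of an entry carrying that key; the tokenizer id below is illustrative, not part of this PR:

    # Hypothetical spec entry: only "batch_size" appears in the PR;
    # "tokenizer" illustrates the optional key test_multimodal() reads.
    multi_modal_spec = {
        "llava_v1.6-mistral": {
            "batch_size": [1, 4],
            "tokenizer": "llava-hf/llava-v1.6-mistral-7b-hf",  # assumed id
        },
    }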
@@ -1430,6 +1442,42 @@ def test_correctness(model, model_spec):
     validate_correctness(dataset, data, score)
 
 
+def get_multimodal_prompt():
+    messages = [{
+        "role":
+        "user",
+        "content": [{
+            "type": "text",
+            "text": "What is this an image of?",
+        }, {
+            "type": "image_url",
+            "image_url": {
+                "url": "https://resources.djl.ai/images/dog_bike_car.jpg",
+            }
+        }]
+    }]
+    return {
+        "messages": messages,
+        "temperature": 0.9,
+        "top_p": 0.6,
+        "max_new_tokens": 512,
+    }
+
+
+def test_multimodal(model, model_spec):
+    if model not in model_spec:
+        raise ValueError(
+            f"{model} is not currently supported: {list(model_spec.keys())}")
+    spec = model_spec[model]
+    payload = get_multimodal_prompt()
+    for batch_size in spec["batch_size"]:
+        awscurl_run(payload,
+                    spec.get("tokenizer", None),
+                    batch_size,
+                    num_run=5,
+                    output=True)
+
+
 def run(raw_args):
     parser = argparse.ArgumentParser(description="Build the LLM configs")
     parser.add_argument("handler", help="the handler used in the model")
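The payload built by get_multimodal_prompt() follows the OpenAI-style chat schema: a messages list whose single user turn mixes a "text" part with an "image_url" part. A minimal sketch of posting that payload directly, assuming the container launched by the test listens on a local invocations endpoint (the URL and port are assumptions, not pinned down by this diff):

    import requests

    from client import get_multimodal_prompt

    # Assumed endpoint of a locally launched LMI container; the real
    # address used by awscurl_run is not visible in this diff.
    ENDPOINT = "http://127.0.0.1:8080/invocations"

    resp = requests.post(ENDPOINT, json=get_multimodal_prompt(), timeout=120)
    resp.raise_for_status()
    print(resp.json())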
@@ -1507,6 +1555,8 @@ def run(raw_args):
         test_handler_rolling_batch(args.model, no_code_rolling_batch_spec)
     elif args.handler == "correctness":
         test_correctness(args.model, correctness_model_spec)
+    elif args.handler == "multimodal":
+        test_multimodal(args.model, multi_modal_spec)
 
     else:
         raise ValueError(
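The new "multimodal" branch makes the handler reachable from the test driver the same way the other handlers are: the first positional argument selects the handler, the second the model key. A sketch of invoking it programmatically, assuming client.py is importable as client from the integration test directory:

    # Dispatches through run() to test_multimodal() with the
    # "llava_v1.6-mistral" entry of multi_modal_spec, exactly as
    # tests.py does below.
    import client

    client.run("multimodal llava_v1.6-mistral".split())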
12 changes: 11 additions & 1 deletion tests/integration/llm/prepare.py
@@ -574,6 +574,16 @@
"option.model_id": "s3://djl-llm/llama-2-tiny/",
"option.quantize": "awq",
"option.tensor_parallel_degree": 4
},
"llava_v1.6-mistral": {
"option.model_id": "s3://djl-llm/llava-v1.6-mistral-7b-hf/",
},
"paligemma-3b-mix-448": {
"option.model_id": "s3://djl-llm/paligemma-3b-mix-448/"
},
"phi-3-vision-128k-instruct": {
"option.model_id": "s3://djl-llm/phi-3-vision-128k-instruct/",
"option.trust_remote_code": True,
}
}

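Each entry in this prepare.py dict is a flat map of DJL LMI options (option.model_id, option.trust_remote_code, ...). A sketch of how such a map could be serialized to a serving.properties file; this is an assumption about what prepare.py does with these dicts, not code from the PR:

    # Hypothetical writer for the "phi-3-vision-128k-instruct" entry;
    # the real prepare.py may add an engine line and other defaults.
    options = {
        "option.model_id": "s3://djl-llm/phi-3-vision-128k-instruct/",
        "option.trust_remote_code": True,
    }

    with open("serving.properties", "w") as f:
        for key, value in options.items():
            # properties files are untyped text: booleans become true/false
            if isinstance(value, bool):
                value = str(value).lower()
            f.write(f"{key}={value}\n")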
@@ -784,7 +794,7 @@
"option.dtype": "fp16",
"option.tensor_parallel_degree": 4,
"option.max_rolling_batch_size": 4,
}
},
}

lmi_dist_aiccl_model_list = {
21 changes: 21 additions & 0 deletions tests/integration/tests.py
@@ -883,3 +883,24 @@ def test_llama3_1_8b(self):
             prepare.build_correctness_model("neuronx-llama3-1-8b")
             r.launch(container='pytorch-inf2-2')
             client.run("correctness neuronx-llama3-1-8b".split())
+
+
+class TestMultiModalLmiDist:
+
+    def test_llava_next(self):
+        with Runner('lmi', 'llava_v1.6-mistral') as r:
+            prepare.build_lmi_dist_model('llava_v1.6-mistral')
+            r.launch()
+            client.run("multimodal llava_v1.6-mistral".split())
+
+    def test_paligemma(self):
+        with Runner('lmi', 'paligemma-3b-mix-448') as r:
+            prepare.build_lmi_dist_model('paligemma-3b-mix-448')
+            r.launch()
+            client.run("multimodal paligemma-3b-mix-448".split())
+
+    def test_phi3_v(self):
+        with Runner('lmi', 'phi-3-vision-128k-instruct') as r:
+            prepare.build_lmi_dist_model('phi-3-vision-128k-instruct')
+            r.launch()
+            client.run("multimodal phi-3-vision-128k-instruct".split())