
Commit 6c3daf9

[lmi] Upgrade lmi dockerfile for 0.29.0 release

1 parent: 1038c63

File tree: 8 files changed, +86 -32 lines

.github/workflows/integration.yml

Lines changed: 6 additions & 0 deletions
@@ -143,6 +143,12 @@ jobs:
         sudo apt-get install python3 python-is-python3 python3-pip -y
     - name: Install pip dependencies
       run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
+    - name: Install torch
+      # Use torch to get cuda capability of current device to selectively run tests
+      # Torch version doesn't really matter that much
+      if: ${{ matrix.test.test == 'TestVllm1' }}
+      run: |
+        pip3 install torch==2.3.0
     - name: Install awscurl
       working-directory: tests/integration
       run: |
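
For context, a minimal sketch of how the torch wheel installed above can be used to read the device's CUDA compute capability, which is what gates the new FP8 test; the query mirrors the helper added in tests/integration/tests.py further down.

# Minimal sketch: read the CUDA compute capability with the torch installed in
# the "Install torch" step above. Any CUDA-enabled torch build works.
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    print(f"CUDA compute capability: {10 * major + minor}")  # e.g. 89 on Ada GPUs
else:
    print("No CUDA device visible; capability-gated tests will be skipped.")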

engines/python/setup/djl_python/properties_manager/hf_properties.py

Lines changed: 5 additions & 0 deletions
@@ -23,6 +23,11 @@ class HFQuantizeMethods(str, Enum):
     # supported by vllm
     awq = 'awq'
     deepspeedfp = 'deepspeedfp'
+    fp8 = 'fp8'
+    gptq_marlin = 'gptq_marlin'
+    gptq_marlin_24 = 'gptq_marlin_24'
+    marlin = 'marlin'
+    squeezellm = 'squeezellm'


 def get_torch_dtype_from_str(dtype: str):
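
Because HFQuantizeMethods subclasses both str and Enum, the new members resolve directly from the string passed via option.quantize. A minimal sketch; the enum members are only those visible in this hunk, and the lookup code is illustrative, not taken from the file:

from enum import Enum

class HFQuantizeMethods(str, Enum):
    # only the members visible in this hunk
    awq = 'awq'
    deepspeedfp = 'deepspeedfp'
    fp8 = 'fp8'
    gptq_marlin = 'gptq_marlin'
    gptq_marlin_24 = 'gptq_marlin_24'
    marlin = 'marlin'
    squeezellm = 'squeezellm'

# A raw option.quantize string maps straight onto an enum member...
method = HFQuantizeMethods('fp8')
assert method is HFQuantizeMethods.fp8 and method.value == 'fp8'

# ...and an unsupported value raises ValueError, surfacing a config error early.
try:
    HFQuantizeMethods('int3')
except ValueError as err:
    print(f"rejected: {err}")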

engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py

Lines changed: 5 additions & 1 deletion
@@ -20,9 +20,13 @@


 class LmiDistQuantizeMethods(str, Enum):
     awq = 'awq'
+    deepspeedfp = 'deepspeedfp'
+    fp8 = 'fp8'
     gptq = 'gptq'
+    gptq_marlin = 'gptq_marlin'
+    gptq_marlin_24 = 'gptq_marlin_24'
+    marlin = 'marlin'
     squeezellm = 'squeezellm'
-    deepspeedfp = 'deepspeedfp'


 class LmiDistRbProperties(Properties):
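
The enum typically backs a validated field on the rolling-batch properties class, so an invalid option.quantize fails fast. A hedged sketch under that assumption; the field name and the stand-in BaseModel are illustrative, only the enum values come from the diff:

from enum import Enum
from typing import Optional
from pydantic import BaseModel, ValidationError

class LmiDistQuantizeMethods(str, Enum):
    awq = 'awq'
    deepspeedfp = 'deepspeedfp'
    fp8 = 'fp8'
    gptq = 'gptq'
    gptq_marlin = 'gptq_marlin'
    gptq_marlin_24 = 'gptq_marlin_24'
    marlin = 'marlin'
    squeezellm = 'squeezellm'

# Stand-in for LmiDistRbProperties, which extends a shared Properties base not shown here.
class LmiDistRbPropertiesSketch(BaseModel):
    quantize: Optional[LmiDistQuantizeMethods] = None

print(LmiDistRbPropertiesSketch(quantize="gptq_marlin").quantize)  # accepted

try:
    LmiDistRbPropertiesSketch(quantize="int3")
except ValidationError as err:
    print("rejected unsupported quantization:", err.errors()[0]["type"])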

engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py

Lines changed: 5 additions & 1 deletion
@@ -20,9 +20,13 @@


 class VllmQuantizeMethods(str, Enum):
     awq = 'awq'
+    deepspeedfp = 'deepspeedfp'
+    fp8 = 'fp8'
     gptq = 'gptq'
+    gptq_marlin = 'gptq_marlin'
+    gptq_marlin_24 = 'gptq_marlin_24'
+    marlin = 'marlin'
     squeezellm = 'squeezellm'
-    deepspeedfp = 'deepspeedfp'


 class VllmRbProperties(Properties):
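
Once validated, the quantization name is ultimately handed to the vLLM engine as a plain string. A hedged sketch of that hand-off; vLLM 0.5.1 exposes a quantization argument on EngineArgs, but the exact wiring inside djl_python's vLLM handler is not part of this diff:

# Hedged sketch: passing the validated quantize value to vLLM 0.5.1.
from vllm import EngineArgs

engine_args = EngineArgs(
    model="neuralmagic/Qwen2-7B-Instruct-FP8",  # FP8 checkpoint used by the new test below
    quantization="fp8",           # VllmQuantizeMethods.fp8.value after validation
    tensor_parallel_size=4,       # matches option.tensor_parallel_degree in prepare.py
    max_num_seqs=4,               # matches option.max_rolling_batch_size in prepare.py
)
print(engine_args.quantization)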

serving/docker/lmi.Dockerfile

Lines changed: 13 additions & 19 deletions
@@ -19,22 +19,22 @@ ARG torch_version=2.3.0
 ARG torch_vision_version=0.18.0
 ARG onnx_version=1.18.0
 ARG onnxruntime_wheel="https://publish.djl.ai/onnxruntime/1.18.0/onnxruntime_gpu-1.18.0-cp310-cp310-linux_x86_64.whl"
-ARG pydantic_version=2.7.1
+ARG pydantic_version=2.8.2
 ARG djl_converter_wheel="https://publish.djl.ai/djl_converter/djl_converter-0.28.0-py3-none-any.whl"
-ARG vllm_cuda_name="cu12"
-ARG vllm_nccl_version=2.18.1
 # HF Deps
 ARG protobuf_version=3.20.3
-ARG transformers_version=4.41.1
-ARG accelerate_version=0.30.1
+ARG transformers_version=4.42.3
+ARG accelerate_version=0.32.1
 ARG bitsandbytes_version=0.43.1
-ARG optimum_version=1.20.0
+ARG optimum_version=1.21.2
 ARG auto_gptq_version=0.7.1
-ARG datasets_version=2.19.1
+ARG datasets_version=2.20.0
 ARG autoawq_version=0.2.5
+ARG tokenizers_version=0.19.1
 # LMI-Dist Deps
-ARG vllm_wheel="https://publish.djl.ai/vllm/cu124-pt230/vllm-0.4.2%2Bcu124-cp310-cp310-linux_x86_64.whl"
-ARG flash_attn_2_wheel="https://publish.djl.ai/flash_attn/cu124-pt230/flash_attn-2.5.8-cp310-cp310-linux_x86_64.whl"
+ARG vllm_wheel="https://github.com/vllm-project/vllm/releases/download/v0.5.1/vllm-0.5.1-cp310-cp310-manylinux1_x86_64.whl"
+ARG flash_attn_2_wheel="https://github.com/vllm-project/flash-attention/releases/download/v2.5.9/vllm_flash_attn-2.5.9-cp310-cp310-manylinux1_x86_64.whl"
+ARG flash_infer_wheel="https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl"
 # %2B is the url escape for the '+' character
 ARG lmi_dist_wheel="https://publish.djl.ai/lmi_dist/lmi_dist-11.0.0%2Bnightly-py3-none-any.whl"
 ARG seq_scheduler_wheel="https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl"

@@ -57,7 +57,6 @@ ENV PYTORCH_PRECXX11=true
 ENV PYTORCH_VERSION=${torch_version}
 ENV PYTORCH_FLAVOR=cu121-precxx11
 ENV VLLM_NO_USAGE_STATS=1
-ENV VLLM_CONFIG_ROOT=/opt/djl/vllm/.config


 ENV HF_HOME=/tmp/.cache/huggingface

@@ -102,23 +101,18 @@ RUN pip3 install torch==${torch_version} torchvision==${torch_vision_version} --
     transformers==${transformers_version} hf-transfer zstandard datasets==${datasets_version} \
     mpi4py sentencepiece tiktoken blobfile einops accelerate==${accelerate_version} bitsandbytes==${bitsandbytes_version} \
     optimum==${optimum_version} auto-gptq==${auto_gptq_version} pandas pyarrow jinja2 \
-    opencv-contrib-python-headless safetensors scipy onnx sentence_transformers ${onnxruntime_wheel} autoawq==${autoawq_version} && \
+    opencv-contrib-python-headless safetensors scipy onnx sentence_transformers ${onnxruntime_wheel} autoawq==${autoawq_version} \
+    tokenizers==${tokenizers_version} pydantic==${pydantic_version} && \
     pip3 install ${djl_converter_wheel} --no-deps && \
+    git clone https://github.com/neuralmagic/AutoFP8.git && cd AutoFP8 && git reset --hard 4b2092c && pip3 install . && cd .. && rm -rf AutoFP8 \
     pip3 cache purge

-RUN pip3 install ${flash_attn_2_wheel} ${lmi_dist_wheel} ${vllm_wheel} pydantic==${pydantic_version} && \
+RUN pip3 install ${flash_attn_2_wheel} ${lmi_dist_wheel} ${vllm_wheel} ${flash_infer_wheel} && \
     pip3 cache purge

 # Add CUDA-Compat
 RUN apt-get update && apt-get install -y cuda-compat-12-4 && apt-get clean -y && rm -rf /var/lib/apt/lists/*

-# We use the same NCCL version as vLLM for lmi-dist https://github.com/vllm-project/vllm/blob/v0.4.2/vllm/utils.py#L641-L646
-# This is due to https://github.com/vllm-project/vllm/blob/v0.4.2/vllm/distributed/device_communicators/pynccl.py#L1-L9
-RUN mkdir -p ${VLLM_CONFIG_ROOT}/vllm/nccl/$vllm_cuda_name/ && curl -L -o ${VLLM_CONFIG_ROOT}/vllm/nccl/$vllm_cuda_name/libnccl.so.$vllm_nccl_version \
-    https://github.com/vllm-project/vllm-nccl/releases/download/v0.1.0/$vllm_cuda_name-libnccl.so.$vllm_nccl_version && \
-    # The following is done only so that we can run the CI with `-u djl`. Sagemaker wouldn't require this.
-    chmod -R a+w ${VLLM_CONFIG_ROOT}
-
 RUN scripts/patch_oss_dlc.sh python && \
     scripts/security_patch.sh lmi && \
     useradd -m -d /home/djl djl && \
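
A quick way to confirm the upgraded pins actually land in the built image is to compare installed versions against the build args. A hedged sketch; the expected versions are copied from the ARGs above, and the script itself is not part of the Dockerfile:

# Hedged sanity check for the upgraded stack inside the image.
from importlib.metadata import version, PackageNotFoundError

expected = {
    "pydantic": "2.8.2",
    "transformers": "4.42.3",
    "accelerate": "0.32.1",
    "optimum": "1.21.2",
    "datasets": "2.20.0",
    "tokenizers": "0.19.1",
    "vllm": "0.5.1",
    "flashinfer": "0.0.8+cu121torch2.3",  # distribution name assumed from the wheel filename
}

for pkg, want in expected.items():
    try:
        got = version(pkg)
    except PackageNotFoundError:
        got = "not installed"
    print(f"{pkg:12s} expected {want:22s} got {got}")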

tests/integration/llm/client.py

Lines changed: 15 additions & 0 deletions
@@ -276,6 +276,12 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "amazon/MegaBeam-Mistral-7B-300k"
     },
+    "mistral-7b-marlin": {
+        "max_memory_per_gpu": [23.0],
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "amazon/MegaBeam-Mistral-7B-300k"
+    },
     "llama-7b-unmerged-lora": {
         "max_memory_per_gpu": [15.0, 15.0],
         "batch_size": [3],

@@ -376,6 +382,15 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
     },
+    "qwen2-7b-fp8": {
+        "max_memory_per_gpu": [23.0],
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "Qwen/Qwen-7B",
+        "parameters": {
+            "decoder_input_details": True
+        }
+    },
     "llama-7b-unmerged-lora": {
         "max_memory_per_gpu": [15.0, 15.0],
         "batch_size": [3],

tests/integration/llm/prepare.py

Lines changed: 14 additions & 11 deletions
@@ -418,6 +418,13 @@
         "option.max_rolling_batch_size": 4,
         "option.quantize": "awq"
     },
+    "mistral-7b-marlin": {
+        "option.model_id": "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
+        "option.task": "text-generation",
+        "option.tensor_parallel_degree": 4,
+        "option.max_rolling_batch_size": 4,
+        "option.quantize": "marlin"
+    },
     "llama-7b-unmerged-lora": {
         "option.model_id": "s3://djl-llm/huggyllama-llama-7b",
         "option.tensor_parallel_degree": "max",

@@ -447,17 +454,6 @@
         "option.tensor_parallel_degree": 4,
         "option.max_rolling_batch_size": 4
     },
-    "llama-7b-unmerged-lora": {
-        "option.model_id": "s3://djl-llm/huggyllama-llama-7b",
-        "option.tensor_parallel_degree": "max",
-        "option.task": "text-generation",
-        "option.dtype": "fp16",
-        "option.adapters": "adapters",
-        "option.enable_lora": "true",
-        "adapter_ids": ["tloen/alpaca-lora-7b", "22h/cabrita-lora-v0-1"],
-        "adapter_names": ["english-alpaca", "portugese-alpaca"],
-        "option.gpu_memory_utilization": "0.8",
-    },
     "llama2-13b-awq-unmerged-lora": {
         "option.model_id":
         "s3://djl-llm/TheBloke-Llama-2-13b-Chat-AWQ/",

@@ -602,6 +598,13 @@
         "option.max_rolling_batch_size": 32,
         "option.output_formatter": "jsonlines"
     },
+    "qwen2-7b-fp8": {
+        "option.model_id": "neuralmagic/Qwen2-7B-Instruct-FP8",
+        "option.task": "text-generation",
+        "option.tensor_parallel_degree": 4,
+        "option.max_rolling_batch_size": 4,
+        "option.quantize": "fp8"
+    },
     "llama-7b-unmerged-lora": {
         "option.model_id": "s3://djl-llm/huggyllama-llama-7b",
         "option.tensor_parallel_degree": "max",

tests/integration/tests.py

Lines changed: 23 additions & 0 deletions
@@ -11,6 +11,15 @@
 djl_version = os.environ.get('TEST_DJL_VERSION', '').strip()


+def is_applicable_cuda_capability(arch: int) -> bool:
+    import torch
+    if not torch.cuda.is_available():
+        return False
+
+    major, minor = torch.cuda.get_device_capability()
+    return (10 * major + minor) >= arch
+
+
 class Runner:

     def __init__(self, container, test_name=None, download=False):

@@ -383,6 +392,12 @@ def test_mpt_7b(self):
             r.launch()
             client.run("lmi_dist mpt-7b".split())

+    def test_mistral_7b_marlin(self):
+        with Runner('lmi', 'mistral-7b-marlin') as r:
+            prepare.build_lmi_dist_model("mistral-7b-marlin")
+            r.launch()
+            client.run("lmi_dist mistral-7b-marlin".split())
+
     def test_llama2_tiny_autoawq(self):
         with Runner('lmi', 'llama-2-tiny-autoawq') as r:
             prepare.build_lmi_dist_model("llama-2-tiny")

@@ -494,6 +509,14 @@ def test_llama2_7b_chat(self):
             r.launch()
             client.run("vllm_chat llama2-7b-chat".split())

+    @pytest.mark.skipif(not is_applicable_cuda_capability(89),
+                        reason="Unsupported CUDA capability")
+    def test_qwen2_7b_fp8(self):
+        with Runner('lmi', 'qwen2-7b-fp8') as r:
+            prepare.build_vllm_model("qwen2-7b-fp8")
+            r.launch()
+            client.run("vllm qwen2-7b-fp8".split())
+

 class TestVllmLora:
     # Runs on g5.12xl
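
The capability gate is just 10 * major + minor compared against a threshold: FP8 kernels need compute capability 8.9 or newer, so the check passes on Ada and Hopper parts and skips the test on the A10G (8.6) found in g5 instances. A small worked sketch of the same arithmetic:

# Worked sketch of the gate added above: (10 * major + minor) >= arch, with the
# threshold 89 coming from the @pytest.mark.skipif decorator on test_qwen2_7b_fp8.
def is_applicable(capability: tuple[int, int], arch: int = 89) -> bool:
    major, minor = capability
    return (10 * major + minor) >= arch

print(is_applicable((8, 6)))   # A10G (g5 instances)  -> False, FP8 test skipped
print(is_applicable((8, 9)))   # L4 / L40 (Ada)       -> True
print(is_applicable((9, 0)))   # H100 (Hopper)        -> True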
