
Commit 6c3daf9

[lmi] Upgrade lmi dockerfile for 0.29.0 release

1 parent: 1038c63

File tree: 8 files changed, +86 -32 lines

.github/workflows/integration.yml

Lines changed: 6 additions & 0 deletions
@@ -143,6 +143,12 @@ jobs:
         sudo apt-get install python3 python-is-python3 python3-pip -y
     - name: Install pip dependencies
       run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
+    - name: Install torch
+      # Use torch to get cuda capability of current device to selectively run tests
+      # Torch version doesn't really matter that much
+      if: ${{ matrix.test.test == 'TestVllm1' }}
+      run: |
+        pip3 install torch==2.3.0
     - name: Install awscurl
       working-directory: tests/integration
       run: |
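
For context, a minimal sketch of how the torch wheel installed above can be used to read the device's CUDA compute capability, which is what gates the new FP8 test; the query mirrors the helper added in tests/integration/tests.py further down.

# Minimal sketch: read the CUDA compute capability with the torch installed in
# the "Install torch" step above. Any CUDA-enabled torch build works.
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    print(f"CUDA compute capability: {10 * major + minor}")  # e.g. 89 on Ada GPUs
else:
    print("No CUDA device visible; capability-gated tests will be skipped.")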

engines/python/setup/djl_python/properties_manager/hf_properties.py

Lines changed: 5 additions & 0 deletions
@@ -23,6 +23,11 @@ class HFQuantizeMethods(str, Enum):
     # supported by vllm
     awq = 'awq'
     deepspeedfp = 'deepspeedfp'
+    fp8 = 'fp8'
+    gptq_marlin = 'gptq_marlin'
+    gptq_marlin_24 = 'gptq_marlin_24'
+    marlin = 'marlin'
+    squeezellm = 'squeezellm'


 def get_torch_dtype_from_str(dtype: str):
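
Because HFQuantizeMethods subclasses both str and Enum, the new members resolve directly from the string passed via option.quantize. A minimal sketch; the enum members are only those visible in this hunk, and the lookup code is illustrative, not taken from the file:

from enum import Enum

class HFQuantizeMethods(str, Enum):
    # only the members visible in this hunk
    awq = 'awq'
    deepspeedfp = 'deepspeedfp'
    fp8 = 'fp8'
    gptq_marlin = 'gptq_marlin'
    gptq_marlin_24 = 'gptq_marlin_24'
    marlin = 'marlin'
    squeezellm = 'squeezellm'

# A raw option.quantize string maps straight onto an enum member...
method = HFQuantizeMethods('fp8')
assert method is HFQuantizeMethods.fp8 and method.value == 'fp8'

# ...and an unsupported value raises ValueError, surfacing a config error early.
try:
    HFQuantizeMethods('int3')
except ValueError as err:
    print(f"rejected: {err}")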

engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py

Lines changed: 5 additions & 1 deletion
@@ -20,9 +20,13 @@


 class LmiDistQuantizeMethods(str, Enum):
     awq = 'awq'
+    deepspeedfp = 'deepspeedfp'
+    fp8 = 'fp8'
     gptq = 'gptq'
+    gptq_marlin = 'gptq_marlin'
+    gptq_marlin_24 = 'gptq_marlin_24'
+    marlin = 'marlin'
     squeezellm = 'squeezellm'
-    deepspeedfp = 'deepspeedfp'


 class LmiDistRbProperties(Properties):
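
The enum typically backs a validated field on the rolling-batch properties class, so an invalid option.quantize fails fast. A hedged sketch under that assumption; the field name and the stand-in BaseModel are illustrative, only the enum values come from the diff:

from enum import Enum
from typing import Optional
from pydantic import BaseModel, ValidationError

class LmiDistQuantizeMethods(str, Enum):
    awq = 'awq'
    deepspeedfp = 'deepspeedfp'
    fp8 = 'fp8'
    gptq = 'gptq'
    gptq_marlin = 'gptq_marlin'
    gptq_marlin_24 = 'gptq_marlin_24'
    marlin = 'marlin'
    squeezellm = 'squeezellm'

# Stand-in for LmiDistRbProperties, which extends a shared Properties base not shown here.
class LmiDistRbPropertiesSketch(BaseModel):
    quantize: Optional[LmiDistQuantizeMethods] = None

print(LmiDistRbPropertiesSketch(quantize="gptq_marlin").quantize)  # accepted

try:
    LmiDistRbPropertiesSketch(quantize="int3")
except ValidationError as err:
    print("rejected unsupported quantization:", err.errors()[0]["type"])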

engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py

Lines changed: 5 additions & 1 deletion
@@ -20,9 +20,13 @@


 class VllmQuantizeMethods(str, Enum):
     awq = 'awq'
+    deepspeedfp = 'deepspeedfp'
+    fp8 = 'fp8'
     gptq = 'gptq'
+    gptq_marlin = 'gptq_marlin'
+    gptq_marlin_24 = 'gptq_marlin_24'
+    marlin = 'marlin'
     squeezellm = 'squeezellm'
-    deepspeedfp = 'deepspeedfp'


 class VllmRbProperties(Properties):
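
Once validated, the quantization name is ultimately handed to the vLLM engine as a plain string. A hedged sketch of that hand-off; vLLM 0.5.1 exposes a quantization argument on EngineArgs, but the exact wiring inside djl_python's vLLM handler is not part of this diff:

# Hedged sketch: passing the validated quantize value to vLLM 0.5.1.
from vllm import EngineArgs

engine_args = EngineArgs(
    model="neuralmagic/Qwen2-7B-Instruct-FP8",  # FP8 checkpoint used by the new test below
    quantization="fp8",           # VllmQuantizeMethods.fp8.value after validation
    tensor_parallel_size=4,       # matches option.tensor_parallel_degree in prepare.py
    max_num_seqs=4,               # matches option.max_rolling_batch_size in prepare.py
)
print(engine_args.quantization)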

serving/docker/lmi.Dockerfile

Lines changed: 13 additions & 19 deletions
@@ -19,22 +19,22 @@ ARG torch_version=2.3.0
 ARG torch_vision_version=0.18.0
 ARG onnx_version=1.18.0
 ARG onnxruntime_wheel="https://publish.djl.ai/onnxruntime/1.18.0/onnxruntime_gpu-1.18.0-cp310-cp310-linux_x86_64.whl"
-ARG pydantic_version=2.7.1
+ARG pydantic_version=2.8.2
 ARG djl_converter_wheel="https://publish.djl.ai/djl_converter/djl_converter-0.28.0-py3-none-any.whl"
-ARG vllm_cuda_name="cu12"
-ARG vllm_nccl_version=2.18.1
 # HF Deps
 ARG protobuf_version=3.20.3
-ARG transformers_version=4.41.1
-ARG accelerate_version=0.30.1
+ARG transformers_version=4.42.3
+ARG accelerate_version=0.32.1
 ARG bitsandbytes_version=0.43.1
-ARG optimum_version=1.20.0
+ARG optimum_version=1.21.2
 ARG auto_gptq_version=0.7.1
-ARG datasets_version=2.19.1
+ARG datasets_version=2.20.0
 ARG autoawq_version=0.2.5
+ARG tokenizers_version=0.19.1
 # LMI-Dist Deps
-ARG vllm_wheel="https://publish.djl.ai/vllm/cu124-pt230/vllm-0.4.2%2Bcu124-cp310-cp310-linux_x86_64.whl"
-ARG flash_attn_2_wheel="https://publish.djl.ai/flash_attn/cu124-pt230/flash_attn-2.5.8-cp310-cp310-linux_x86_64.whl"
+ARG vllm_wheel="https://github.com/vllm-project/vllm/releases/download/v0.5.1/vllm-0.5.1-cp310-cp310-manylinux1_x86_64.whl"
+ARG flash_attn_2_wheel="https://github.com/vllm-project/flash-attention/releases/download/v2.5.9/vllm_flash_attn-2.5.9-cp310-cp310-manylinux1_x86_64.whl"
+ARG flash_infer_wheel="https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl"
 # %2B is the url escape for the '+' character
 ARG lmi_dist_wheel="https://publish.djl.ai/lmi_dist/lmi_dist-11.0.0%2Bnightly-py3-none-any.whl"
 ARG seq_scheduler_wheel="https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl"

@@ -57,7 +57,6 @@ ENV PYTORCH_PRECXX11=true
 ENV PYTORCH_VERSION=${torch_version}
 ENV PYTORCH_FLAVOR=cu121-precxx11
 ENV VLLM_NO_USAGE_STATS=1
-ENV VLLM_CONFIG_ROOT=/opt/djl/vllm/.config


 ENV HF_HOME=/tmp/.cache/huggingface

@@ -102,23 +101,18 @@ RUN pip3 install torch==${torch_version} torchvision==${torch_vision_version} --
     transformers==${transformers_version} hf-transfer zstandard datasets==${datasets_version} \
     mpi4py sentencepiece tiktoken blobfile einops accelerate==${accelerate_version} bitsandbytes==${bitsandbytes_version} \
     optimum==${optimum_version} auto-gptq==${auto_gptq_version} pandas pyarrow jinja2 \
-    opencv-contrib-python-headless safetensors scipy onnx sentence_transformers ${onnxruntime_wheel} autoawq==${autoawq_version} && \
+    opencv-contrib-python-headless safetensors scipy onnx sentence_transformers ${onnxruntime_wheel} autoawq==${autoawq_version} \
+    tokenizers==${tokenizers_version} pydantic==${pydantic_version} && \
     pip3 install ${djl_converter_wheel} --no-deps && \
+    git clone https://github.com/neuralmagic/AutoFP8.git && cd AutoFP8 && git reset --hard 4b2092c && pip3 install . && cd .. && rm -rf AutoFP8 \
     pip3 cache purge

-RUN pip3 install ${flash_attn_2_wheel} ${lmi_dist_wheel} ${vllm_wheel} pydantic==${pydantic_version} && \
+RUN pip3 install ${flash_attn_2_wheel} ${lmi_dist_wheel} ${vllm_wheel} ${flash_infer_wheel} && \
     pip3 cache purge

 # Add CUDA-Compat
 RUN apt-get update && apt-get install -y cuda-compat-12-4 && apt-get clean -y && rm -rf /var/lib/apt/lists/*

-# We use the same NCCL version as vLLM for lmi-dist https://github.com/vllm-project/vllm/blob/v0.4.2/vllm/utils.py#L641-L646
-# This is due to https://github.com/vllm-project/vllm/blob/v0.4.2/vllm/distributed/device_communicators/pynccl.py#L1-L9
-RUN mkdir -p ${VLLM_CONFIG_ROOT}/vllm/nccl/$vllm_cuda_name/ && curl -L -o ${VLLM_CONFIG_ROOT}/vllm/nccl/$vllm_cuda_name/libnccl.so.$vllm_nccl_version \
-    https://github.com/vllm-project/vllm-nccl/releases/download/v0.1.0/$vllm_cuda_name-libnccl.so.$vllm_nccl_version && \
-    # The following is done only so that we can run the CI with `-u djl`. Sagemaker wouldn't require this.
-    chmod -R a+w ${VLLM_CONFIG_ROOT}
-
 RUN scripts/patch_oss_dlc.sh python && \
     scripts/security_patch.sh lmi && \
     useradd -m -d /home/djl djl && \
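
A quick way to confirm the upgraded pins actually land in the built image is to compare installed versions against the build args. A hedged sketch; the expected versions are copied from the ARGs above, and the script itself is not part of the Dockerfile:

# Hedged sanity check for the upgraded stack inside the image.
from importlib.metadata import version, PackageNotFoundError

expected = {
    "pydantic": "2.8.2",
    "transformers": "4.42.3",
    "accelerate": "0.32.1",
    "optimum": "1.21.2",
    "datasets": "2.20.0",
    "tokenizers": "0.19.1",
    "vllm": "0.5.1",
    "flashinfer": "0.0.8+cu121torch2.3",  # distribution name assumed from the wheel filename
}

for pkg, want in expected.items():
    try:
        got = version(pkg)
    except PackageNotFoundError:
        got = "not installed"
    print(f"{pkg:12s} expected {want:22s} got {got}")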

tests/integration/llm/client.py

Lines changed: 15 additions & 0 deletions
@@ -276,6 +276,12 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "amazon/MegaBeam-Mistral-7B-300k"
     },
+    "mistral-7b-marlin": {
+        "max_memory_per_gpu": [23.0],
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "amazon/MegaBeam-Mistral-7B-300k"
+    },
     "llama-7b-unmerged-lora": {
         "max_memory_per_gpu": [15.0, 15.0],
         "batch_size": [3],

@@ -376,6 +382,15 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
     },
+    "qwen2-7b-fp8": {
+        "max_memory_per_gpu": [23.0],
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "Qwen/Qwen-7B",
+        "parameters": {
+            "decoder_input_details": True
+        }
+    },
     "llama-7b-unmerged-lora": {
         "max_memory_per_gpu": [15.0, 15.0],
         "batch_size": [3],

tests/integration/llm/prepare.py

Lines changed: 14 additions & 11 deletions
@@ -418,6 +418,13 @@
         "option.max_rolling_batch_size": 4,
         "option.quantize": "awq"
     },
+    "mistral-7b-marlin": {
+        "option.model_id": "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
+        "option.task": "text-generation",
+        "option.tensor_parallel_degree": 4,
+        "option.max_rolling_batch_size": 4,
+        "option.quantize": "marlin"
+    },
     "llama-7b-unmerged-lora": {
         "option.model_id": "s3://djl-llm/huggyllama-llama-7b",
         "option.tensor_parallel_degree": "max",

@@ -447,17 +454,6 @@
         "option.tensor_parallel_degree": 4,
         "option.max_rolling_batch_size": 4
     },
-    "llama-7b-unmerged-lora": {
-        "option.model_id": "s3://djl-llm/huggyllama-llama-7b",
-        "option.tensor_parallel_degree": "max",
-        "option.task": "text-generation",
-        "option.dtype": "fp16",
-        "option.adapters": "adapters",
-        "option.enable_lora": "true",
-        "adapter_ids": ["tloen/alpaca-lora-7b", "22h/cabrita-lora-v0-1"],
-        "adapter_names": ["english-alpaca", "portugese-alpaca"],
-        "option.gpu_memory_utilization": "0.8",
-    },
     "llama2-13b-awq-unmerged-lora": {
         "option.model_id":
         "s3://djl-llm/TheBloke-Llama-2-13b-Chat-AWQ/",

@@ -602,6 +598,13 @@
         "option.max_rolling_batch_size": 32,
         "option.output_formatter": "jsonlines"
     },
+    "qwen2-7b-fp8": {
+        "option.model_id": "neuralmagic/Qwen2-7B-Instruct-FP8",
+        "option.task": "text-generation",
+        "option.tensor_parallel_degree": 4,
+        "option.max_rolling_batch_size": 4,
+        "option.quantize": "fp8"
+    },
     "llama-7b-unmerged-lora": {
         "option.model_id": "s3://djl-llm/huggyllama-llama-7b",
         "option.tensor_parallel_degree": "max",

tests/integration/tests.py

Lines changed: 23 additions & 0 deletions
@@ -11,6 +11,15 @@
 djl_version = os.environ.get('TEST_DJL_VERSION', '').strip()


+def is_applicable_cuda_capability(arch: int) -> bool:
+    import torch
+    if not torch.cuda.is_available():
+        return False
+
+    major, minor = torch.cuda.get_device_capability()
+    return (10 * major + minor) >= arch
+
+
 class Runner:

     def __init__(self, container, test_name=None, download=False):

@@ -383,6 +392,12 @@ def test_mpt_7b(self):
             r.launch()
             client.run("lmi_dist mpt-7b".split())

+    def test_mistral_7b_marlin(self):
+        with Runner('lmi', 'mistral-7b-marlin') as r:
+            prepare.build_lmi_dist_model("mistral-7b-marlin")
+            r.launch()
+            client.run("lmi_dist mistral-7b-marlin".split())
+
     def test_llama2_tiny_autoawq(self):
         with Runner('lmi', 'llama-2-tiny-autoawq') as r:
             prepare.build_lmi_dist_model("llama-2-tiny")

@@ -494,6 +509,14 @@ def test_llama2_7b_chat(self):
             r.launch()
             client.run("vllm_chat llama2-7b-chat".split())

+    @pytest.mark.skipif(not is_applicable_cuda_capability(89),
+                        reason="Unsupported CUDA capability")
+    def test_qwen2_7b_fp8(self):
+        with Runner('lmi', 'qwen2-7b-fp8') as r:
+            prepare.build_vllm_model("qwen2-7b-fp8")
+            r.launch()
+            client.run("vllm qwen2-7b-fp8".split())
+

 class TestVllmLora:
     # Runs on g5.12xl
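
The capability gate is just 10 * major + minor compared against a threshold: FP8 kernels need compute capability 8.9 or newer, so the check passes on Ada and Hopper parts and skips the test on the A10G (8.6) found in g5 instances. A small worked sketch of the same arithmetic:

# Worked sketch of the gate added above: (10 * major + minor) >= arch, with the
# threshold 89 coming from the @pytest.mark.skipif decorator on test_qwen2_7b_fp8.
def is_applicable(capability: tuple[int, int], arch: int = 89) -> bool:
    major, minor = capability
    return (10 * major + minor) >= arch

print(is_applicable((8, 6)))   # A10G (g5 instances)  -> False, FP8 test skipped
print(is_applicable((8, 9)))   # L4 / L40 (Ada)       -> True
print(is_applicable((9, 0)))   # H100 (Hopper)        -> True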
