5 changes: 5 additions & 0 deletions .github/workflows/integration.yml
@@ -143,6 +143,11 @@ jobs:
sudo apt-get install python3 python-is-python3 python3-pip -y
- name: Install pip dependencies
run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
- name: Install torch
# Use torch to get cuda capability of current device to selectively run tests
# Torch version doesn't really matter that much
run: |
pip3 install torch==2.3.0
- name: Install awscurl
working-directory: tests/integration
run: |
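The torch install above exists only so the test harness can read the GPU's compute capability and decide which hardware-specific tests to run (the is_applicable_cuda_capability helper added to tests/integration/tests.py further down). A minimal, self-contained sketch of that check, with the capability values it compares against (an A100 reports (8, 0) -> 80, L4/L40S report (8, 9) -> 89, H100 reports (9, 0) -> 90, and the new fp8 test requires at least 89):

import torch

def has_cuda_capability(threshold: int = 89) -> bool:
    # No visible GPU: hardware-gated tests should be skipped.
    if not torch.cuda.is_available():
        return False
    # get_device_capability() returns (major, minor), e.g. (8, 9) on an L4.
    major, minor = torch.cuda.get_device_capability()
    return 10 * major + minor >= threshold

print(has_cuda_capability(89))  # True only on fp8-capable GPUs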
@@ -23,6 +23,11 @@ class HFQuantizeMethods(str, Enum):
# supported by vllm
awq = 'awq'
deepspeedfp = 'deepspeedfp'
fp8 = 'fp8'
gptq_marlin = 'gptq_marlin'
gptq_marlin_24 = 'gptq_marlin_24'
marlin = 'marlin'
squeezellm = 'squeezellm'


def get_torch_dtype_from_str(dtype: str):
@@ -20,9 +20,13 @@

class LmiDistQuantizeMethods(str, Enum):
awq = 'awq'
deepspeedfp = 'deepspeedfp'
fp8 = 'fp8'
gptq = 'gptq'
gptq_marlin = 'gptq_marlin'
gptq_marlin_24 = 'gptq_marlin_24'
marlin = 'marlin'
squeezellm = 'squeezellm'
deepspeedfp = 'deepspeedfp'


class LmiDistRbProperties(Properties):
@@ -20,9 +20,13 @@

class VllmQuantizeMethods(str, Enum):
awq = 'awq'
deepspeedfp = 'deepspeedfp'
fp8 = 'fp8'
gptq = 'gptq'
gptq_marlin = 'gptq_marlin'
gptq_marlin_24 = 'gptq_marlin_24'
marlin = 'marlin'
squeezellm = 'squeezellm'
deepspeedfp = 'deepspeedfp'


class VllmRbProperties(Properties):
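The enum members added above are the literal strings accepted for option.quantize in each backend's properties class; any other value is rejected at validation time, before the engine starts. A trimmed sketch of how such a str Enum behaves (illustrative stand-in, not the full class from this PR):

from enum import Enum

class QuantizeMethods(str, Enum):  # stand-in for the backend-specific enums above
    awq = 'awq'
    fp8 = 'fp8'
    marlin = 'marlin'
    squeezellm = 'squeezellm'

print(QuantizeMethods('fp8'))      # QuantizeMethods.fp8 -> accepted
try:
    QuantizeMethods('int4')        # not a member -> raises ValueError
except ValueError as err:
    print(f"rejected: {err}")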
33 changes: 14 additions & 19 deletions serving/docker/lmi.Dockerfile
@@ -19,22 +19,22 @@ ARG torch_version=2.3.0
ARG torch_vision_version=0.18.0
ARG onnx_version=1.18.0
ARG onnxruntime_wheel="https://publish.djl.ai/onnxruntime/1.18.0/onnxruntime_gpu-1.18.0-cp310-cp310-linux_x86_64.whl"
ARG pydantic_version=2.7.1
ARG pydantic_version=2.8.2
ARG djl_converter_wheel="https://publish.djl.ai/djl_converter/djl_converter-0.28.0-py3-none-any.whl"
ARG vllm_cuda_name="cu12"
ARG vllm_nccl_version=2.18.1
# HF Deps
ARG protobuf_version=3.20.3
ARG transformers_version=4.41.1
ARG accelerate_version=0.30.1
ARG transformers_version=4.42.3
ARG accelerate_version=0.32.1
ARG bitsandbytes_version=0.43.1
ARG optimum_version=1.20.0
ARG optimum_version=1.21.2
ARG auto_gptq_version=0.7.1
ARG datasets_version=2.19.1
ARG datasets_version=2.20.0
ARG autoawq_version=0.2.5
ARG tokenizers_version=0.19.1
# LMI-Dist Deps
ARG vllm_wheel="https://publish.djl.ai/vllm/cu124-pt230/vllm-0.4.2%2Bcu124-cp310-cp310-linux_x86_64.whl"
ARG flash_attn_2_wheel="https://publish.djl.ai/flash_attn/cu124-pt230/flash_attn-2.5.8-cp310-cp310-linux_x86_64.whl"
ARG vllm_wheel="https://github.com/vllm-project/vllm/releases/download/v0.5.1/vllm-0.5.1-cp310-cp310-manylinux1_x86_64.whl"
ARG flash_attn_2_wheel="https://github.com/vllm-project/flash-attention/releases/download/v2.5.9/vllm_flash_attn-2.5.9-cp310-cp310-manylinux1_x86_64.whl"
ARG flash_infer_wheel="https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl"
# %2B is the url escape for the '+' character
ARG lmi_dist_wheel="https://publish.djl.ai/lmi_dist/lmi_dist-11.0.0%2Bnightly-py3-none-any.whl"
ARG seq_scheduler_wheel="https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl"
@@ -57,7 +57,7 @@ ENV PYTORCH_PRECXX11=true
ENV PYTORCH_VERSION=${torch_version}
ENV PYTORCH_FLAVOR=cu121-precxx11
ENV VLLM_NO_USAGE_STATS=1
ENV VLLM_CONFIG_ROOT=/opt/djl/vllm/.config
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn


ENV HF_HOME=/tmp/.cache/huggingface
@@ -102,23 +102,18 @@ RUN pip3 install torch==${torch_version} torchvision==${torch_vision_version} --
transformers==${transformers_version} hf-transfer zstandard datasets==${datasets_version} \
mpi4py sentencepiece tiktoken blobfile einops accelerate==${accelerate_version} bitsandbytes==${bitsandbytes_version} \
optimum==${optimum_version} auto-gptq==${auto_gptq_version} pandas pyarrow jinja2 \
opencv-contrib-python-headless safetensors scipy onnx sentence_transformers ${onnxruntime_wheel} autoawq==${autoawq_version} && \
opencv-contrib-python-headless safetensors scipy onnx sentence_transformers ${onnxruntime_wheel} autoawq==${autoawq_version} \
tokenizers==${tokenizers_version} pydantic==${pydantic_version} && \
pip3 install ${djl_converter_wheel} --no-deps && \
git clone https://github.com/neuralmagic/AutoFP8.git && cd AutoFP8 && git reset --hard 4b2092c && pip3 install . && cd .. && rm -rf AutoFP8 && \
pip3 cache purge

RUN pip3 install ${flash_attn_2_wheel} ${lmi_dist_wheel} ${vllm_wheel} pydantic==${pydantic_version} && \
RUN pip3 install ${flash_attn_2_wheel} ${lmi_dist_wheel} ${vllm_wheel} ${flash_infer_wheel} && \
pip3 cache purge

# Add CUDA-Compat
RUN apt-get update && apt-get install -y cuda-compat-12-4 && apt-get clean -y && rm -rf /var/lib/apt/lists/*

# We use the same NCCL version as vLLM for lmi-dist https://github.com/vllm-project/vllm/blob/v0.4.2/vllm/utils.py#L641-L646
# This is due to https://github.com/vllm-project/vllm/blob/v0.4.2/vllm/distributed/device_communicators/pynccl.py#L1-L9
RUN mkdir -p ${VLLM_CONFIG_ROOT}/vllm/nccl/$vllm_cuda_name/ && curl -L -o ${VLLM_CONFIG_ROOT}/vllm/nccl/$vllm_cuda_name/libnccl.so.$vllm_nccl_version \
https://github.com/vllm-project/vllm-nccl/releases/download/v0.1.0/$vllm_cuda_name-libnccl.so.$vllm_nccl_version && \
# The following is done only so that we can run the CI with `-u djl`. Sagemaker wouldn't require this.
chmod -R a+w ${VLLM_CONFIG_ROOT}

RUN scripts/patch_oss_dlc.sh python && \
scripts/security_patch.sh lmi && \
useradd -m -d /home/djl djl && \
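These pins have to stay mutually compatible (torch 2.3.0, vLLM 0.5.1, vllm_flash_attn 2.5.9, flashinfer 0.0.8, transformers 4.42.3, pydantic 2.8.2), so a quick sanity check inside the built image is useful. A small sketch, assuming the packages expose the usual __version__ attribute:

import importlib

# Pins taken from the ARGs in this Dockerfile; adjust if the ARGs change.
expected = {
    "torch": "2.3.0",
    "transformers": "4.42.3",
    "vllm": "0.5.1",
    "pydantic": "2.8.2",
}

for name, pin in expected.items():
    module = importlib.import_module(name)
    installed = getattr(module, "__version__", "unknown")
    status = "OK" if installed.startswith(pin) else "MISMATCH"
    print(f"{name}: installed={installed} expected={pin} -> {status}")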
20 changes: 20 additions & 0 deletions tests/integration/llm/client.py
@@ -276,6 +276,17 @@ def get_model_name():
"seq_length": [256],
"tokenizer": "amazon/MegaBeam-Mistral-7B-300k"
},
"mistral-7b-marlin": {
"max_memory_per_gpu": [23.0],
"batch_size": [1, 4],
"seq_length": [256],
"tokenizer": "amazon/MegaBeam-Mistral-7B-300k"
},
"llama-2-13b-flashinfer": {
"batch_size": [1, 4],
"seq_length": [256],
"tokenizer": "TheBloke/Llama-2-13B-fp16",
},
"llama-7b-unmerged-lora": {
"max_memory_per_gpu": [15.0, 15.0],
"batch_size": [3],
@@ -376,6 +387,15 @@ def get_model_name():
"seq_length": [256],
"tokenizer": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
},
"qwen2-7b-fp8": {
"max_memory_per_gpu": [23.0],
"batch_size": [1, 4],
"seq_length": [256],
"tokenizer": "Qwen/Qwen-7B",
"parameters": {
"decoder_input_details": True
}
},
"llama-7b-unmerged-lora": {
"max_memory_per_gpu": [15.0, 15.0],
"batch_size": [3],
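The "parameters" block of a client.py entry (here decoder_input_details for the fp8 model) ends up in the generation request the test client sends to the endpoint. A hedged sketch of such a request against a locally running model server (the URL, prompt, and token budget are placeholders, not values taken from the harness):

import requests

url = "http://127.0.0.1:8080/invocations"  # assumed local djl-serving endpoint
payload = {
    "inputs": "The fp8 kernels in this release are",
    "parameters": {
        "max_new_tokens": 256,          # mirrors "seq_length": [256]
        "decoder_input_details": True,  # ask the server to return prompt token details
    },
}

response = requests.post(url, json=payload)
response.raise_for_status()
print(response.json())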
31 changes: 20 additions & 11 deletions tests/integration/llm/prepare.py
@@ -418,6 +418,19 @@
"option.max_rolling_batch_size": 4,
"option.quantize": "awq"
},
"mistral-7b-marlin": {
"option.model_id": "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
"option.task": "text-generation",
"option.tensor_parallel_degree": 4,
"option.max_rolling_batch_size": 4,
"option.quantize": "marlin"
},
"llama-2-13b-flashinfer": {
"option.model_id": "s3://djl-llm/llama-2-13b-hf/",
"option.task": "text-generation",
"option.tensor_parallel_degree": 4,
"option.max_rolling_batch_size": 4,
},
"llama-7b-unmerged-lora": {
"option.model_id": "s3://djl-llm/huggyllama-llama-7b",
"option.tensor_parallel_degree": "max",
@@ -447,17 +460,6 @@
"option.tensor_parallel_degree": 4,
"option.max_rolling_batch_size": 4
},
"llama-7b-unmerged-lora": {
"option.model_id": "s3://djl-llm/huggyllama-llama-7b",
"option.tensor_parallel_degree": "max",
"option.task": "text-generation",
"option.dtype": "fp16",
"option.adapters": "adapters",
"option.enable_lora": "true",
"adapter_ids": ["tloen/alpaca-lora-7b", "22h/cabrita-lora-v0-1"],
"adapter_names": ["english-alpaca", "portugese-alpaca"],
"option.gpu_memory_utilization": "0.8",
},
"llama2-13b-awq-unmerged-lora": {
"option.model_id":
"s3://djl-llm/TheBloke-Llama-2-13b-Chat-AWQ/",
@@ -602,6 +604,13 @@
"option.max_rolling_batch_size": 32,
"option.output_formatter": "jsonlines"
},
"qwen2-7b-fp8": {
"option.model_id": "neuralmagic/Qwen2-7B-Instruct-FP8",
"option.task": "text-generation",
"option.tensor_parallel_degree": 4,
"option.max_rolling_batch_size": 4,
"option.quantize": "fp8"
},
"llama-7b-unmerged-lora": {
"option.model_id": "s3://djl-llm/huggyllama-llama-7b",
"option.tensor_parallel_degree": "max",
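Each option.* key in these prepare.py entries is ultimately written out as one key=value line of the model's serving.properties before the container launches. A sketch of that mechanism for the qwen2-7b-fp8 entry above (illustrative only; the harness's own writer may add further keys):

qwen2_fp8 = {
    "option.model_id": "neuralmagic/Qwen2-7B-Instruct-FP8",
    "option.task": "text-generation",
    "option.tensor_parallel_degree": 4,
    "option.max_rolling_batch_size": 4,
    "option.quantize": "fp8",
}

# serving.properties is the plain key=value file djl-serving reads at startup.
with open("serving.properties", "w") as f:
    for key, value in qwen2_fp8.items():
        f.write(f"{key}={value}\n")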
32 changes: 32 additions & 0 deletions tests/integration/tests.py
@@ -11,6 +11,15 @@
djl_version = os.environ.get('TEST_DJL_VERSION', '').strip()


def is_applicable_cuda_capability(arch: int) -> bool:
import torch
if not torch.cuda.is_available():
return False

major, minor = torch.cuda.get_device_capability()
return (10 * major + minor) >= arch


class Runner:

def __init__(self, container, test_name=None, download=False):
@@ -383,6 +392,21 @@ def test_mpt_7b(self):
r.launch()
client.run("lmi_dist mpt-7b".split())

def test_mistral_7b_marlin(self):
with Runner('lmi', 'mistral-7b-marlin') as r:
prepare.build_lmi_dist_model("mistral-7b-marlin")
r.launch()
client.run("lmi_dist mistral-7b-marlin".split())

def test_llama2_13b_flashinfer(self):
with Runner('lmi', 'llama-2-13b-flashinfer') as r:
prepare.build_lmi_dist_model("llama-2-13b-flashinfer")
envs = [
"VLLM_ATTENTION_BACKEND=FLASHINFER",
]
r.launch(env_vars=envs)
client.run("lmi_dist llama-2-13b-flashinfer".split())

def test_llama2_tiny_autoawq(self):
with Runner('lmi', 'llama-2-tiny-autoawq') as r:
prepare.build_lmi_dist_model("llama-2-tiny")
@@ -494,6 +518,14 @@ def test_llama2_7b_chat(self):
r.launch()
client.run("vllm_chat llama2-7b-chat".split())

@pytest.mark.skipif(not is_applicable_cuda_capability(89),
reason="Unsupported CUDA capability")
def test_qwen2_7b_fp8(self):
with Runner('lmi', 'qwen2-7b-fp8') as r:
prepare.build_vllm_model("qwen2-7b-fp8")
r.launch()
client.run("vllm qwen2-7b-fp8".split())


class TestVllmLora:
# Runs on g5.12xl