Merged
49 commits
d6bf991  [WIP][RC] Update PyTorch to 2.8.0 (huydhn, Jul 2, 2025)
456985c  Handle xformers (huydhn, Jul 2, 2025)
4838d53  Some more tweaks (huydhn, Jul 8, 2025)
ca21216  Attempt to fix xformers build (huydhn, Jul 8, 2025)
0c43174  Silly typo (huydhn, Jul 9, 2025)
14c85d1  Few more tweaks for a greener CI (huydhn, Jul 10, 2025)
ad98d10  Attempt to offload to CPU to avoid OOM in CI (huydhn, Jul 11, 2025)
316f116  Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 11, 2025)
460ed09  Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 13, 2025)
7df288f  Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 15, 2025)
6a08113  Fix lint (huydhn, Jul 15, 2025)
44f07c0  Run all test_sequence_parallel again (huydhn, Jul 16, 2025)
29fb5a0  Typo (huydhn, Jul 16, 2025)
6a7e3f8  Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 17, 2025)
c5d8940  Try to reproduce OOM after recent rebase (huydhn, Jul 17, 2025)
f320d9d  Match xformers version (huydhn, Jul 17, 2025)
a5999e1  Not sure why building xformers 0.0.31 fails (huydhn, Jul 18, 2025)
d1dbb4e  Remove some doc changes what are not needed (huydhn, Jul 18, 2025)
6f394f5  Tweak some tests (huydhn, Jul 19, 2025)
f62f6cf  Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 22, 2025)
f1a6642  Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 23, 2025)
aa1d8c1  Lower memory usage for test_chatglm3_lora_tp4_fully_sharded_loras (huydhn, Jul 23, 2025)
6f2c684  Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 26, 2025)
91ce20f  Build mamba_ssm from source (huydhn, Jul 26, 2025)
5ce81c7  Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 4, 2025)
c789827  Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 6, 2025)
93eb498  Ready 2.8.0 (huydhn, Aug 6, 2025)
c741b0e  Update cpu.txt (huydhn, Aug 6, 2025)
f948f41  Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 12, 2025)
1543b92  Resolve xformers and mamba_ssm (huydhn, Aug 13, 2025)
8de151c  Ready to land (huydhn, Aug 13, 2025)
bbf1ce3  xformers v0.0.32 is almost here (huydhn, Aug 14, 2025)
653ccd1  Use xformers 0.0.32 (huydhn, Aug 15, 2025)
bcb7ffc  Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 15, 2025)
684c24d  Install ao from cu128 (huydhn, Aug 15, 2025)
456d284  Merge branch 'main' into pytorch-2.8.0 (mgoin, Aug 19, 2025)
338d1b5  Merge branch 'main' into pytorch-2.8.0 (mgoin, Aug 20, 2025)
984ff0f  Merge branch 'main' into pytorch-2.8.0 (mgoin, Aug 21, 2025)
90d455e  Merge branch 'main' into pytorch-2.8.0 (mgoin, Aug 22, 2025)
be370a8  Merge branch 'main' into pytorch-2.8.0 (mgoin, Aug 26, 2025)
d8de108  Tweaking jason9693/Qwen2.5-1.5B-apeach memory usage (huydhn, Aug 26, 2025)
047e295  Same model, different test (huydhn, Aug 26, 2025)
a409dc2  Revert "Same model, different test" (huydhn, Aug 27, 2025)
b1c83ec  Revert "Tweaking jason9693/Qwen2.5-1.5B-apeach memory usage" (huydhn, Aug 27, 2025)
cafb50d  Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 27, 2025)
56739de  Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 28, 2025)
3526230  Switch to XFORMERS backend for some tests (huydhn, Aug 28, 2025)
6bae6f2  Revert "Switch to XFORMERS backend for some tests" (huydhn, Aug 28, 2025)
102d0d7  Apply #23853 (huydhn, Aug 28, 2025)
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")

#
# Try to find python package with an executable that exactly matches
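The comment above notes that these pins must be kept consistent with the requirements files; a small consistency check along these lines can catch a version bump that misses one of the two places (a sketch only, assuming it runs from a vLLM checkout, with illustrative regexes):

import re
from pathlib import Path

# Read the CUDA torch pin from CMakeLists.txt and the torch pin from requirements/cuda.txt
cmake_text = Path("CMakeLists.txt").read_text()
reqs_text = Path("requirements/cuda.txt").read_text()

cmake_pin = re.search(r'set\(TORCH_SUPPORTED_VERSION_CUDA "([^"]+)"\)', cmake_text).group(1)
reqs_pin = re.search(r"^torch==([0-9][\w.+]*)", reqs_text, re.MULTILINE).group(1)

# Both should read 2.8.0 after this change
assert cmake_pin == reqs_pin, f"CMake pins torch {cmake_pin}, requirements pin {reqs_pin}"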
15 changes: 15 additions & 0 deletions docker/Dockerfile
@@ -160,6 +160,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

+# Build from source to unblock PyTorch 2.8.0 update
+RUN --mount=type=cache,target=/root/.cache/uv \
+uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/[email protected]"

# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
@@ -376,6 +380,13 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

+# TODO (huydhn): Remove this once xformers is released for 2.8.0
+RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
+. /etc/environment
+export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
+uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/[email protected]"
+BASH

# If we need to build FlashInfer wheel before its release:
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
@@ -483,6 +494,10 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy

+# Build from source to unblock PyTorch 2.8.0 update
+RUN --mount=type=cache,target=/root/.cache/uv \
+uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/[email protected]"

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
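With both mamba_ssm and xformers now built from source inside the image, a quick import check against the finished image confirms that they resolved against the new torch (a minimal sketch; the exact version strings depend on the build):

import torch
import xformers
import mamba_ssm

# The CUDA image should report torch 2.8.0 built against CUDA 12.8
print(torch.__version__, torch.version.cuda)
# Built from the v0.0.32.post2 tag pinned in the Dockerfile above
print(xformers.__version__)
# Built from the v2.2.5 tag pinned in the Dockerfile above
print(mamba_ssm.__version__)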
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -6,7 +6,7 @@ requires = [
"packaging>=24.2",
"setuptools>=77.0.3,<80.0.0",
"setuptools-scm>=8.0",
"torch == 2.7.1",
"torch == 2.8.0",
"wheel",
"jinja2",
]
3 changes: 2 additions & 1 deletion requirements/build.txt
@@ -4,7 +4,8 @@ ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
-torch==2.7.1
+torch==2.8.0
wheel
jinja2>=3.1.6
regex
build
9 changes: 4 additions & 5 deletions requirements/cpu.txt
@@ -9,17 +9,16 @@ packaging>=24.2
setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
-torch==2.7.0; platform_system == "Darwin"
-torch==2.7.0; platform_machine == "ppc64le"
-torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has a issue: https://github.com/vllm-project/vllm/issues/17960
+torch==2.8.0; platform_system == "Darwin"
+torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"

# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchaudio==2.7.0; platform_machine == "ppc64le"
+torchaudio==2.8.0; platform_machine == "ppc64le"

# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchvision==0.22.0; platform_machine == "ppc64le"
+torchvision==0.23.0; platform_machine == "ppc64le"
datasets # for benchmark scripts

# Intel Extension for PyTorch, only for x86_64 CPUs
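The per-platform pins above are selected through PEP 508 environment markers; a small sketch of how the new combined marker evaluates (using the packaging library, with illustrative platform values):

from packaging.markers import Marker

# The marker introduced above for ppc64le and aarch64
marker = Marker('platform_machine == "ppc64le" or platform_machine == "aarch64"')

print(marker.evaluate({"platform_machine": "aarch64"}))  # True -> torch==2.8.0 applies
print(marker.evaluate({"platform_machine": "x86_64"}))   # False -> x86_64 stays on torch==2.6.0+cpu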
9 changes: 5 additions & 4 deletions requirements/cuda.txt
@@ -6,9 +6,10 @@ numba == 0.61.2; python_version > '3.9'

# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.7.1
-torchaudio==2.7.1
+torch==2.8.0
+torchaudio==2.8.0
# These must be updated alongside torch
-torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+# TODO (huydhn): Re-enable this once xformers is released for 2.8.0
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
-xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
+# xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
8 changes: 4 additions & 4 deletions requirements/rocm-build.txt
@@ -1,10 +1,10 @@
# Common dependencies
-r common.txt

---extra-index-url https://download.pytorch.org/whl/rocm6.2.4
-torch==2.7.0
-torchvision==0.22.0
-torchaudio==2.7.0
+--extra-index-url https://download.pytorch.org/whl/rocm6.3
+torch==2.8.0
+torchvision==0.23.0
+torchaudio==2.8.0

triton==3.2
cmake>=3.26.1,<4
8 changes: 4 additions & 4 deletions requirements/test.in
@@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
timm # required for internvl test
-torch==2.7.1
-torchaudio==2.7.1
-torchvision==0.22.1
+torch==2.8.0
+torchaudio==2.8.0
+torchvision==0.23.0
transformers_stream_generator # required for qwen-vl test
mamba_ssm==2.2.5 # required for plamo2 test
matplotlib # required for qwen-vl test
@@ -54,4 +54,4 @@ runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10
-terratorch==1.1rc2 # required for PrithviMAE test
+terratorch==1.1rc2 # required for PrithviMAE test
36 changes: 18 additions & 18 deletions requirements/test.txt
@@ -544,42 +544,42 @@ numpy==1.26.4
# tritonclient
# vocos
# xarray
-nvidia-cublas-cu12==12.8.3.14
+nvidia-cublas-cu12==12.8.4.1
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
-nvidia-cuda-cupti-cu12==12.8.57
+nvidia-cuda-cupti-cu12==12.8.90
# via torch
-nvidia-cuda-nvrtc-cu12==12.8.61
+nvidia-cuda-nvrtc-cu12==12.8.93
# via torch
-nvidia-cuda-runtime-cu12==12.8.57
+nvidia-cuda-runtime-cu12==12.8.90
# via torch
-nvidia-cudnn-cu12==9.7.1.26
+nvidia-cudnn-cu12==9.10.2.21
# via torch
-nvidia-cufft-cu12==11.3.3.41
+nvidia-cufft-cu12==11.3.3.83
# via torch
-nvidia-cufile-cu12==1.13.0.11
+nvidia-cufile-cu12==1.13.1.3
# via torch
-nvidia-curand-cu12==10.3.9.55
+nvidia-curand-cu12==10.3.9.90
# via torch
-nvidia-cusolver-cu12==11.7.2.55
+nvidia-cusolver-cu12==11.7.3.90
# via torch
-nvidia-cusparse-cu12==12.5.7.53
+nvidia-cusparse-cu12==12.5.8.93
# via
# nvidia-cusolver-cu12
# torch
-nvidia-cusparselt-cu12==0.6.3
+nvidia-cusparselt-cu12==0.7.1
# via torch
-nvidia-nccl-cu12==2.26.2
+nvidia-nccl-cu12==2.27.3
# via torch
-nvidia-nvjitlink-cu12==12.8.61
+nvidia-nvjitlink-cu12==12.8.93
# via
# nvidia-cufft-cu12
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
# torch
-nvidia-nvtx-cu12==12.8.55
+nvidia-nvtx-cu12==12.8.90
# via torch
omegaconf==2.3.0
# via
@@ -1074,7 +1074,7 @@ tomli==2.2.1
# via schemathesis
tomli-w==1.2.0
# via schemathesis
-torch==2.7.1+cu128
+torch==2.8.0+cu128
# via
# -r requirements/test.in
# accelerate
@@ -1104,7 +1104,7 @@ torch==2.7.1+cu128
# torchvision
# vector-quantize-pytorch
# vocos
-torchaudio==2.7.1+cu128
+torchaudio==2.8.0+cu128
# via
# -r requirements/test.in
# encodec
@@ -1117,7 +1117,7 @@ torchmetrics==1.7.4
# pytorch-lightning
# terratorch
# torchgeo
-torchvision==0.22.1+cu128
+torchvision==0.23.0+cu128
# via
# -r requirements/test.in
# lightly
@@ -1159,7 +1159,7 @@ transformers==4.55.0
# transformers-stream-generator
transformers-stream-generator==0.0.5
# via -r requirements/test.in
-triton==3.3.1
+triton==3.4.0
# via
# mamba-ssm
# torch
2 changes: 1 addition & 1 deletion tests/distributed/test_sequence_parallel.py
@@ -292,7 +292,7 @@ def _compare_sp(
# TODO support other models
# [LANGUAGE GENERATION]
"meta-llama/Llama-3.2-1B-Instruct",
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
]


4 changes: 2 additions & 2 deletions tests/entrypoints/openai/test_vision.py
@@ -36,11 +36,11 @@
],
[
"The image shows a Venn diagram with three over",
"The image shows a Venn diagram with three intersect",
"This image shows a Venn diagram with three over",
],
[
"This image displays a gradient of colors ranging from",
"The image displays a gradient of colors ranging from",
"This image displays a gradient of colors transitioning from",
],
]

6 changes: 5 additions & 1 deletion tests/lora/test_chatglm3_tp.py
@@ -87,6 +87,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
+# https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
+# gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
+# more GPU memory causing vLLM to OOM
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
@@ -95,7 +98,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=True,
-enable_chunked_prefill=True)
+enable_chunked_prefill=True,
+gpu_memory_utilization=0.85)
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
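The new comment ties the lower gpu_memory_utilization to the NCCL bundled with newer torch builds; a quick way to see which NCCL version a given torch wheel ships (a sketch, to be run in the CUDA test environment):

import torch

# Per the requirements/test.txt change above, the old lock pinned nvidia-nccl-cu12==2.26.2
# and the new one pins 2.27.3, which falls in the >= 2.26.3 range referenced in the comment
print(torch.__version__)
print(torch.cuda.nccl.version())  # e.g. (2, 27, 3)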