Merged: Changes from all commits (50 commits)
04f2cfc
Remove duplicate code from dbrx.py (#17550)
sstamenk May 1, 2025
173daac
[Bug]change the position of cuda_graph_sizes in dataclasses (#17548)
CXIAAAAA May 1, 2025
9b70e2b
[Misc][Tools][Benchmark] Publish script to auto tune server parameter…
Chenyaaang May 1, 2025
39c0813
[V1][Spec Decode] Apply torch.compile & cudagraph to EAGLE3 (#17504)
zixi-qi May 1, 2025
24aebae
[Bugfix] Disable gptq_bitblas for <SM80 to fix GPTQ on V100/T4 (#17541)
mgoin May 2, 2025
afb12e4
[Doc] note that not all unit tests pass on CPU platforms (#17554)
davidxia May 2, 2025
afcb3f8
[Attention] MLA move o_proj q_proj into cuda-graph region (#17484)
LucasWilkinson May 2, 2025
292fc59
[CI] Actually run tests/kv_transfer/test_disagg.py in CI (#17555)
mgoin May 2, 2025
b4003d1
Check if bitblas is installed during support check (#17572)
mgoin May 2, 2025
f89d0e1
[Misc] Continue refactoring model tests (#17573)
DarkLight1337 May 2, 2025
f192ca9
Fix PixtralHF missing spatial_merge_size (#17571)
mgoin May 2, 2025
109e15a
Add `pt_load_map_location` to allow loading to cuda (#16869)
jerryzh168 May 2, 2025
9e2de9b
[Bugfix] Remove TritonPlaceholder from sys.modules (#17317)
Isotr0py May 2, 2025
cc2a77d
[Core] [Bugfix] Add Input Embeddings (#15428)
qthequartermasterman May 2, 2025
c777df7
[BugFix] Fix Memory Leak (#17567)
robertgshaw2-redhat May 2, 2025
d754386
[Misc] Rename assets for testing (#17575)
DarkLight1337 May 2, 2025
b8b0859
add more pytorch related tests for torch nightly (#17422)
yangw-dev May 2, 2025
6d1479c
[doc] add the print result (#17584)
reidliu41 May 2, 2025
785d75a
Automatically tell users that dict args must be valid JSON in CLI (#1…
hmellor May 2, 2025
99404f5
[Security] Fix image hash collision (#17378)
DarkLight1337 May 2, 2025
868c546
Support W8A8 INT8 MoE for compressed-tensors (#16745)
mgoin May 2, 2025
3a500cd
[doc] add missing result (#17589)
reidliu41 May 2, 2025
cb23495
[Misc] Clean up input processing (#17582)
DarkLight1337 May 2, 2025
4c33d67
[Bugfix] fix tmp_out and exp_sums dimensions (#17438)
hliuca May 2, 2025
0f87d8f
[BugFix][Attention] Fix sliding window attention in V1 giving incorre…
LucasWilkinson May 2, 2025
3e887d2
permute/unpermute kernel for moe optimization (#14568)
CalebDu May 2, 2025
182f40e
Add NVIDIA TensorRT Model Optimizer in vLLM documentation (#17561)
Edwardf0t1 May 2, 2025
9352cdb
[Hardware][AMD] Improve OAM device ID + llama4 Maverick MOE tuning (#…
xw285cornell May 2, 2025
b90b085
[easy] Print number of needed GPUs in skip message (#17594)
zou3519 May 2, 2025
9b103a1
fix typo in logging (#17605)
ehartford May 3, 2025
3ec97e2
[release] Add command to clean up Docker containers/images in TPU rel…
khluu May 3, 2025
22c6f63
[Neuron][Build] Require setuptools >= 77.0.3 for PEP 639 (#17603)
liangfu May 3, 2025
d47b605
Update test requirements to CUDA 12.8 (#17576)
22quinn May 3, 2025
e3d0a1d
[Quantization] [AMD] Add support for running DeepSeek int8 w8a8 MoE on…
rasmith May 3, 2025
87baebe
[Frontend][TPU] Add TPU default max-num-batched-tokens based on devic…
Chenyaaang May 3, 2025
c8386fa
[Build/CI] Upgrade CUTLASS to 3.9.1 (#17602)
tlrmchlsmth May 3, 2025
a928424
[Bugfix][ROCm] Using device_type because on ROCm the API is still tor…
gshtras May 3, 2025
887d7af
[Core] Gate `prompt_embeds` behind a feature flag (#17607)
DarkLight1337 May 3, 2025
f66f1e0
[Bugfix] Fix broken Qwen2.5-omni tests (#17613)
Isotr0py May 3, 2025
46fae69
[Misc] V0 fallback for `--enable-prompt-embeds` (#17615)
DarkLight1337 May 3, 2025
d6484ef
Add full API docs and improve the UX of navigating them (#17485)
hmellor May 4, 2025
2858830
[Bugfix] Prioritize dtype in root config before checking text config …
DarkLight1337 May 4, 2025
68e1ee0
[Bugfix][Easy] Fix whitespace in shm_broadcast.py logging (#17635)
tlrmchlsmth May 5, 2025
5394ad7
[Bugfix] fix KeyError when top logprobs are special tokens (#17637)
chaunceyjiang May 5, 2025
f62cad6
[Build/CI] Upgrade CUTLASS to 3.9.2 (#17641)
tlrmchlsmth May 5, 2025
1d0c9d6
[Kernel] some optimizations for dense marlin and moe marlin (#16850)
jinzhen-lin May 5, 2025
cc05b90
[Doc] Fix broken cuda installation doc rendering (#17654)
Isotr0py May 5, 2025
aea302b
Use git-path commit in hook (#17616)
thomasjpfan May 5, 2025
d3efde8
[Benchmarks] Remove invalid option under V1 engine (#17651)
russellb May 5, 2025
5ea5c51
[BugFix] Increase timeout for startup failure test (#17642)
njhill May 5, 2025
1 change: 1 addition & 0 deletions .buildkite/release-pipeline.yaml
@@ -57,6 +57,7 @@ steps:
agents:
queue: tpu_queue_postmerge
commands:
- "yes | docker system prune -a"
- "git fetch --all"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
6 changes: 5 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -39,7 +39,7 @@ steps:
- pip install -r ../../requirements/docs.txt
- SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/api/inference_params.html
- grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html

- label: Async Engine, Inputs, Utils, Worker Test # 24min
source_file_dependencies:
@@ -293,6 +293,7 @@ steps:
parallelism: 4

- label: PyTorch Compilation Unit Tests
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
@@ -302,6 +303,7 @@
- pytest -v -s compile/test_sequence_parallelism.py

- label: PyTorch Fullgraph Smoke Test # 9min
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
@@ -312,6 +314,7 @@
- pytest -v -s compile/piecewise/test_toy_llama.py

- label: PyTorch Fullgraph Test # 18min
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
@@ -436,6 +439,7 @@ steps:
##### models test #####

- label: Basic Models Test # 24min
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models
1 change: 1 addition & 0 deletions .gitignore
@@ -80,6 +80,7 @@ instance/
# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/
docs/source/api/vllm

# PyBuilder
.pybuilder/
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -46,7 +46,7 @@ repos:
rev: 0.6.17
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match]
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
files: ^requirements/test\.(in|txt)$
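The new flags pin the compiled test lockfile to the CUDA 12.8 torch wheels. A hedged local equivalent of what the hook runs (assuming it wraps uv's pip-compile, as the hook id and rev suggest):

uv pip compile requirements/test.in -o requirements/test.txt \
  --index-strategy unsafe-best-match --torch-backend cu128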
- repo: local
hooks:
@@ -101,8 +101,8 @@ repos:
args:
- -c
- |
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
fi
language: system
verbose: true
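The hard-coded .git/COMMIT_EDITMSG breaks in linked worktrees, where .git is a file pointing at the real git directory; git rev-parse --git-path resolves the correct location in both layouts. A minimal illustration (repository paths are hypothetical):

$ cat .git                                  # in a linked worktree, .git is a file
gitdir: /repo/.git/worktrees/feature
$ git rev-parse --git-path COMMIT_EDITMSG
/repo/.git/worktrees/feature/COMMIT_EDITMSG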
69 changes: 62 additions & 7 deletions CMakeLists.txt
@@ -15,7 +15,6 @@ project(vllm_extensions LANGUAGES CXX)

# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")

message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

@@ -250,9 +249,8 @@ set(VLLM_EXT_SRC
if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
# Please keep this in sync with FetchContent_Declare line below.
set(CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")

# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
Expand All @@ -270,7 +268,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG v3.9.0
GIT_TAG ${CUTLASS_REVISION}
GIT_PROGRESS TRUE

# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -303,8 +301,52 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
if (MARLIN_ARCHS)

#
# For the Marlin kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)

message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")

if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
)

if (NOT marlin_generation_result EQUAL 0)
message(FATAL_ERROR "Marlin generation failed."
" Result: \"${marlin_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
else()
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
CACHE STRING "Last run Marlin generate script hash" FORCE)
message(STATUS "Marlin generation completed successfully.")
endif()
else()
message(STATUS "Marlin generation script has not changed, skipping generation.")
endif()

file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}")

list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})

set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@@ -646,7 +688,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
@@ -682,6 +724,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()

if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_PERMUTE_SRC
"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
"csrc/moe/moe_permute_unpermute_op.cu")

set_gencode_flags_for_srcs(
SRCS "${MARLIN_PERMUTE_SRC}"
CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")

list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
endif()
message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
_moe_C
@@ -690,6 +743,8 @@ define_gpu_extension_target(
SOURCES ${VLLM_MOE_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)
