[CPU] Fix torch version in x86 CPU backend #19258


Merged: 3 commits, Jun 26, 2025
13 changes: 9 additions & 4 deletions csrc/cpu/torch_bindings.cpp
@@ -131,32 +131,37 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

// Quantization
#ifdef __AVX512F__
at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
// Compute int8 quantized tensor for given scaling factor.
ops.def(
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
"Tensor? azp) -> ()");
"Tensor? azp) -> ()",
{stride_tag});
ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);

// Compute int8 quantized tensor and scaling factor
ops.def(
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
"Tensor!? azp) -> ()");
"Tensor!? azp) -> ()",
{stride_tag});
ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
&dynamic_scaled_int8_quant);
// W8A8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization.
ops.def(
"cutlass_scaled_mm(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()");
" Tensor b_scales, Tensor? bias) -> ()",
{stride_tag});
ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
// w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
// quantization.
ops.def(
"cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor azp_adj,"
" Tensor? azp, Tensor? bias) -> ()");
" Tensor? azp, Tensor? bias) -> ()",
{stride_tag});
ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
#elif defined(__powerpc64__)
// Compute int8 quantized tensor for given scaling factor.
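The `needs_fixed_stride_order` tag asks the compiler/dispatcher to preserve the input strides produced upstream instead of reordering them, which is what these quantization ops rely on. A quick way to confirm the tag is attached to the registered schemas, as a sketch that assumes an AVX-512 CPU build with the extension exposed as `torch.ops._C`:

```python
import torch
import vllm._C  # noqa: F401 -- assumed: importing this registers the CPU ops under torch.ops._C

# Each registered overload exposes the tags it was defined with.
for name in ("static_scaled_int8_quant", "dynamic_scaled_int8_quant",
             "cutlass_scaled_mm", "cutlass_scaled_mm_azp"):
    op = getattr(torch.ops._C, name).default
    assert torch.Tag.needs_fixed_stride_order in op.tags, f"{name} is missing the tag"
```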
33 changes: 20 additions & 13 deletions docker/Dockerfile.cpu
@@ -66,7 +66,7 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
WORKDIR /workspace/vllm

RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
--mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
uv pip install -r requirements/build.txt

COPY . .
@@ -79,6 +79,22 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel

######################### TEST DEPS #########################
FROM base AS vllm-test-deps

WORKDIR /workspace/vllm

RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/cpu-test.in && \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
sed -i 's/torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu

RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/cpu-test.txt

######################### DEV IMAGE #########################
FROM vllm-build AS vllm-dev

@@ -97,28 +113,19 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py develop

COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt

RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/test-cpu.in && \
sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
uv pip compile requirements/test-cpu.in -o requirements/test.txt && \
uv pip install -r requirements/dev.txt && \
pre-commit install --hook-type pre-commit --hook-type commit-msg

ENTRYPOINT ["bash"]

######################### TEST IMAGE #########################
FROM base AS vllm-test
FROM vllm-test-deps AS vllm-test

WORKDIR /workspace/

RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/test-cpu.in && \
sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
uv pip install -r requirements/cpu-test.txt

RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl
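For readers not fluent in sed, the pipeline in the new `vllm-test-deps` stage rewrites `requirements/test.in` before compiling it: drop `mamba_ssm` (no CPU build), pin `torch` to 2.6.0, and relax the `torchaudio`/`torchvision` pins so the resolver can pick versions compatible with the older torch. A rough Python equivalent, shown only to illustrate the intent (the image itself uses sed plus `uv pip compile`):

```python
import re
from pathlib import Path

lines = Path("requirements/test.in").read_text().splitlines()
rewritten = []
for line in lines:
    if "mamba_ssm" in line:
        continue                                          # dropped: no CPU wheel available
    line = re.sub(r"torch==.*", "torch==2.6.0", line)      # pin torch for the x86 CPU backend
    line = re.sub(r"torchaudio.*", "torchaudio", line)     # let the resolver pick a matching version
    line = re.sub(r"torchvision.*", "torchvision", line)
    rewritten.append(line)
Path("requirements/cpu-test.in").write_text("\n".join(rewritten) + "\n")
```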
12 changes: 12 additions & 0 deletions requirements/cpu-build.txt
@@ -0,0 +1,12 @@
# Temporarily used for x86 CPU backend to avoid performance regression of torch>2.6.0+cpu,
# see https://github.com/pytorch/pytorch/pull/151218
cmake>=3.26.1
ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu
wheel
jinja2>=3.1.6
regex
5 changes: 3 additions & 2 deletions requirements/cpu.txt
@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9'
packaging>=24.2
setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.7.0+cpu; platform_machine == "x86_64"
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
torch==2.7.0; platform_system == "Darwin"
torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"

@@ -23,6 +23,7 @@ datasets # for benchmark scripts

# Intel Extension for PyTorch, only for x86_64 CPUs
intel-openmp==2024.2.1; platform_machine == "x86_64"
intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
py-libnuma; platform_system != "Darwin"
psutil; platform_system != "Darwin"
triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
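The torch and IPEX pins have to move in lockstep, so it is worth confirming that an installed x86 environment actually reports the 2.6.0 series for both. A minimal sanity check, assuming the packages were installed from `requirements/cpu.txt`:

```python
import torch
import intel_extension_for_pytorch as ipex

# Both stay on the 2.6.0 series until the upstream regression
# (pytorch/pytorch#151218) is resolved and the pins are lifted.
assert torch.__version__.startswith("2.6.0"), torch.__version__
assert ipex.__version__.startswith("2.6.0"), ipex.__version__
```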
2 changes: 2 additions & 0 deletions tests/models/multimodal/generation/test_common.py
@@ -107,6 +107,8 @@
),
limit_mm_per_prompt={"image": 4},
)],
# TODO: Revert to "auto" when CPU backend can use torch > 2.6
dtype="bfloat16" if current_platform.is_cpu() else "auto",
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"paligemma": VLMTestInfo(
3 changes: 3 additions & 0 deletions tests/models/multimodal/generation/vlm_utils/builders.py
@@ -203,6 +203,9 @@ def build_embedding_inputs_from_test_info(

images = [asset.pil_image for asset in image_assets]
embeds = test_info.convert_assets_to_embeddings(image_assets)
if test_info.dtype != "auto":
dtype = getattr(torch, test_info.dtype) # type: ignore
embeds = [e.to(dtype=dtype) for e in embeds]
assert len(images) == len(model_prompts)

inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
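With `dtype` now pinned to `"bfloat16"` on CPU in `test_common.py`, the precomputed image embeddings must be cast to match, which is what the new branch above does. A standalone illustration of that cast using stand-in tensors rather than the real embeddings:

```python
import torch

embeds = [torch.randn(4, 16)]   # stand-in for test_info.convert_assets_to_embeddings(...)
test_dtype = "bfloat16"         # what the CPU test configuration now requests
if test_dtype != "auto":
    dtype = getattr(torch, test_dtype)
    embeds = [e.to(dtype=dtype) for e in embeds]
assert all(e.dtype is torch.bfloat16 for e in embeds)
```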
2 changes: 2 additions & 0 deletions vllm/model_executor/layers/fused_moe/layer.py
@@ -54,6 +54,8 @@
if is_rocm_aiter_moe_enabled():
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501
rocm_aiter_grouped_topk as grouped_topk)
elif current_platform.is_cpu():
pass
else:
from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
if current_platform.is_tpu():
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/ipex_quant.py
@@ -15,7 +15,7 @@
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.platforms import current_platform

MIN_IPEX_VERSION = "2.7.0"
MIN_IPEX_VERSION = "2.6.0"


class IPEXConfig(QuantizationConfig):