
Commit 2f31014

downgrade
Signed-off-by: jiang1.li <[email protected]>
1 parent 9502c38

File tree

8 files changed: +51 −20 lines

csrc/cpu/torch_bindings.cpp

Lines changed: 9 additions & 4 deletions

@@ -131,32 +131,37 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 
   // Quantization
 #ifdef __AVX512F__
+  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
   // Compute int8 quantized tensor for given scaling factor.
   ops.def(
       "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
-      "Tensor? azp) -> ()");
+      "Tensor? azp) -> ()",
+      {stride_tag});
   ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
 
   // Compute int8 quantized tensor and scaling factor
   ops.def(
       "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
-      "Tensor!? azp) -> ()");
+      "Tensor!? azp) -> ()",
+      {stride_tag});
   ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
            &dynamic_scaled_int8_quant);
   // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
   // quantization.
   ops.def(
       "cutlass_scaled_mm(Tensor! out, Tensor a,"
       " Tensor b, Tensor a_scales,"
-      " Tensor b_scales, Tensor? bias) -> ()");
+      " Tensor b_scales, Tensor? bias) -> ()",
+      {stride_tag});
   ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
   // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
   // quantization.
   ops.def(
       "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
       " Tensor b, Tensor a_scales,"
       " Tensor b_scales, Tensor azp_adj,"
-      " Tensor? azp, Tensor? bias) -> ()");
+      " Tensor? azp, Tensor? bias) -> ()",
+      {stride_tag});
   ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
 #elif defined(__powerpc64__)
   // Compute int8 quantized tensor for given scaling factor.
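For context: the needs_fixed_stride_order tag tells torch.compile not to reorder the strides of arguments it passes to these custom ops, which the CPU int8 kernels appear to rely on. A minimal Python sketch of the same tagging mechanism, using a hypothetical demo::scaled_copy op rather than vLLM's real bindings:

    import torch

    # Hypothetical op for illustration only; vLLM registers its ops in C++ as
    # above. The tag asks torch.compile to preserve the inputs' stride order.
    torch.library.define(
        "demo::scaled_copy",
        "(Tensor! out, Tensor input, Tensor scale) -> ()",
        tags=(torch.Tag.needs_fixed_stride_order,),
    )

    @torch.library.impl("demo::scaled_copy", "CPU")
    def scaled_copy(out, inp, scale):
        out.copy_(inp * scale)

    out = torch.empty(4)
    torch.ops.demo.scaled_copy(out, torch.ones(4), torch.tensor(2.0))
    print(out)  # tensor([2., 2., 2., 2.])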

docker/Dockerfile.cpu

Lines changed: 20 additions & 13 deletions

@@ -66,7 +66,7 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 WORKDIR /workspace/vllm
 
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
+    --mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
     uv pip install -r requirements/build.txt
 
 COPY . .

@@ -79,6 +79,22 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
 
+######################### TEST DEPS #########################
+FROM base AS vllm-test-deps
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
+    cp requirements/test.in requirements/cpu-test.in && \
+    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
+    sed -i 's/torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
+    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
+    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
+    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install -r requirements/cpu-test.txt
+
 ######################### DEV IMAGE #########################
 FROM vllm-build AS vllm-dev

@@ -97,28 +113,19 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py develop
 
+COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
+
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
-    cp requirements/test.in requirements/test-cpu.in && \
-    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
-    uv pip compile requirements/test-cpu.in -o requirements/test.txt && \
     uv pip install -r requirements/dev.txt && \
     pre-commit install --hook-type pre-commit --hook-type commit-msg
 
 ENTRYPOINT ["bash"]
 
 ######################### TEST IMAGE #########################
-FROM base AS vllm-test
+FROM vllm-test-deps AS vllm-test
 
 WORKDIR /workspace/
 
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
-    cp requirements/test.in requirements/test-cpu.in && \
-    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
-    uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
-    uv pip install -r requirements/cpu-test.txt
-
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
     uv pip install dist/*.whl
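The new vllm-test-deps stage derives a CPU-compatible lockfile from the shared requirements/test.in before compiling it with uv. As a rough illustration (not part of the commit), the sed pipeline amounts to:

    import re

    def to_cpu_test_in(text: str) -> str:
        """Mirror of the sed pipeline: drop mamba_ssm, repin torch for CPU."""
        out = []
        for line in text.splitlines():
            if "mamba_ssm" in line:  # no CPU build available; drop it
                continue
            line = re.sub(r"torch==.*", "torch==2.6.0", line)     # pin CPU torch
            line = re.sub(r"torchaudio.*", "torchaudio", line)    # unpin; resolver
            line = re.sub(r"torchvision.*", "torchvision", line)  # picks a match
            out.append(line)
        return "\n".join(out) + "\n"

    print(to_cpu_test_in("torch==2.7.0\ntorchvision==0.22.0\nmamba_ssm==2.2.4"))
    # torch==2.6.0
    # torchvision

uv pip compile then resolves the rewritten file against CPU wheels (--torch-backend cpu) into requirements/cpu-test.txt, which the dev and test stages both reuse instead of each compiling it separately.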

requirements/cpu-build.txt

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+# Temporarily used for the x86 CPU backend to avoid the performance regression of torch>2.6.0+cpu,
+# see https://github.com/pytorch/pytorch/pull/151218
+cmake>=3.26.1
+ninja
+packaging>=24.2
+setuptools>=77.0.3,<80.0.0
+setuptools-scm>=8
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.6.0+cpu
+wheel
+jinja2>=3.1.6
+regex
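This pins the build environment to the same torch as the runtime pin in requirements/cpu.txt, so the wheel is compiled against the ABI it will run with. A quick post-install sanity check might look like this (illustrative; not part of the commit):

    import torch

    # Expect the 2.6.0+cpu wheel from the PyTorch CPU index.
    assert torch.__version__.startswith("2.6.0"), torch.__version__
    assert torch.version.cuda is None  # +cpu wheels ship without CUDA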

requirements/cpu.txt

Lines changed: 3 additions & 2 deletions

@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9'
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.7.0+cpu; platform_machine == "x86_64"
+torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has a performance regression on x86, see https://github.com/pytorch/pytorch/pull/151218
 torch==2.7.0; platform_system == "Darwin"
 torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"

@@ -23,6 +23,7 @@ datasets # for benchmark scripts
 
 # Intel Extension for PyTorch, only for x86_64 CPUs
 intel-openmp==2024.2.1; platform_machine == "x86_64"
-intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
+intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has a performance regression on x86, see https://github.com/pytorch/pytorch/pull/151218
 py-libnuma; platform_system != "Darwin"
 psutil; platform_system != "Darwin"
+triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported by torch.compile.
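The triton pin backs the inline comment above: on this stack, torch.compile imports Triton, so a CPU install without it would break compiled paths. A minimal smoke test (illustrative):

    import torch

    @torch.compile
    def relu_twice(x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x) * 2

    # Per the requirement comment above, this compile path needs triton on
    # torch 2.6+cpu; with triton==3.2.0 installed it runs cleanly.
    print(relu_twice(torch.randn(8)))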

tests/models/multimodal/generation/test_common.py

Lines changed: 1 addition & 0 deletions

@@ -107,6 +107,7 @@
         ),
         limit_mm_per_prompt={"image": 4},
     )],
+    dtype="bfloat16" if current_platform.is_cpu() else "auto",
     marks=[pytest.mark.core_model, pytest.mark.cpu_model],
 ),
 "paligemma": VLMTestInfo(

tests/models/multimodal/generation/vlm_utils/builders.py

Lines changed: 3 additions & 0 deletions

@@ -203,6 +203,9 @@ def build_embedding_inputs_from_test_info(
 
     images = [asset.pil_image for asset in image_assets]
     embeds = test_info.convert_assets_to_embeddings(image_assets)
+    if test_info.dtype != "auto":
+        dtype = getattr(torch, test_info.dtype)  # type: ignore
+        embeds = [e.to(dtype=dtype) for e in embeds]
     assert len(images) == len(model_prompts)
 
     inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
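Together with the test_common.py change above, this routes CPU runs through bfloat16 embeddings. The string-to-dtype hop is a plain getattr on the torch module; a standalone sketch:

    import torch

    dtype_name = "bfloat16"             # VLMTestInfo.dtype on CPU
    dtype = getattr(torch, dtype_name)  # -> torch.bfloat16
    embeds = [torch.randn(2, 4096)]     # stand-in for real image embeddings
    embeds = [e.to(dtype=dtype) for e in embeds]
    assert embeds[0].dtype is torch.bfloat16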

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 2 additions & 0 deletions

@@ -54,6 +54,8 @@
 if is_rocm_aiter_moe_enabled():
     from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
         rocm_aiter_grouped_topk as grouped_topk)
+elif current_platform.is_cpu():
+    pass
 else:
     from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
 if current_platform.is_tpu():
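The empty CPU branch leaves grouped_topk unbound rather than importing the GPU-oriented fused_moe module, so a stray call on CPU fails fast with a NameError instead of dragging in kernel dependencies. Shape of the guard (names as in the hunk above):

    from vllm.platforms import current_platform

    if current_platform.is_cpu():
        pass  # grouped_topk stays unbound; CPU paths must not reach it
    else:
        from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk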

vllm/model_executor/layers/quantization/ipex_quant.py

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
 from vllm.platforms import current_platform
 
-MIN_IPEX_VERSION = "2.7.0"
+MIN_IPEX_VERSION = "2.4.0"
 
 
 class IPEXConfig(QuantizationConfig):
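Lowering MIN_IPEX_VERSION to 2.4.0 lets the intel_extension_for_pytorch==2.6.0 pin from requirements/cpu.txt pass the quantization backend's version gate. A typical shape for such a gate (illustrative; the exact check in ipex_quant.py may differ):

    from packaging import version

    MIN_IPEX_VERSION = "2.4.0"

    def check_ipex_version(installed: str) -> None:
        # Reject IPEX builds older than the supported floor.
        if version.parse(installed) < version.parse(MIN_IPEX_VERSION):
            raise ImportError(
                f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} required, "
                f"got {installed}")

    check_ipex_version("2.6.0")  # the version pinned in requirements/cpu.txt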
