Skip to content

Commit 8b4bb22

Browse files
dbyoung18 and jikunshang
authored and committed
[Refactor]Abstract Platform Interface for Distributed Backend and Add xccl Support for Intel XPU (vllm-project#19410)
Signed-off-by: dbyoung18 <[email protected]> Signed-off-by: Kunshang Ji <[email protected]> Co-authored-by: Kunshang Ji <[email protected]>
1 parent c97e122 commit 8b4bb22

File tree

17 files changed

+44
-8
lines changed

17 files changed

+44
-8
lines changed

docs/getting_started/installation/gpu/xpu.inc.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,4 +81,9 @@ python -m vllm.entrypoints.openai.api_server \
8181
By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
8282

8383
# --8<-- [end:supported-features]
84+
# --8<-- [start:distributed-backend]
85+
86+
XPU platform uses **torch-ccl** for torch<2.8 and **xccl** for torch>=2.8 as distributed backend, since torch 2.8 supports **xccl** as built-in backend for XPU.
87+
88+
# --8<-- [end:distributed-backend]
8489
# --8<-- [end:extra-information]

vllm/platforms/__init__.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from typing import TYPE_CHECKING, Optional
88

99
from vllm.plugins import load_plugins_by_group
10-
from vllm.utils import resolve_obj_by_qualname
10+
from vllm.utils import resolve_obj_by_qualname, supports_xccl
1111

1212
from .interface import _Backend # noqa: F401
1313
from .interface import CpuArchEnum, Platform, PlatformEnum
@@ -139,10 +139,19 @@ def xpu_platform_plugin() -> Optional[str]:
139139
try:
140140
# installed IPEX if the machine has XPUs.
141141
import intel_extension_for_pytorch # noqa: F401
142-
import oneccl_bindings_for_pytorch # noqa: F401
143142
import torch
143+
if supports_xccl():
144+
dist_backend = "xccl"
145+
else:
146+
dist_backend = "ccl"
147+
import oneccl_bindings_for_pytorch # noqa: F401
148+
144149
if hasattr(torch, 'xpu') and torch.xpu.is_available():
145150
is_xpu = True
151+
from vllm.platforms.xpu import XPUPlatform
152+
XPUPlatform.dist_backend = dist_backend
153+
logger.debug("Confirmed %s backend is available.",
154+
XPUPlatform.dist_backend)
146155
logger.debug("Confirmed XPU platform is available.")
147156
except Exception as e:
148157
logger.debug("XPU platform is not available because: %s", str(e))

vllm/platforms/cpu.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class CpuPlatform(Platform):
3737
device_name: str = "cpu"
3838
device_type: str = "cpu"
3939
dispatch_key: str = "CPU"
40+
dist_backend: str = "gloo"
4041

4142
@property
4243
def supported_dtypes(self) -> list[torch.dtype]:

vllm/platforms/cuda.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ class CudaPlatformBase(Platform):
5656
device_type: str = "cuda"
5757
dispatch_key: str = "CUDA"
5858
ray_device_key: str = "GPU"
59+
dist_backend: str = "nccl"
5960
device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
6061

6162
@property

vllm/platforms/hpu.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class HpuPlatform(Platform):
2626
device_type: str = "hpu"
2727
dispatch_key: str = "HPU"
2828
ray_device_key: str = "HPU"
29+
dist_backend: str = "hccl"
2930
device_control_env_var: str = "HABANA_VISIBLE_MODULES"
3031

3132
@classmethod

vllm/platforms/interface.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,9 @@ class Platform:
129129
# compilation strategy.
130130
simple_compile_backend: str = "inductor"
131131

132+
# The backend used for distributed communication.
133+
dist_backend: str = ""
134+
132135
supported_quantization: list[str] = []
133136

134137
additional_env_vars: list[str] = []

vllm/platforms/neuron.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class NeuronPlatform(Platform):
3030
device_type: str = "neuron"
3131
ray_device_key: str = "neuron_cores"
3232
supported_quantization: list[str] = ["neuron_quant", "fbgemm_fp8"]
33+
dist_backend: str = "gloo"
3334
device_control_env_var: str = "NEURON_RT_VISIBLE_CORES"
3435

3536
@classmethod

vllm/platforms/rocm.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ class RocmPlatform(Platform):
164164
device_type: str = "cuda"
165165
dispatch_key: str = "CUDA"
166166
ray_device_key: str = "GPU"
167+
dist_backend: str = "nccl"
167168
# rocm shares the same device control env var as CUDA
168169
device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
169170

vllm/platforms/tpu.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class TpuPlatform(Platform):
3131
device_type: str = "tpu"
3232
dispatch_key: str = "XLA"
3333
ray_device_key: str = "TPU"
34+
dist_backend: str = "gloo"
3435
device_control_env_var: str = "TPU_VISIBLE_CHIPS"
3536
simple_compile_backend: str = "openxla"
3637

vllm/platforms/xpu.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class XPUPlatform(Platform):
2929
# Intel XPU's device key is "GPU" for Ray.
3030
# see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501
3131
ray_device_key: str = "GPU"
32+
dist_backend: str = "ccl" # ccl | xccl
3233
device_control_env_var: str = "ONEAPI_DEVICE_SELECTOR"
3334

3435
@classmethod

0 commit comments

Comments (0)