@@ -15,8 +15,8 @@
 from vllm.sequence import ExecuteModelRequest
 from vllm.triton_utils import maybe_set_triton_cache_manager
 from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
-                        get_distributed_init_method, get_open_port,
-                        get_vllm_instance_id, make_async,
+                        cuda_is_initialized, get_distributed_init_method,
+                        get_open_port, get_vllm_instance_id, make_async,
                         update_environment_variables)

 logger = init_logger(__name__)
@@ -122,6 +122,13 @@ def _check_executor_parameters(self):
             "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
         })

+        if (cuda_is_initialized()
+                and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
+            logger.warning("CUDA was previously initialized. We must use "
+                           "the `spawn` multiprocessing start method. Setting "
+                           "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.")
+            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
         cuda_device_count = cuda_device_count_stateless()
         # Use confusing message for more common TP-only case.
         assert tensor_parallel_size <= cuda_device_count, (
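Rationale for the guard: a CUDA context does not survive a `fork`, so if the parent process has already initialized the CUDA runtime, `fork`-started workers inherit a broken context; forcing the `spawn` start method launches fresh interpreters instead. Below is a minimal sketch of what a `cuda_is_initialized` probe can look like, assuming PyTorch as the backend; the actual helper in `vllm.utils` may differ in detail, and `torch.cuda._is_compiled()` is a private API used here only to guard CPU-only builds.

import torch

def cuda_is_initialized() -> bool:
    # Sketch only: skip CPU-only builds, then ask PyTorch whether the CUDA
    # runtime has already been set up in this process.
    if not torch.cuda._is_compiled():
        return False
    return torch.cuda.is_initialized()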