@@ -15,8 +15,8 @@
 from vllm.sequence import ExecuteModelRequest
 from vllm.triton_utils import maybe_set_triton_cache_manager
 from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
-                        get_distributed_init_method, get_open_port,
-                        get_vllm_instance_id, make_async,
+                        cuda_is_initialized, get_distributed_init_method,
+                        get_open_port, get_vllm_instance_id, make_async,
                         update_environment_variables)

 logger = init_logger(__name__)
@@ -122,6 +122,13 @@ def _check_executor_parameters(self):
             "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
         })

+        if (cuda_is_initialized()
+                and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
+            logger.warning("CUDA was previously initialized. We must use "
+                           "the `spawn` multiprocessing start method. Setting "
+                           "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.")
+            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
         cuda_device_count = cuda_device_count_stateless()
         # Use confusing message for more common TP-only case.
         assert tensor_parallel_size <= cuda_device_count, (
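Rationale for the guard: a CUDA context does not survive a `fork`, so if the parent process has already initialized the CUDA runtime, `fork`-started workers inherit a broken context; forcing the `spawn` start method launches fresh interpreters instead. Below is a minimal sketch of what a `cuda_is_initialized` probe can look like, assuming PyTorch as the backend; the actual helper in `vllm.utils` may differ in detail, and `torch.cuda._is_compiled()` is a private API used here only to guard CPU-only builds.

import torch

def cuda_is_initialized() -> bool:
    # Sketch only: skip CPU-only builds, then ask PyTorch whether the CUDA
    # runtime has already been set up in this process.
    if not torch.cuda._is_compiled():
        return False
    return torch.cuda.is_initialized()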