
Commit ebd377e

rahul-tuli authored and dsikka committed
[BugFix] Explicitly set gpu_memory_utilization (vllm-project#1560)
We started seeing the following failure with the vLLM Phi-3 kv_cache quantization e2e test starting from version 0.9.1, using one 80 GB A100:

```bash
2025-06-17T00:26:38.269335-0400 | test_vllm | INFO - ================= RUNNING vLLM =========================
INFO 06-17 00:26:46 [config.py:823] This model supports multiple tasks: {'classify', 'generate', 'embed', 'score', 'reward'}. Defaulting to 'generate'.
INFO 06-17 00:26:46 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.
WARNING 06-17 00:26:46 [utils.py:2597] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reason: CUDA is initialized
WARNING 06-17 00:26:47 [env_override.py:17] NCCL_CUMEM_ENABLE is set to 0, skipping override. This may increase memory overhead with cudagraph+allreduce: NVIDIA/nccl#1234
INFO 06-17 00:26:49 [__init__.py:244] Automatically detected platform cuda.
INFO 06-17 00:26:53 [core.py:455] Waiting for init message from front-end.
INFO 06-17 00:26:53 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='Phi-3-mini-4k-instruct-kv_cache_default_phi3', speculative_config=None, tokenizer='Phi-3-mini-4k-instruct-kv_cache_default_phi3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=Phi-3-mini-4k-instruct-kv_cache_default_phi3, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":["none"],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":512,"local_cache_dir":null}
WARNING 06-17 00:26:53 [utils.py:2737] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f4cc29e7f70>
ERROR 06-17 00:26:53 [core.py:515] EngineCore failed to start.
ERROR 06-17 00:26:53 [core.py:515] Traceback (most recent call last):
ERROR 06-17 00:26:53 [core.py:515]   File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 506, in run_engine_core
ERROR 06-17 00:26:53 [core.py:515]     engine_core = EngineCoreProc(*args, **kwargs)
ERROR 06-17 00:26:53 [core.py:515]   File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 390, in __init__
ERROR 06-17 00:26:53 [core.py:515]     super().__init__(vllm_config, executor_class, log_stats,
ERROR 06-17 00:26:53 [core.py:515]   File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 76, in __init__
ERROR 06-17 00:26:53 [core.py:515]     self.model_executor = executor_class(vllm_config)
ERROR 06-17 00:26:53 [core.py:515]   File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 53, in __init__
ERROR 06-17 00:26:53 [core.py:515]     self._init_executor()
ERROR 06-17 00:26:53 [core.py:515]   File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 47, in _init_executor
ERROR 06-17 00:26:53 [core.py:515]     self.collective_rpc("init_device")
ERROR 06-17 00:26:53 [core.py:515]   File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 57, in collective_rpc
ERROR 06-17 00:26:53 [core.py:515]     answer = run_method(self.driver_worker, method, args, kwargs)
ERROR 06-17 00:26:53 [core.py:515]   File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/utils.py", line 2671, in run_method
ERROR 06-17 00:26:53 [core.py:515]     return func(*args, **kwargs)
ERROR 06-17 00:26:53 [core.py:515]   File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 606, in init_device
ERROR 06-17 00:26:53 [core.py:515]     self.worker.init_device()  # type: ignore
ERROR 06-17 00:26:53 [core.py:515]   File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 140, in init_device
ERROR 06-17 00:26:53 [core.py:515]     raise ValueError(
ERROR 06-17 00:26:53 [core.py:515] ValueError: Free memory on device (70.82/79.25 GiB) on startup is less than desired GPU memory utilization (0.9, 71.33 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
---------------------------------------------------------------------------- Captured stderr call ----------------------------------------------------------------------------
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00, 1.80it/s]
100%|██████████| 391/391 [00:00<00:00, 645912.90it/s]
Calibrating weights: 100%|██████████| 391/391 [00:00<00:00, 334961.78it/s]
Calibrating: 100%|██████████| 256/256 [00:28<00:00, 9.00it/s]
Compressing model: 391it [00:00, 957928.07it/s]
Process EngineCore_0:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 519, in run_engine_core
    raise e
  File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 506, in run_engine_core
    engine_core = EngineCoreProc(*args, **kwargs)
  File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 390, in __init__
    super().__init__(vllm_config, executor_class, log_stats,
  File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 76, in __init__
    self.model_executor = executor_class(vllm_config)
  File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 53, in __init__
    self._init_executor()
  File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 47, in _init_executor
    self.collective_rpc("init_device")
  File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 57, in collective_rpc
    answer = run_method(self.driver_worker, method, args, kwargs)
  File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/utils.py", line 2671, in run_method
    return func(*args, **kwargs)
  File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 606, in init_device
    self.worker.init_device()  # type: ignore
  File "/home/rahul/llm-compressor/.venv/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 140, in init_device
    raise ValueError(
ValueError: Free memory on device (70.82/79.25 GiB) on startup is less than desired GPU memory utilization (0.9, 71.33 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
============================================================================== warnings summary ==============================================================================
tests/e2e/vLLM/test_vllm.py::TestvLLM::test_vllm[tests/e2e/vLLM/configs/kv_cache_phi3.yaml]
  /home/rahul/llm-compressor/src/llmcompressor/pytorch/__init__.py:19: UserWarning: torch.compile is not supported by llmcompressor for torch 2.0.x
    warnings.warn(
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
========================================================================== short test summary info ===========================================================================
FAILED tests/e2e/vLLM/test_vllm.py::TestvLLM::test_vllm[tests/e2e/vLLM/configs/kv_cache_phi3.yaml] - RuntimeError: Engine core initialization failed. See root cause above.
```

Explicitly setting gpu_memory_utilization to 0.8 fixes our test.
This is a temporary solution until we ascertain why vLLM asks for more memory now compared to version 0.9.0.

```
CUDA_VISIBLE_DEVICES=0 pytest -v /home/rahul/llm-compressor/tests/e2e/vLLM/test_vllm.py
============================================================================ test session starts =============================================================================
platform linux -- Python 3.10.12, pytest-8.4.0, pluggy-1.6.0 -- /home/rahul/llm-compressor/.venv/bin/python3
cachedir: .pytest_cache
rootdir: /home/rahul/llm-compressor
configfile: pyproject.toml
plugins: anyio-4.9.0, rerunfailures-15.1, mock-3.14.1
collected 1 item

tests/e2e/vLLM/test_vllm.py::TestvLLM::test_vllm[tests/e2e/vLLM/configs/kv_cache_phi3.yaml] PASSED [100%]

===================================================================== warnings summary ======================================================================
tests/e2e/vLLM/test_vllm.py::TestvLLM::test_vllm[tests/e2e/vLLM/configs/kv_cache_phi3.yaml]
  /home/rahul/llm-compressor/src/llmcompressor/pytorch/__init__.py:19: UserWarning: torch.compile is not supported by llmcompressor for torch 2.0.x
    warnings.warn(
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
========================================================= 1 passed, 1 warning in 130.84s (0:02:10) ==========================================================
```

---------

Signed-off-by: Rahul Tuli <[email protected]>
Signed-off-by: Rahul <[email protected]>
Co-authored-by: Dipika Sikka <[email protected]>
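For context on the numbers in the error above, the following is a rough sketch of the check that fails at engine startup. The `fits_on_gpu` helper is hypothetical and only illustrates the arithmetic; it is not vLLM's actual implementation, although the free/total figures come straight from the failing log.

```python
import torch


def fits_on_gpu(gpu_memory_utilization: float, device: int = 0) -> bool:
    """Hypothetical helper: would the requested fraction of total GPU memory
    fit within the memory that is currently free on the device?"""
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
    return gpu_memory_utilization * total_bytes <= free_bytes


# Numbers from the failing run (80 GB A100, 70.82 GiB free of 79.25 GiB total):
#   0.9 * 79.25 GiB ~= 71.33 GiB  > 70.82 GiB free  -> engine refuses to start
#   0.8 * 79.25 GiB ~= 63.40 GiB  < 70.82 GiB free  -> engine starts
```

The roughly 8.4 GiB already in use at startup (79.25 minus 70.82) is presumably left behind by the calibration and compression steps that run earlier in the same test process, which is why the default utilization of 0.9 no longer fits.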
1 parent f6dce84 commit ebd377e

File tree: 2 files changed (+12, -4 lines)

2 files changed

+12
-4
lines changed

tests/e2e/vLLM/configs/kv_cache_phi3.yaml

Lines changed: 2 additions & 1 deletion
```diff
@@ -4,4 +4,5 @@ model: microsoft/Phi-3-mini-4k-instruct
 recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
-scheme: kv_cache_default_phi3
+scheme: kv_cache_default_phi3
+gpu_memory_utilization: 0.8
```

tests/e2e/vLLM/test_vllm.py

Lines changed: 10 additions & 3 deletions
```diff
@@ -81,6 +81,8 @@ def set_up(self, test_data_file: str):
         self.save_compressed = eval_config.get("save_compressed", True)
         self.num_calibration_samples = eval_config.get("num_calibration_samples", 256)
         self.max_seq_length = eval_config.get("max_seq_length", 2048)
+        # GPU memory utilization - only set if explicitly provided in config
+        self.gpu_memory_utilization = eval_config.get("gpu_memory_utilization")
 
         if not self.save_dir:
             self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
@@ -195,11 +197,16 @@ def _run_vllm(self):
         import torch
 
         sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
+        llm_kwargs = {"model": self.save_dir}
+
         if "W4A16_2of4" in self.scheme:
             # required by the kernel
-            llm = LLM(model=self.save_dir, dtype=torch.float16)
-        else:
-            llm = LLM(model=self.save_dir)
+            llm_kwargs["dtype"] = torch.float16
+
+        if self.gpu_memory_utilization is not None:
+            llm_kwargs["gpu_memory_utilization"] = self.gpu_memory_utilization
+
+        llm = LLM(**llm_kwargs)
         outputs = llm.generate(self.prompts, sampling_params)
         return outputs
 
```
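As a usage note, here is a minimal standalone sketch of the pattern introduced above: build the LLM keyword arguments incrementally and pass gpu_memory_utilization only when a value was explicitly configured, so vLLM's own default applies otherwise. The model directory and the 0.8 value mirror the kv_cache_phi3 run; the `build_llm` helper and the prompt are illustrative and not part of the test code.

```python
from typing import Optional

import torch
from vllm import LLM, SamplingParams


def build_llm(save_dir: str, scheme: str,
              gpu_memory_utilization: Optional[float] = None) -> LLM:
    """Illustrative helper mirroring the test's kwargs-based LLM construction."""
    llm_kwargs = {"model": save_dir}
    if "W4A16_2of4" in scheme:
        # required by the kernel
        llm_kwargs["dtype"] = torch.float16
    if gpu_memory_utilization is not None:
        # Only override when explicitly configured; otherwise keep vLLM's default.
        llm_kwargs["gpu_memory_utilization"] = gpu_memory_utilization
    return LLM(**llm_kwargs)


# Mirrors the updated kv_cache_phi3 run: compressed model dir plus an explicit 0.8.
llm = build_llm("Phi-3-mini-4k-instruct-kv_cache_default_phi3",
                "kv_cache_default_phi3", gpu_memory_utilization=0.8)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.80, top_p=0.95))
```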