vllm-project · dsikka · Jun 18, 2025 · Jun 17, 2025 · Jun 17, 2025 · Jun 17, 2025
diff --git a/tests/e2e/vLLM/configs/kv_cache_phi3.yaml b/tests/e2e/vLLM/configs/kv_cache_phi3.yaml
@@ -4,4 +4,5 @@ model: microsoft/Phi-3-mini-4k-instruct
 recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
-scheme: kv_cache_default_phi3
+scheme: kv_cache_default_phi3
+gpu_memory_utilization: 0.8
diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
@@ -81,6 +81,8 @@ def set_up(self, test_data_file: str):
         self.save_compressed = eval_config.get("save_compressed", True)
         self.num_calibration_samples = eval_config.get("num_calibration_samples", 256)
         self.max_seq_length = eval_config.get("max_seq_length", 2048)
+        # Optional override; stays None when the config omits it (vLLM default applies)
+        self.gpu_memory_utilization = eval_config.get("gpu_memory_utilization")
 
         if not self.save_dir:
             self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
@@ -195,11 +197,16 @@ def _run_vllm(self):
         import torch
 
         sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
+        llm_kwargs = {"model": self.save_dir}
+
         if "W4A16_2of4" in self.scheme:
             # required by the kernel
-            llm = LLM(model=self.save_dir, dtype=torch.float16)
-        else:
-            llm = LLM(model=self.save_dir)
+            llm_kwargs["dtype"] = torch.float16
+
+        if self.gpu_memory_utilization is not None:
+            llm_kwargs["gpu_memory_utilization"] = self.gpu_memory_utilization
+
+        llm = LLM(**llm_kwargs)
         outputs = llm.generate(self.prompts, sampling_params)
         return outputs