2 files changed (+12, -4).

File 1 (YAML test config):

@@ -4,4 +4,5 @@ model: microsoft/Phi-3-mini-4k-instruct
 recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
-scheme: kv_cache_default_phi3
+scheme: kv_cache_default_phi3
+gpu_memory_utilization: 0.8
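In vLLM, gpu_memory_utilization caps the fraction of GPU memory the engine may reserve; when the argument is omitted, vLLM falls back to its built-in default of 0.9, so pinning it to 0.8 here presumably leaves headroom on shared test GPUs. (The scheme line being removed and re-added unchanged is typically how a diff renders adding a trailing newline before the new key.) A minimal sketch of the harness-side read, assuming the config is plain YAML; the file path is illustrative:

# Hedged sketch, not part of the diff: load the test config and read the
# new key the same way the harness does below.
import yaml

with open("kv_cache_test_config.yaml") as f:  # illustrative path
    eval_config = yaml.safe_load(f)

# dict.get with no default returns None when the key is absent, so an
# unset key later falls through to vLLM's built-in default (0.9).
gpu_memory_utilization = eval_config.get("gpu_memory_utilization")
print(gpu_memory_utilization)  # 0.8 for the config above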
File 2 (Python test harness):

@@ -81,6 +81,8 @@ def set_up(self, test_data_file: str):
         self.save_compressed = eval_config.get("save_compressed", True)
         self.num_calibration_samples = eval_config.get("num_calibration_samples", 256)
         self.max_seq_length = eval_config.get("max_seq_length", 2048)
+        # GPU memory utilization - only set if explicitly provided in config
+        self.gpu_memory_utilization = eval_config.get("gpu_memory_utilization")

         if not self.save_dir:
             self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
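Note that the new eval_config.get("gpu_memory_utilization") call passes no default, so self.gpu_memory_utilization is None whenever the config omits the key; _run_vllm below treats None as "not configured" and never forwards the argument in that case.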
@@ -195,11 +197,16 @@ def _run_vllm(self):
         import torch

         sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
+        llm_kwargs = {"model": self.save_dir}
+
         if "W4A16_2of4" in self.scheme:
             # required by the kernel
-            llm = LLM(model=self.save_dir, dtype=torch.float16)
-        else:
-            llm = LLM(model=self.save_dir)
+            llm_kwargs["dtype"] = torch.float16
+
+        if self.gpu_memory_utilization is not None:
+            llm_kwargs["gpu_memory_utilization"] = self.gpu_memory_utilization
+
+        llm = LLM(**llm_kwargs)

         outputs = llm.generate(self.prompts, sampling_params)
         return outputs
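The refactor replaces the duplicated LLM(...) constructor calls with a single kwargs dict, so each optional argument is set exactly once and only when it is actually needed. A self-contained sketch of the same pattern, assuming vllm and torch are installed; build_llm is a hypothetical helper, not part of the PR:

# Standalone sketch of the kwargs-building pattern used in _run_vllm above.
from typing import Any, Dict, Optional

import torch
from vllm import LLM

def build_llm(
    save_dir: str,
    scheme: str,
    gpu_memory_utilization: Optional[float] = None,
) -> LLM:
    llm_kwargs: Dict[str, Any] = {"model": save_dir}
    if "W4A16_2of4" in scheme:
        # required by the kernel (per the original comment)
        llm_kwargs["dtype"] = torch.float16
    if gpu_memory_utilization is not None:
        # forward the cap only when configured; otherwise vLLM's
        # built-in default applies
        llm_kwargs["gpu_memory_utilization"] = gpu_memory_utilization
    return LLM(**llm_kwargs)

One benefit of the None sentinel over .get("gpu_memory_utilization", 0.9): the test never has to hard-code or track vLLM's default; an unset key simply means "let vLLM decide".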