
Commit 0f7e679

[BugFix] Raise error when max_model_len is larger than KV cache (vllm-project#2163)
1 parent 499beb9 commit 0f7e679

1 file changed: +8 -0 lines changed

vllm/engine/llm_engine.py

Lines changed: 8 additions & 0 deletions
@@ -227,6 +227,14 @@ def _init_cache(self) -> None:
             raise ValueError("No available memory for the cache blocks. "
                              "Try increasing `gpu_memory_utilization` when "
                              "initializing the engine.")
+        max_seq_len = self.cache_config.block_size * num_gpu_blocks
+        if self.model_config.max_model_len > max_seq_len:
+            raise ValueError(
+                f"The model's max seq len ({self.model_config.max_model_len}) "
+                "is larger than the maximum number of tokens that can be "
+                f"stored in KV cache ({max_seq_len}). Try increasing "
+                "`gpu_memory_utilization` or decreasing `max_model_len` when "
+                "initializing the engine.")
 
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
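
For context, the added check verifies that the configured `max_model_len` fits within the total token capacity of the allocated GPU KV cache (block size times number of GPU blocks). The sketch below walks through that arithmetic with assumed example values; the block size of 16 and the block count of 512 are illustrative numbers, not values taken from this commit.

# Standalone sketch of the capacity check this commit adds to _init_cache().
# All numeric values below are assumed examples, not values from the commit.
block_size = 16          # tokens stored per KV cache block
num_gpu_blocks = 512     # assumed number of blocks fitting the GPU memory budget
max_model_len = 16384    # assumed requested context length

# Maximum number of tokens a single sequence can keep in the GPU KV cache.
max_seq_len = block_size * num_gpu_blocks  # 16 * 512 = 8192

if max_model_len > max_seq_len:
    # 16384 > 8192, so engine initialization now fails fast with a clear
    # message instead of running out of cache blocks during generation.
    raise ValueError(
        f"The model's max seq len ({max_model_len}) is larger than the "
        f"maximum number of tokens that can be stored in KV cache "
        f"({max_seq_len}). Try increasing `gpu_memory_utilization` or "
        f"decreasing `max_model_len` when initializing the engine.")

With these example numbers, the remedy is either raising `gpu_memory_utilization` so more blocks are allocated or lowering `max_model_len` to at most 8192.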

0 commit comments
