
Commit 0f7e679

[BugFix] Raise error when max_model_len is larger than KV cache (vllm-project#2163)
1 parent 499beb9 commit 0f7e679

1 file changed: +8 -0 lines changed

vllm/engine/llm_engine.py

Lines changed: 8 additions & 0 deletions
@@ -227,6 +227,14 @@ def _init_cache(self) -> None:
             raise ValueError("No available memory for the cache blocks. "
                              "Try increasing `gpu_memory_utilization` when "
                              "initializing the engine.")
+        max_seq_len = self.cache_config.block_size * num_gpu_blocks
+        if self.model_config.max_model_len > max_seq_len:
+            raise ValueError(
+                f"The model's max seq len ({self.model_config.max_model_len}) "
+                "is larger than the maximum number of tokens that can be "
+                f"stored in KV cache ({max_seq_len}). Try increasing "
+                "`gpu_memory_utilization` or decreasing `max_model_len` when "
+                "initializing the engine.")
 
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
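
For context, the added check verifies that the configured `max_model_len` fits within the total token capacity of the allocated GPU KV cache (block size times number of GPU blocks). The sketch below walks through that arithmetic with assumed example values; the block size of 16 and the block count of 512 are illustrative numbers, not values taken from this commit.

# Standalone sketch of the capacity check this commit adds to _init_cache().
# All numeric values below are assumed examples, not values from the commit.
block_size = 16          # tokens stored per KV cache block
num_gpu_blocks = 512     # assumed number of blocks fitting the GPU memory budget
max_model_len = 16384    # assumed requested context length

# Maximum number of tokens a single sequence can keep in the GPU KV cache.
max_seq_len = block_size * num_gpu_blocks  # 16 * 512 = 8192

if max_model_len > max_seq_len:
    # 16384 > 8192, so engine initialization now fails fast with a clear
    # message instead of running out of cache blocks during generation.
    raise ValueError(
        f"The model's max seq len ({max_model_len}) is larger than the "
        f"maximum number of tokens that can be stored in KV cache "
        f"({max_seq_len}). Try increasing `gpu_memory_utilization` or "
        f"decreasing `max_model_len` when initializing the engine.")

With these example numbers, the remedy is either raising `gpu_memory_utilization` so more blocks are allocated or lowering `max_model_len` to at most 8192.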

0 commit comments
