Commit d643aef

[Perf] Improve Llama4 performance for small max_seqlen cases (#6306)
Signed-off-by: Yilin Fan <[email protected]>
1 parent bcf5ec0 commit d643aef

File tree

1 file changed: +5 -0

tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 5 additions & 0 deletions
@@ -74,6 +74,11 @@ def __init__(
         elif get_sm_version() <= 90 and model_config.spec_config is not None:
             # pre-Blackwell spec-dec kernel does not support
             attention_chunk_size = None
+        else:
+            # Disable chunked attention when max_seq_len is smaller than attention_chunk_size
+            # TODO: Remove this after all attention kernels in TRTLLM backend support chunked attention
+            if attention_chunk_size and model_config.max_seq_len and model_config.max_seq_len < attention_chunk_size:
+                attention_chunk_size = None
 
         super().__init__(
             hidden_size=config.hidden_size,
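
For context, the new guard can be read on its own: when max_seq_len never exceeds attention_chunk_size, every sequence fits inside a single chunk, so chunked attention only adds overhead, and the model falls back to the regular attention path by clearing attention_chunk_size. Below is a minimal standalone sketch of that decision covering only the two branches visible in this diff; the helper resolve_attention_chunk_size and the simplified ModelConfig are illustrative assumptions, not TensorRT-LLM APIs.

# A minimal sketch of the new guard in isolation. `resolve_attention_chunk_size`
# and the simplified `ModelConfig` are hypothetical stand-ins for illustration,
# not TensorRT-LLM APIs.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelConfig:
    max_seq_len: Optional[int] = None
    spec_config: Optional[object] = None


def resolve_attention_chunk_size(attention_chunk_size: Optional[int],
                                 model_config: ModelConfig,
                                 sm_version: int) -> Optional[int]:
    """Return the effective chunk size, or None to disable chunked attention."""
    if sm_version <= 90 and model_config.spec_config is not None:
        # Existing branch: pre-Blackwell spec-dec kernels fall back to regular attention.
        return None
    if (attention_chunk_size and model_config.max_seq_len
            and model_config.max_seq_len < attention_chunk_size):
        # New branch: every sequence fits inside one chunk, so chunking brings
        # no benefit and is disabled for small max_seq_len cases.
        return None
    return attention_chunk_size


# With max_seq_len below the chunk size, chunked attention is disabled.
assert resolve_attention_chunk_size(8192, ModelConfig(max_seq_len=4096), sm_version=100) is None
# With max_seq_len above the chunk size, the chunk size is kept.
assert resolve_attention_chunk_size(8192, ModelConfig(max_seq_len=16384), sm_version=100) == 8192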
