
Commit 8358e01

[Perf] Improve Llama4 performance for small max_seqlen cases
Signed-off-by: Yilin Fan <[email protected]>
1 parent: cf4f4e8 · commit: 8358e01

File tree

1 file changed: +5, -0 lines changed


tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 5 additions & 0 deletions
@@ -69,6 +69,11 @@ def __init__(
             # This is safe to do because we limit seqlen to 8k for
             # non TRTLLM backends.
             attention_chunk_size = None
+        else:
+            # Disable chunked attention when max_num_tokens is smaller than attention_chunk_size
+            # TODO: Remove this after all attention kernels in TRTLLM backend support chunked attention
+            if attention_chunk_size and model_config.max_num_tokens < attention_chunk_size:
+                attention_chunk_size = None

         super().__init__(
             hidden_size=config.hidden_size,
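
For context, a minimal sketch of the gating logic this commit adds. The helper name `resolve_attention_chunk_size`, the `_ModelConfig` stand-in, and the `is_trtllm_backend` flag are illustrative assumptions, not the actual TensorRT-LLM classes or signatures; only the condition on `max_num_tokens` and `attention_chunk_size` comes from the diff itself.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class _ModelConfig:
    # Illustrative stand-in for the real TensorRT-LLM ModelConfig;
    # only the field used by the diff is modeled here.
    max_num_tokens: int


def resolve_attention_chunk_size(
    attention_chunk_size: Optional[int],
    model_config: _ModelConfig,
    is_trtllm_backend: bool,
) -> Optional[int]:
    """Sketch of the diff's decision: keep or drop chunked attention."""
    if not is_trtllm_backend:
        # Safe because seqlen is limited to 8k for non-TRTLLM backends.
        return None
    # New branch from this commit: disable chunked attention when
    # max_num_tokens is smaller than attention_chunk_size.
    if attention_chunk_size and model_config.max_num_tokens < attention_chunk_size:
        return None
    return attention_chunk_size


# Example: a small-max_seqlen run where every sequence fits in one chunk.
cfg = _ModelConfig(max_num_tokens=2048)
print(resolve_attention_chunk_size(8192, cfg, is_trtllm_backend=True))  # None
```

The intent, per the commit title, is a performance win for small max_seqlen cases: when `max_num_tokens` is already below the chunk size, chunked attention cannot kick in usefully, so passing `attention_chunk_size = None` lets the TRTLLM backend take the plain attention path.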
