
Commit 8358e01

[Perf] Improve Llama4 performance for small max_seqlen cases
Signed-off-by: Yilin Fan <[email protected]>
1 parent: cf4f4e8 · commit: 8358e01

File tree

1 file changed: +5, -0 lines changed


tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 5 additions & 0 deletions
@@ -69,6 +69,11 @@ def __init__(
             # This is safe to do because we limit seqlen to 8k for
             # non TRTLLM backends.
             attention_chunk_size = None
+        else:
+            # Disable chunked attention when max_num_tokens is smaller than attention_chunk_size
+            # TODO: Remove this after all attention kernels in TRTLLM backend support chunked attention
+            if attention_chunk_size and model_config.max_num_tokens < attention_chunk_size:
+                attention_chunk_size = None

         super().__init__(
             hidden_size=config.hidden_size,
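
For context, a minimal sketch of the gating logic this commit adds. The helper name `resolve_attention_chunk_size`, the `_ModelConfig` stand-in, and the `is_trtllm_backend` flag are illustrative assumptions, not the actual TensorRT-LLM classes or signatures; only the condition on `max_num_tokens` and `attention_chunk_size` comes from the diff itself.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class _ModelConfig:
    # Illustrative stand-in for the real TensorRT-LLM ModelConfig;
    # only the field used by the diff is modeled here.
    max_num_tokens: int


def resolve_attention_chunk_size(
    attention_chunk_size: Optional[int],
    model_config: _ModelConfig,
    is_trtllm_backend: bool,
) -> Optional[int]:
    """Sketch of the diff's decision: keep or drop chunked attention."""
    if not is_trtllm_backend:
        # Safe because seqlen is limited to 8k for non-TRTLLM backends.
        return None
    # New branch from this commit: disable chunked attention when
    # max_num_tokens is smaller than attention_chunk_size.
    if attention_chunk_size and model_config.max_num_tokens < attention_chunk_size:
        return None
    return attention_chunk_size


# Example: a small-max_seqlen run where every sequence fits in one chunk.
cfg = _ModelConfig(max_num_tokens=2048)
print(resolve_attention_chunk_size(8192, cfg, is_trtllm_backend=True))  # None
```

The intent, per the commit title, is a performance win for small max_seqlen cases: when `max_num_tokens` is already below the chunk size, chunked attention cannot kick in usefully, so passing `attention_chunk_size = None` lets the TRTLLM backend take the plain attention path.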
