Commit d643aef

[Perf] Improve Llama4 performance for small max_seqlen cases (#6306)
Signed-off-by: Yilin Fan <[email protected]>
1 parent bcf5ec0 commit d643aef

File tree

1 file changed: +5 -0

tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 5 additions & 0 deletions
@@ -74,6 +74,11 @@ def __init__(
         elif get_sm_version() <= 90 and model_config.spec_config is not None:
             # pre-Blackwell spec-dec kernel does not support
             attention_chunk_size = None
+        else:
+            # Disable chunked attention when max_seq_len is smaller than attention_chunk_size
+            # TODO: Remove this after all attention kernels in TRTLLM backend support chunked attention
+            if attention_chunk_size and model_config.max_seq_len and model_config.max_seq_len < attention_chunk_size:
+                attention_chunk_size = None
 
         super().__init__(
             hidden_size=config.hidden_size,
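
For context, the new guard can be read on its own: when max_seq_len never exceeds attention_chunk_size, every sequence fits inside a single chunk, so chunked attention only adds overhead, and the model falls back to the regular attention path by clearing attention_chunk_size. Below is a minimal standalone sketch of that decision covering only the two branches visible in this diff; the helper resolve_attention_chunk_size and the simplified ModelConfig are illustrative assumptions, not TensorRT-LLM APIs.

# A minimal sketch of the new guard in isolation. `resolve_attention_chunk_size`
# and the simplified `ModelConfig` are hypothetical stand-ins for illustration,
# not TensorRT-LLM APIs.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelConfig:
    max_seq_len: Optional[int] = None
    spec_config: Optional[object] = None


def resolve_attention_chunk_size(attention_chunk_size: Optional[int],
                                 model_config: ModelConfig,
                                 sm_version: int) -> Optional[int]:
    """Return the effective chunk size, or None to disable chunked attention."""
    if sm_version <= 90 and model_config.spec_config is not None:
        # Existing branch: pre-Blackwell spec-dec kernels fall back to regular attention.
        return None
    if (attention_chunk_size and model_config.max_seq_len
            and model_config.max_seq_len < attention_chunk_size):
        # New branch: every sequence fits inside one chunk, so chunking brings
        # no benefit and is disabled for small max_seq_len cases.
        return None
    return attention_chunk_size


# With max_seq_len below the chunk size, chunked attention is disabled.
assert resolve_attention_chunk_size(8192, ModelConfig(max_seq_len=4096), sm_version=100) is None
# With max_seq_len above the chunk size, the chunk size is kept.
assert resolve_attention_chunk_size(8192, ModelConfig(max_seq_len=16384), sm_version=100) == 8192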
