Commit d643aef (1 parent: bcf5ec0)
tensorrt_llm/_torch/models/modeling_llama.py
@@ -74,6 +74,11 @@ def __init__(
         elif get_sm_version() <= 90 and model_config.spec_config is not None:
             # pre-Blackwell spec-dec kernel does not support chunked attention
             attention_chunk_size = None
+        else:
+            # Disable chunked attention when max_seq_len is smaller than attention_chunk_size
+            # TODO: Remove this after all attention kernels in TRTLLM backend support chunked attention
+            if attention_chunk_size and model_config.max_seq_len and model_config.max_seq_len < attention_chunk_size:
+                attention_chunk_size = None

         super().__init__(
             hidden_size=config.hidden_size,
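
For context, here is a minimal runnable sketch of the fallback logic this commit adds: chunked attention is skipped both on pre-Blackwell spec-dec paths and whenever the maximum sequence length already fits in a single chunk. The names ModelConfig, get_sm_version, and resolve_attention_chunk_size are hypothetical stand-ins for the TensorRT-LLM internals referenced by the diff, not the actual API.

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class ModelConfig:
        # Stand-ins for the fields the diff reads from model_config.
        max_seq_len: Optional[int] = None
        spec_config: Optional[object] = None

    def get_sm_version() -> int:
        # Stand-in; the real helper queries the GPU's SM (compute) version.
        return 90

    def resolve_attention_chunk_size(
            attention_chunk_size: Optional[int],
            model_config: ModelConfig) -> Optional[int]:
        if get_sm_version() <= 90 and model_config.spec_config is not None:
            # pre-Blackwell spec-dec kernels do not support chunked attention
            return None
        # New guard from this commit: if every sequence fits in one chunk,
        # chunking buys nothing, so fall back to unchunked attention.
        if (attention_chunk_size and model_config.max_seq_len
                and model_config.max_seq_len < attention_chunk_size):
            return None
        return attention_chunk_size

    # A 4k max_seq_len with an 8k chunk size disables chunking;
    # a 16k max_seq_len keeps it.
    assert resolve_attention_chunk_size(8192, ModelConfig(max_seq_len=4096)) is None
    assert resolve_attention_chunk_size(8192, ModelConfig(max_seq_len=16384)) == 8192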