1 parent cf4f4e8 commit 8358e01
tensorrt_llm/_torch/models/modeling_llama.py
@@ -69,6 +69,11 @@ def __init__(
             # This is safe to do because we limit seqlen to 8k for
             # non TRTLLM backends.
             attention_chunk_size = None
+        else:
+            # Disable chunked attention when max_num_tokens is smaller than attention_chunk_size
+            # TODO: Remove this after all attention kernels in TRTLLM backend support chunked attention
+            if attention_chunk_size and model_config.max_num_tokens < attention_chunk_size:
+                attention_chunk_size = None
 
         super().__init__(
             hidden_size=config.hidden_size,
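
For context, the guard this commit adds can be sketched as a standalone function. This is a minimal illustration, not the actual TensorRT-LLM API: the helper name resolve_attention_chunk_size and the is_trtllm_backend flag are hypothetical, while the two conditions mirror the diff above.

from typing import Optional

def resolve_attention_chunk_size(
    attention_chunk_size: Optional[int],
    max_num_tokens: int,
    is_trtllm_backend: bool,
) -> Optional[int]:
    # Hypothetical helper mirroring the logic in __init__ in modeling_llama.py.
    if not is_trtllm_backend:
        # Safe to disable: seqlen is limited to 8k for non-TRTLLM backends.
        return None
    # Disable chunked attention when max_num_tokens is smaller than
    # attention_chunk_size (per the commit's TODO, this is a stopgap until
    # all attention kernels in the TRTLLM backend support chunked attention).
    if attention_chunk_size and max_num_tokens < attention_chunk_size:
        return None
    return attention_chunk_size

# With a 4096-token runtime budget, an 8192-token attention chunk can never
# fill, so chunked attention is turned off; a 16384-token budget keeps it.
assert resolve_attention_chunk_size(8192, 4096, is_trtllm_backend=True) is None
assert resolve_attention_chunk_size(8192, 16384, is_trtllm_backend=True) == 8192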