Skip to content

Commit c4efb0d

Browse files
qli88 authored and minpeter committed
[Bugfix] Enable PP with AITER+V1 (vllm-project#19822)
Signed-off-by: Qiang Li <[email protected]> Signed-off-by: minpeter <[email protected]>
1 parent 09a270f commit c4efb0d

File tree

2 files changed

+3
-11
lines changed

2 files changed

+3
-11
lines changed

vllm/model_executor/layers/layernorm.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ def fused_add_rms_norm(
4545

4646
def rocm_aiter_rms_norm(x: torch.Tensor, weight: torch.Tensor,
4747
variance_epsilon: float) -> torch.Tensor:
48-
4948
import aiter as rocm_aiter
5049
if x.dim() > 2:
5150
x_original_shape = x.shape

vllm/v1/attention/backends/mla/rocm_aiter_mla.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -201,16 +201,9 @@ def _forward_decode(
201201

202202
kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2)
203203

204-
if self.num_heads == 16:
205-
# AITER MLA decode kernel only supports
206-
# max_seqlen_q=1 when using 16 heads.
207-
max_seqlen_qo = 1
208-
else:
209-
# AITER MLA decode Kernel handles arbitrary
210-
# max_seqlen_q values when using 128 heads.
211-
assert attn_metadata.prefill is not None
212-
max_seqlen_qo = attn_metadata.prefill.max_query_len
213-
204+
# max_seqlen_qo must be 1 except for MTP
205+
# TODO: Find the best value for MTP
206+
max_seqlen_qo = 1
214207
aiter_mla_decode_fwd(q, kv_buffer, o, self.scale,
215208
attn_metadata.decode.qo_indptr, max_seqlen_qo,
216209
attn_metadata.decode.paged_kv_indptr,

0 commit comments

Comments (0)