 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                LinearBase,
                                                UnquantizedLinearMethod)
-from vllm.model_executor.layers.rotary_embedding import (
-    DeepseekScalingRotaryEmbedding, RotaryEmbedding)
 from vllm.multimodal import MultiModalPlaceholderMap
 from vllm.platforms import current_platform
 from vllm.triton_utils import HAS_TRITON
@@ -377,7 +375,6 @@ def graph_capture_get_metadata_for_batch(
             seq_start_loc=None,
             context_lens_tensor=None,
             block_tables=self._graph_block_tables[:batch_size],
-            input_positions=self._positions[:batch_size],
             head_dim=self.runner.model_config.get_head_size())

         if is_encoder_decoder_model:
@@ -393,7 +390,6 @@ def get_graph_input_buffers(self,
             "slot_mapping": attn_metadata.slot_mapping,
             "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
             "block_tables": attn_metadata.decode_metadata.block_tables,
-            "input_positions": attn_metadata.decode_metadata.input_positions,
         }
         if is_encoder_decoder_model:
             raise NotImplementedError(
@@ -405,16 +401,10 @@ def prepare_graph_input_buffers(self,
                                     input_buffers,
                                     attn_metadata,
                                     is_encoder_decoder_model: bool = False):
-        input_positions = attn_metadata.input_positions
-        num_positions = input_positions.shape[0]
         input_buffers["seq_lens_tensor"].copy_(
             attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True)
         input_buffers["block_tables"].copy_(
             attn_metadata.decode_metadata.block_tables, non_blocking=True)
-        # CUDA graph buffer is padded so only perform a partial copy based on
-        # num_positions
-        input_buffers["input_positions"][:num_positions].copy_(
-            input_positions, non_blocking=True)
         if is_encoder_decoder_model:
             raise NotImplementedError(
                 "TritonMLAState does not support encoder/decoder yet")
@@ -456,11 +446,6 @@ class MLACommonMetadata(AttentionMetadata):
     # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
     use_cuda_graph: bool

-    # New for MLA (compared to FlashAttention)
-    # Input positions for rotary embeddings since for MLA the rotary
-    # position embeddings are applied inside the attention backend
-    input_positions: torch.Tensor
-
     # NOTE(sang): Definition of context_len, query_len, and seq_len.
     # |---------- N-1 iteration --------|
     # |---------------- N iteration ---------------------|
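For context (not part of this patch): the removed field existed only because rotary position embeddings used to be applied inside the MLA attention backend; once q/k reach the backend already rotated, token positions no longer need to ride along in the attention metadata. A rough, self-contained sketch of caller-side rotation, assuming an interleaved-pair rotary layout (illustrative only, not vLLM's RotaryEmbedding):

import torch

def apply_rope(x: torch.Tensor, positions: torch.Tensor,
               base: float = 10000.0) -> torch.Tensor:
    # x: (num_tokens, num_heads, rope_dim); positions: (num_tokens,)
    rope_dim = x.shape[-1]
    inv_freq = 1.0 / (base ** (torch.arange(0, rope_dim, 2).float() / rope_dim))
    angles = positions.float()[:, None] * inv_freq[None, :]  # (T, rope_dim // 2)
    cos = angles.cos()[:, None, :]  # broadcast over the head dimension
    sin = angles.sin()[:, None, :]
    x1, x2 = x[..., 0::2], x[..., 1::2]
    out = torch.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x1 * sin + x2 * cos
    return out

# Toy usage: rotate the rope portions of q and k once, before the attention
# backend is invoked, so the backend itself never needs token positions.
q_pe = torch.randn(4, 2, 64)   # (tokens, heads, qk_rope_head_dim)
k_pe = torch.randn(4, 1, 64)
pos = torch.arange(4)
q_pe, k_pe = apply_rope(q_pe, pos), apply_rope(k_pe, pos)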
@@ -563,8 +548,6 @@ def prefill_metadata(self):
                 self.context_lens_tensor[:self.num_prefills])
             block_tables = (None if self.block_tables is None else
                             self.block_tables[:self.num_prefills])
-            input_positions = (None if self.input_positions is None else
-                               self.input_positions[:self.num_prefill_tokens])

             self._cached_prefill_metadata = self.__class__(
                 # Required by ModelRunner
@@ -578,7 +561,6 @@ def prefill_metadata(self):
                 multi_modal_placeholder_index_maps=None,
                 enable_kv_scales_calculation=False,
                 # MLACommonMetadata
-                input_positions=input_positions,
                 seq_lens=seq_lens,
                 seq_lens_tensor=seq_lens_tensor,
                 max_query_len=self.max_query_len,
@@ -615,8 +597,6 @@ def decode_metadata(self):
                 self.seq_lens_tensor[self.num_prefills:])
             block_tables = (None if self.block_tables is None else
                             self.block_tables[self.num_prefills:])
-            input_positions = (None if self.input_positions is None else
-                               self.input_positions[self.num_prefill_tokens:])

             self._cached_decode_metadata = self.__class__(
                 # Required by ModelRunner
@@ -646,7 +626,6 @@ def decode_metadata(self):
                 if self.seq_start_loc is not None else None,
                 context_lens_tensor=None,
                 block_tables=block_tables,
-                input_positions=input_positions,
                 head_dim=self.head_dim,
                 is_profile_run=self.is_profile_run)
         return self._cached_decode_metadata
@@ -765,7 +744,6 @@ def prepare(self):
         self.context_lens: List[int] = []
         self.block_tables: List[List[int]] = []
         self.curr_seq_lens: List[int] = []
-        self.input_positions: List[int] = []
         self.multimodal_placeholder_maps: Dict[
             str,
             MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
@@ -786,13 +764,11 @@ def _add_seq_group(
         block_tables = inter_data.block_tables

         for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
-             curr_sliding_window_block, input_positions) in zip(
+             curr_sliding_window_block) in zip(
                 inter_data.seq_ids, [len(t) for t in inter_data.input_tokens],
                 inter_data.orig_seq_lens, inter_data.seq_lens,
                 inter_data.query_lens, inter_data.context_lens,
-                inter_data.curr_sliding_window_blocks,
-                inter_data.input_positions):
-            self.input_positions.extend(input_positions)
+                inter_data.curr_sliding_window_blocks):
             self.context_lens.append(context_len)
             if is_prompt:
                 self.num_prefills += 1
@@ -912,8 +888,6 @@ def build(self, seq_lens: List[int], query_lens: List[int],
                                                device, self.runner.pin_memory)
         seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
                                            self.runner.pin_memory)
-        input_positions = async_tensor_h2d(self.input_positions, torch.long,
-                                           device, self.runner.pin_memory)
         slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long,
                                                device, self.runner.pin_memory)
         query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32,
@@ -987,7 +961,6 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             multi_modal_placeholder_index_maps=None,  # Not Attention Related
             enable_kv_scales_calculation=False,
             # MLACommonMetadata
-            input_positions=input_positions,
             seq_lens=seq_lens,
             seq_lens_tensor=seq_lens_tensor,
             max_query_len=max_query_len,
@@ -1033,7 +1006,6 @@ def __init__(
             qk_rope_head_dim: int,
             qk_head_dim: int,
             v_head_dim: int,
-            rotary_emb: RotaryEmbedding,
             kv_b_proj: ColumnParallelLinear,
     ) -> None:
         self.num_heads = num_heads
@@ -1048,10 +1020,6 @@ def __init__(
         self.qk_rope_head_dim = qk_rope_head_dim
         self.qk_head_dim = qk_head_dim
         self.v_head_dim = v_head_dim
-
-        self.rotary_emb = rotary_emb
-        self.use_yarn_rope = isinstance(rotary_emb,
-                                        DeepseekScalingRotaryEmbedding)
         self.kv_b_proj = kv_b_proj

         self.triton_fa_func = triton_attention
@@ -1367,41 +1335,15 @@ def forward(
         has_decode = attn_metadata.decode_metadata is not None
         has_prefill = attn_metadata.prefill_metadata is not None

-        # Restore head dim (for rotary embedding)
-        k_pe = k_pe.unsqueeze(1)
-        assert hasattr(attn_metadata, "input_positions")
-
         num_prefill_tokens: int = attn_metadata.num_prefill_tokens
         q = q.view(-1, self.num_heads, self.qk_head_dim)

         decode_q = q[num_prefill_tokens:]
-        decode_k_pe = k_pe[num_prefill_tokens:]
-        decode_input_positions = \
-            attn_metadata.input_positions[num_prefill_tokens:]

         prefill_q = q[:num_prefill_tokens]
         prefill_k_pe = k_pe[:num_prefill_tokens]
-        prefill_input_positions = \
-            attn_metadata.input_positions[:num_prefill_tokens]
         prefill_k_c_normed = k_c_normed[:num_prefill_tokens]

-        if has_decode:
-            decode_q_nope, decode_q_pe = decode_q.split(
-                [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
-            # Convert from (B, N, P) to (N, B, P)
-            decode_q_nope = decode_q_nope.transpose(0, 1)
-            # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
-            decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T)
-            # Convert from (N, B, L) to (B, N, L)
-            decode_ql_nope = decode_ql_nope.transpose(0, 1)
-            decode_q_pe[...], decode_k_pe[...] = self.rotary_emb(
-                decode_input_positions, decode_q_pe, decode_k_pe)
-
-        if has_prefill:
-            prefill_q_pe = prefill_q[..., self.qk_nope_head_dim:]
-            prefill_q_pe[...], prefill_k_pe[...] = self.rotary_emb(
-                prefill_input_positions, prefill_q_pe, prefill_k_pe)
-
         # write the latent and rope to kv cache
         if kv_cache.numel() > 0:
             ops.concat_and_cache_mla(
@@ -1424,6 +1366,15 @@ def forward(
                 attn_metadata)

         if has_decode:
+            decode_q_nope, decode_q_pe = decode_q.split(
+                [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+            # Convert from (B, N, P) to (N, B, P)
+            decode_q_nope = decode_q_nope.transpose(0, 1)
+            # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
+            decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T)
+            # Convert from (N, B, L) to (B, N, L)
+            decode_ql_nope = decode_ql_nope.transpose(0, 1)
+
             output[num_prefill_tokens:] = self._forward_decode(
                 decode_ql_nope, decode_q_pe, kv_cache, attn_metadata)
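For context (not part of this patch): the relocated decode block performs the per-head weight-absorption projection, mapping q_nope from (B, N, P) to (B, N, L) with a batched matmul against W_UK_T of shape (N, P, L). A shape-only sketch with toy sizes (all names and dimensions illustrative):

import torch

# Toy sizes: B decode tokens, N heads, P = qk_nope_head_dim, L = kv_lora_rank.
B, N, P, L = 4, 2, 16, 8
decode_q_nope = torch.randn(B, N, P)
W_UK_T = torch.randn(N, P, L)               # one absorbed projection matrix per head

q_nope_t = decode_q_nope.transpose(0, 1)    # (B, N, P) -> (N, B, P)
ql_nope_t = torch.bmm(q_nope_t, W_UK_T)     # (N, B, P) x (N, P, L) -> (N, B, L)
decode_ql_nope = ql_nope_t.transpose(0, 1)  # (N, B, L) -> (B, N, L), token-major again
assert decode_ql_nope.shape == (B, N, L)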