Commit 001ac2e

Author: Varun Sundar Rabindranath (committed)

refactor
Signed-off-by: Varun Sundar Rabindranath <[email protected]>
1 parent 9296688 commit 001ac2e

9 files changed: +43, -49 lines changed

vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py

Lines changed: 3 additions & 1 deletion
@@ -47,10 +47,12 @@ def workspace_shapes(
         N: int,
         K: int,
         topk: int,
-        num_experts: int,
+        global_num_experts: int,
+        local_num_experts: int,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
         assert a.dim() == 2
         num_dp = self.dp_size
+        num_experts = local_num_experts
         max_num_tokens = a.size(
             0) if self.max_num_tokens is None else self.max_num_tokens
         workspace13 = (num_experts, max_num_tokens * num_dp, max(K, N))
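
To make the sizing concrete, here is a minimal standalone sketch of the batched workspace computation above. The helper name and the expert/token counts are illustrative only, not part of the commit:

    def batched_workspace13(local_num_experts: int, max_num_tokens: int,
                            num_dp: int, K: int, N: int) -> tuple[int, ...]:
        # Batched layouts materialize one token slab per *local* expert,
        # so the leading dim only needs local_num_experts rows.
        return (local_num_experts, max_num_tokens * num_dp, max(K, N))

    # e.g. 8 local experts (of 64 global), 256 tokens per rank, dp_size 2:
    assert batched_workspace13(8, 256, 2, K=4096, N=14336) == (8, 512, 14336)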

vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

Lines changed: 4 additions & 3 deletions
@@ -81,18 +81,19 @@ def workspace_shapes(
         N: int,
         K: int,
         topk: int,
-        num_experts: int,
+        global_num_experts: int,
+        local_num_experts: int,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
         if self.allow_deep_gemm and self.batched_deep_gemm_experts is not None:
             return self.batched_deep_gemm_experts.workspace_shapes(
-                a, aq, M, N, K, topk, num_experts)
+                a, aq, M, N, K, topk, global_num_experts, local_num_experts)
         else:
             assert self.batched_triton_experts is not None
             return self.batched_triton_experts.workspace_shapes(
-                a, aq, M, N, K, topk, num_experts)
+                a, aq, M, N, K, topk, global_num_experts, local_num_experts)

     def apply(
         self,
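
The "pessimistic" note above rests on one invariant: the DeepGemm buffers must hold at least as many elements as the Triton fallback would ever request. A tiny hedged check of that property (the shapes are made up for illustration):

    import math

    def covers(pessimistic: tuple[int, ...], actual: tuple[int, ...]) -> bool:
        # Allocating the larger workspace is safe whenever its element
        # count dominates what the fallback path would need.
        return math.prod(pessimistic) >= math.prod(actual)

    assert covers((8, 512, 14336), (8, 512, 4096))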

vllm/model_executor/layers/fused_moe/cutlass_moe.py

Lines changed: 2 additions & 1 deletion
@@ -230,7 +230,8 @@ def workspace_shapes(
         N: int,
         K: int,
         topk: int,
-        num_experts: int,
+        global_num_experts: int,
+        local_num_experts: int,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
         workspace1: tuple[int, ...] = ()
         workspace2: tuple[int, ...] = ()

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 5 additions & 8 deletions
@@ -74,15 +74,12 @@ def supports_chunking(self) -> bool:
         return True

     def workspace_shapes(
-        self,
-        a: torch.Tensor,
-        aq: torch.Tensor,
-        M: int,
-        N: int,
-        K: int,
-        topk: int,
-        num_experts: int,
+        self, a: torch.Tensor, aq: torch.Tensor, M: int, N: int, K: int,
+        topk: int, global_num_experts: int, local_num_experts: int
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        # We use global_num_experts due to how moe_align_block_size handles
+        # expert_maps.
+        num_experts = global_num_experts
         block_m = self.block_shape[0]
         M_sum = (M * topk) + num_experts * (block_m - 1)
         M_sum = round_up(M_sum, block_m)
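
The cost of keying this off global_num_experts is easy to see with a quick worked example (numbers chosen purely for illustration):

    def round_up(x: int, m: int) -> int:
        # Round x up to the nearest multiple of m.
        return ((x + m - 1) // m) * m

    M, topk, block_m = 128, 6, 128
    for num_experts in (16, 256):   # a local vs. a global expert count
        M_sum = (M * topk) + num_experts * (block_m - 1)
        print(num_experts, round_up(M_sum, block_m))
    # 16  -> 2816   rows, if the local count could be used
    # 256 -> 33280  rows, what is actually reserved with global_num_experts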

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 6 additions & 2 deletions
@@ -521,10 +521,12 @@ def workspace_shapes(
         N: int,
         K: int,
         topk: int,
-        num_experts: int,
+        global_num_experts: int,
+        local_num_experts: int,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
         assert a.dim() == 2
         num_dp = self.dp_size
+        num_experts = local_num_experts
         workspace13 = (num_experts, self.max_num_tokens * num_dp, K)
         workspace2 = (self.max_num_tokens * num_dp, N)
         return (workspace13, workspace2, workspace13, a.dtype)
@@ -624,10 +626,12 @@ def workspace_shapes(
         N: int,
         K: int,
         topk: int,
-        num_experts: int,
+        global_num_experts: int,
+        local_num_experts: int,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
         assert a.dim() == 2
         num_dp = self.world_size // self.dp_size
+        num_experts = local_num_experts
         max_num_tokens = a.size(
             0) if self.max_num_tokens is None else self.max_num_tokens
         workspace13 = (num_experts, max_num_tokens * num_dp, max(K, N))

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 2 additions & 1 deletion
@@ -1553,7 +1553,8 @@ def workspace_shapes(
         N: int,
         K: int,
         topk: int,
-        num_experts: int,
+        global_num_experts: int,
+        local_num_experts: int,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
         workspace1 = (M, topk, max(N * 2, K))
         workspace2 = (M, topk, N)

vllm/model_executor/layers/fused_moe/modular_kernel.py

Lines changed: 10 additions & 30 deletions
@@ -194,7 +194,8 @@ def workspace_shapes(
         N: int,
         K: int,
         topk: int,
-        num_experts: int,
+        global_num_experts: int,
+        local_num_experts: int,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
         """
         Compute the shapes for the temporary and final outputs of the two gemms
@@ -372,8 +373,9 @@ def forward(
         a1 = hidden_states
         output = a1 if inplace else torch.zeros_like(a1)

+        local_num_experts = w1.size(0)
         if global_num_experts == -1:
-            global_num_experts = w1.size(0)
+            global_num_experts = local_num_experts

         (a1q, a1q_scale, expert_num_tokens, _expert_topk_ids,
          _expert_topk_weights) = self.prepare_finalize.prepare(
@@ -405,44 +407,22 @@ def forward(
         CHUNK_SIZE = M
         num_chunks = 1

-        # Batched experts don't support chunking at this level as the
-        # chunking had already happened at an higher level - in
-        # fused_moe/layer.py
-        is_batched_fused_experts = not self.fused_experts.supports_chunking(
-        )
-
-        # TODO (varun): In the case of a non-batched fused_experts
-        # implementation the input tokens are usually aligned to a
-        # "block-size" by moe_align_block_size. In the case of
-        # expert_parallel, moe_align_block_size initially considers all
-        # experts as valid and aligns all tokens appropriately. Before
-        # moe_align_block_size returns it marks the experts_ids that are
-        # not in the current GPU rank as -1 so the MoE matmuls could skip
-        # those blocks. This is sub-optimal.
-        # Due to how moe_align_block_size is implemented at the
-        # moment, it is required that we use `global_num_experts` in the
-        # workspace calculations. However for the batched case, we don't
-        # use `moe_align_block_size`, as the input is already aligned
-        # (batched). This lets us use `local_num_experts`, which is
-        # much lesser than global_num_experts, in the workspace
-        # calculation.
-        num_experts_workspace = w1.size(
-            0) if is_batched_fused_experts else global_num_experts
-
         if num_chunks == 1:
             (workspace13_shape, workspace2_shape, fused_out_shape,
              workspace_dtype) = self.fused_experts.workspace_shapes(
-                a1, a1q, M, N, K, top_k, num_experts_workspace)
+                a1, a1q, M, N, K, top_k, global_num_experts,
+                local_num_experts)
         else:
             # Use the full M to get the final output shape.
             _, _, fused_out_shape, _ = (
                 self.fused_experts.workspace_shapes(
-                    a1, a1q, M, N, K, top_k, num_experts_workspace))
+                    a1, a1q, M, N, K, top_k, global_num_experts,
+                    local_num_experts))
             # Use the CHUNK_SIZE to get the workspace shapes.
             workspace13_shape, workspace2_shape, _, workspace_dtype = (
                 self.fused_experts.workspace_shapes(
-                    a1, a1q, CHUNK_SIZE, N, K, top_k,
-                    num_experts_workspace))
+                    a1, a1q, CHUNK_SIZE, N, K, top_k, global_num_experts,
+                    local_num_experts))

         # We can reuse the memory between cache1 and cache3 because by the
         # time we need cache3, we're done with cache1.
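
A minimal sketch of the counting logic the second hunk above introduces; the helper function and the tensor sizes are illustrative, not vLLM code:

    import torch

    def resolve_expert_counts(w1: torch.Tensor,
                              global_num_experts: int) -> tuple[int, int]:
        # w1 holds only this rank's expert weights, so its leading dim is
        # the local expert count; -1 signals "no expert parallelism", in
        # which case the global count collapses to the local one.
        local_num_experts = w1.size(0)
        if global_num_experts == -1:
            global_num_experts = local_num_experts
        return global_num_experts, local_num_experts

    w1 = torch.empty(8, 32, 16)           # 8 experts resident on this rank
    print(resolve_expert_counts(w1, 64))  # EP enabled:  (64, 8)
    print(resolve_expert_counts(w1, -1))  # EP disabled: (8, 8)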

vllm/model_executor/layers/fused_moe/moe_align_block_size.py

Lines changed: 6 additions & 0 deletions
@@ -159,6 +159,12 @@ def moe_align_block_size(
     Aligns the token distribution across experts to be compatible with block
     size for matrix multiplication.

+    Note: In the case of expert_parallel, moe_align_block_size initially
+    considers all experts as valid and aligns all tokens appropriately.
+    Before the function returns, it marks the expert_ids that are not on
+    the current GPU rank as -1 so the MoE matmuls can skip those blocks.
+    This requires the num_experts arg to be the global number of experts.
+
     Parameters:
     - topk_ids: A tensor of shape [total_tokens, top_k] representing the
       top-k expert indices for each token.
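
The masking described in the new docstring note can be sketched in a few lines. This is a hypothetical post-processing step written for illustration, not the actual kernel; it assumes an expert_map that maps a global expert id to its local id, with -1 for experts not resident on this rank:

    import torch

    def mask_remote_expert_blocks(expert_ids: torch.Tensor,
                                  expert_map: torch.Tensor) -> torch.Tensor:
        # Blocks assigned to experts that live on other ranks are marked
        # -1 so the MoE matmuls skip them.
        return torch.where(expert_map[expert_ids] == -1,
                           torch.full_like(expert_ids, -1),
                           expert_ids)

    expert_map = torch.tensor([-1, -1, 0, 1])  # rank owns global experts 2, 3
    expert_ids = torch.tensor([0, 2, 3, 1])    # one expert id per aligned block
    print(mask_remote_expert_blocks(expert_ids, expert_map))
    # tensor([-1,  2,  3, -1])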

vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py

Lines changed: 5 additions & 3 deletions
@@ -48,18 +48,20 @@ def workspace_shapes(
         N: int,
         K: int,
         topk: int,
-        num_experts: int,
+        global_num_experts: int,
+        local_num_experts: int,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
         if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K):
             assert self.deep_gemm_expert is not None
             return self.deep_gemm_expert.workspace_shapes(
-                a, aq, M, N, K, topk, num_experts)
+                a, aq, M, N, K, topk, global_num_experts, local_num_experts)
         else:
             return self.triton_expert.workspace_shapes(a, aq, M, N, K, topk,
-                                                       num_experts)
+                                                       global_num_experts,
+                                                       local_num_experts)

     def apply(
         self,
