@@ -1,16 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import dataclasses
+from math import prod
 from typing import Optional

 import pytest
 import torch

 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
+from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+    cutlass_moe_fp8, run_cutlass_moe_fp8)
 from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts,
                                                             fused_topk)
+from vllm.model_executor.layers.fused_moe.utils import (
+    moe_kernel_quantize_input)
 from vllm.platforms import current_platform

 NUM_EXPERTS = [40, 64]
@@ -365,3 +369,129 @@ def test_cutlass_moe_8_bit_EP(
                                    cutlass_output,
                                    atol=5e-2,
                                    rtol=1e-2)
+
+
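+# Mirrors test_cutlass_moe_8_bit_EP above, but with a large model shape
+# (n=8192, k=5120, e=128, topk=31) split across ep_size=8 expert shards.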
+@pytest.mark.parametrize("m,n,k,topk", [(1, 8192, 5120, 31)])
+@pytest.mark.parametrize("e", [128])
+@pytest.mark.parametrize("per_act_token", [False])
+@pytest.mark.parametrize("per_out_channel", [True])
+@pytest.mark.parametrize("ep_size", [8])
+@pytest.mark.skipif(
+    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
+        current_platform.get_device_capability()),
+    reason="Grouped gemm is not supported on this GPU type.")
+def test_cutlass_moe_8_bit_EP_large(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    per_act_token: bool,
+    per_out_channel: bool,
+    ep_size: int,
+):
+    current_platform.seed_everything(7)
+    with set_current_vllm_config(vllm_config):
+        mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token,
+                                                  per_out_channel)
+
+        score = torch.randn((m, e), device="cuda", dtype=torch.half)
+        topk_weights, topk_ids, _ = fused_topk(mt.a,
+                                               score,
+                                               topk,
+                                               renormalize=False)
+
+        # Note that we are using the dequantized versions of the tensors.
+        # Using a, w1 and w2 directly results in minor output differences.
+        triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights,
+                                      topk_ids)
+
+        assert e % ep_size == 0, "Cannot distribute experts evenly"
+        cutlass_output = run_8_bit(mt,
+                                   topk_weights,
+                                   topk_ids,
+                                   num_local_experts=e // ep_size)
+
+        torch.testing.assert_close(triton_output,
+                                   cutlass_output,
+                                   atol=5e-2,
+                                   rtol=1e-2)
+
+
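+# Exercises run_cutlass_moe_fp8 directly, checking that the kernel's output
+# does not depend on the prior contents of its workspace buffers.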
+@pytest.mark.parametrize("m,n,k,topk", [(1, 8192, 5120, 31)])
+@pytest.mark.parametrize("e", [128])
+@pytest.mark.parametrize("per_act_token", [False])
+@pytest.mark.parametrize("per_out_channel", [True])
+@pytest.mark.parametrize("ep_size", [8])
+@pytest.mark.skipif(
+    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
+        current_platform.get_device_capability()),
+    reason="Grouped gemm is not supported on this GPU type.")
+def test_run_cutlass_moe_fp8(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    per_act_token: bool,
+    per_out_channel: bool,
+    ep_size: int,
+):
+    current_platform.seed_everything(7)
+    with set_current_vllm_config(vllm_config):
+        mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token,
+                                                  per_out_channel)
+
+        score = torch.randn((m, e), device="cuda", dtype=torch.half)
+        topk_weights, topk_ids, _ = fused_topk(mt.a,
+                                               score,
+                                               topk,
+                                               renormalize=False)
+        # Make sure that at least one token is routed to an expert on this
+        # shard and at least one token is routed to an expert outside it.
+        topk_ids[0][0] = -1
+        topk_ids[0][1] = 1
+
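+        # workspace13 is shared by the two widest intermediates: the first
+        # grouped gemm produces (m * topk, 2 * n) and the second produces
+        # (m * topk, k), hence max(2 * n, k) columns. workspace2 holds the
+        # (m * topk, n) activation output between them.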
+        workspace13_shape = (m * topk, max(2 * n, k))
+        workspace2_shape = (m * topk, n)
+        output_shape = (m * topk, k)
+
+        workspace13 = torch.empty(prod(workspace13_shape),
+                                  device="cuda",
+                                  dtype=mt.a.dtype)
+        workspace2 = torch.empty(prod(workspace2_shape),
+                                 device="cuda",
+                                 dtype=mt.a.dtype)
+
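+        # Emulate EP rank 0: global experts [0, num_local_experts) map to
+        # local ids 0..num_local_experts-1, while every other global expert
+        # maps to -1, i.e. it lives on another shard.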
+        num_local_experts = e // ep_size
+        start, end = 0, num_local_experts
+        expert_map = [-1] * e
+        expert_map[start:end] = list(range(num_local_experts))
+        expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda")
+
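+        # silu_and_mul applies SiLU to the first half of its input and
+        # multiplies it elementwise by the second half.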
+        activation = lambda o, i: torch.ops._C.silu_and_mul(o, i)
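+        # run_cutlass_moe_fp8 consumes pre-quantized activations, so quantize
+        # mt.a once here; both invocations below then see identical inputs.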
+        a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale,
+                                                   torch.float8_e4m3fn,
+                                                   per_act_token)
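+        # Bind every argument except the output buffer so the same call can
+        # be replayed against different workspace contents.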
+        func = lambda output: run_cutlass_moe_fp8(
+            output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation,
+            mt.w1_q.size(0), expert_map, mt.w1_scale, mt.w2_scale, a1q_scale,
+            None, workspace13, workspace2, None, mt.a.dtype, per_act_token,
+            per_out_channel, False)
+
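+        # Run the kernel twice, once with a randomized workspace and once
+        # with a zeroed one. If any output elements depend on stale workspace
+        # contents (e.g. rows for tokens not routed to a local expert), the
+        # two results will differ.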
+        workspace13.random_()
+        output_random_workspace = torch.empty(output_shape,
+                                              device="cuda",
+                                              dtype=mt.a.dtype)
+        func(output_random_workspace)
+
+        workspace13.fill_(0)
+        output_zero_workspace = torch.zeros(output_shape,
+                                            device="cuda",
+                                            dtype=mt.a.dtype)
+        func(output_zero_workspace)
+
+        torch.testing.assert_close(output_random_workspace,
+                                   output_zero_workspace,
+                                   atol=5e-3,
+                                   rtol=1e-3)