Add comments and improve condition for zero-out

minosfuture · minosfuture · commit b2c0bef85b20 · 2025-07-03T10:49:26.000-07:00
Signed-off-by: Ming Yang <yming@meta.com>
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -178,7 +178,10 @@ def run_cutlass_moe_fp8(
         c2 = _resize_cache(workspace2, (M * topk, N))
         c3 = _resize_cache(workspace13, (M * topk, K))
 
-    if expert_map is not None and not per_act_token:
+    if not per_act_token and (expert_map is not None or use_batched_format):
+        # This is necessary to avoid imprecise scale calculation caused by
+        # random data in the unused workspace. The workspace is unused when
+        # this rank handles only partial tokens, or when using batched format.
         c1.fill_(0)
 
     ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale, expert_offsets,