Commit 4463081

Move fc2_latent_proj into combine method to make sure tensors are the right shape

Signed-off-by: Deepak Narayanan <[email protected]>
1 parent 8a9f127

File changed: megatron/core/transformer/moe/moe_layer.py (4 additions, 4 deletions)
@@ -269,6 +269,10 @@ def combine(self, output: torch.Tensor, shared_expert_output: Optional[torch.Ten
         """
         output = self.token_dispatcher.token_combine(output)
         output = self.token_dispatcher.combine_postprocess(output)
+        # Project the output back from latent dimension to hidden dimension after combine
+        # in latent dimension.
+        if self.config.moe_latent_size:
+            output, _ = self.fc2_latent_proj(output)
         if shared_expert_output is not None:
             output = output + shared_expert_output
         return output

@@ -313,10 +317,6 @@ def custom_forward(hidden_states):
         output = output + mlp_bias
         mlp_bias = None
     output = self.combine(output, shared_expert_output)
-    # Project the output back from latent dimension to hidden dimension after combine
-    # in latent dimension.
-    if self.config.moe_latent_size:
-        output, _ = self.fc2_latent_proj(output)

     return output, mlp_bias
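
For intuition, below is a minimal, hedged sketch of the shape issue the commit message points at ("to make sure tensors are the right shape"). It is not the Megatron-LM implementation: only the names taken from the diff (`combine`, `fc2_latent_proj`, `moe_latent_size`, `shared_expert_output`) come from the source, and the `nn.Linear` stand-in, class name, and tensor shapes are illustrative assumptions. One plausible reading of the change is that when `moe_latent_size` is set, the routed-expert output arrives in the latent dimension while the shared-expert output is in the hidden dimension, so projecting back to hidden inside `combine`, before the shared-expert addition, keeps the two tensors the same shape.

```python
import torch
import torch.nn as nn
from typing import Optional


class MoELayerShapeSketch(nn.Module):
    """Illustrative sketch only (assumed shapes, not the Megatron-LM code):
    shows why projecting latent -> hidden inside combine(), before the
    shared-expert addition, keeps the add well-shaped."""

    def __init__(self, hidden_size: int, moe_latent_size: Optional[int] = None):
        super().__init__()
        self.moe_latent_size = moe_latent_size
        if moe_latent_size:
            # Hypothetical stand-in for fc2_latent_proj: maps latent -> hidden.
            self.fc2_latent_proj = nn.Linear(moe_latent_size, hidden_size)

    def combine(
        self, output: torch.Tensor, shared_expert_output: Optional[torch.Tensor]
    ) -> torch.Tensor:
        # Assumption: `output` arrives in the latent dimension when
        # moe_latent_size is set (routed-expert path), while the
        # shared-expert output is already in the hidden dimension.
        if self.moe_latent_size:
            output = self.fc2_latent_proj(output)  # [..., latent] -> [..., hidden]
        if shared_expert_output is not None:
            output = output + shared_expert_output  # shapes now agree
        return output


# Usage sketch: with latent=64 and hidden=128 the addition only works
# because the projection runs before it.
layer = MoELayerShapeSketch(hidden_size=128, moe_latent_size=64)
routed = torch.randn(4, 64)    # latent-dimension routed-expert output
shared = torch.randn(4, 128)   # hidden-dimension shared-expert output
print(layer.combine(routed, shared).shape)  # torch.Size([4, 128])
```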
