[MoE] Fix a bug when using 0-D tensors in the MoE model (#5538)

pkuzyc · web-flow · commit 80cc85956ce7 · 2023-05-09T21:26:28.000+08:00
diff --git a/examples/language_model/moe/dygraph/run_moe_pretrain.py b/examples/language_model/moe/dygraph/run_moe_pretrain.py
@@ -551,7 +551,7 @@ def do_train(args):
 
                     if args.gate != "naive" and args.balance_loss_weight:
                         aux_loss_list = [
-                            l.moe_mlp.gate.get_loss(clear=False)
+                            l.moe_mlp.gate.get_loss(clear=False).reshape([-1])
                             for l in model.gpt.decoder.layers
                             if hasattr(l.moe_mlp, "gate")
                         ]

Original file line number	Diff line number	Diff line change
`@@ -551,7 +551,7 @@ def do_train(args):`
`551`	`551`
`552`	`552`	`if args.gate != "naive" and args.balance_loss_weight:`
`553`	`553`	`aux_loss_list = [`
`554`		`- l.moe_mlp.gate.get_loss(clear=False)`
	`554`	`+ l.moe_mlp.gate.get_loss(clear=False).reshape([-1])`
`555`	`555`	`for l in model.gpt.decoder.layers`
`556`	`556`	`if hasattr(l.moe_mlp, "gate")`
`557`	`557`	`]`