@@ -877,13 +877,6 @@ def compute_ffn_layernorm(self, out_linear_out, residual_input, i):
         return tmp_out, residual_input
 
     def compute_fused_moe(self, tmp_out, i):
-        # todo[xinhw]: make bias optional
-        if self.ffn1_biases[i] is None:
-            shape1 = paddle.to_tensor([self.ffn1_weights[i].shape[0], 1, self.dim_feedforward * 2])
-            self.ffn1_biases[i] = paddle.zeros(shape1)
-        if self.ffn2_biases[i] is None:
-            shape2 = paddle.to_tensor([self.ffn1_weights[i].shape[0], 1, self.embed_dim])
-            self.ffn2_biases[i] = paddle.zeros(shape2)
         fused_moe_out = fused_moe(
             tmp_out,
             self.gate_weights[i],
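This hunk (and the identical one below for the second decoder variant) drops the zero-filled-bias workaround: previously, when `ffn1_biases[i]` or `ffn2_biases[i]` was `None`, a dense zero tensor of shape `[num_experts, 1, inner_dim]` was materialized so that `fused_moe` always received bias tensors. A minimal sketch of what the removed fallback did, assuming the expert count sits in the leading dimension of the weight tensor (`zero_bias_fallback` is a hypothetical helper for illustration, not part of the patch):

```python
import paddle

def zero_bias_fallback(weight, inner_dim):
    # Hypothetical helper mirroring the removed workaround:
    # weight.shape[0] is the expert count; the singleton middle axis
    # lets the zero bias broadcast over the tokens routed to each expert.
    num_experts = weight.shape[0]
    return paddle.zeros([num_experts, 1, inner_dim], dtype=weight.dtype)
```

After this change, `None` biases are presumably passed straight through to `fused_moe` and handled inside the kernel, avoiding a per-layer allocation of all-zero bias tensors.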
@@ -1302,13 +1295,6 @@ def compute_out_linear(self, fmha_out, i):
         )
 
     def compute_fused_moe(self, tmp_out, i):
-        # todo[xinhw]: make bias optional
-        if self.ffn1_biases[i] is None:
-            shape1 = paddle.to_tensor([self.ffn1_weights[i].shape[0], 1, self.dim_feedforward * 2])
-            self.ffn1_biases[i] = paddle.zeros(shape1)
-        if self.ffn2_biases[i] is None:
-            shape2 = paddle.to_tensor([self.ffn1_weights[i].shape[0], 1, self.embed_dim])
-            self.ffn2_biases[i] = paddle.zeros(shape2)
         fused_moe_out = fused_moe(
             tmp_out,
             self.gate_weights[i],