
Commit d2ef691 (1 parent: 108c8c3)

Commit message: fix

File tree

3 files changed: +38 / -55 lines


paddlenlp/trainer/trainer.py

Lines changed: 0 additions & 3 deletions
@@ -34,7 +34,6 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import numpy as np
-import hashlib
 import paddle
 import paddle.amp.auto_cast as autocast
 import paddle.distributed as dist
@@ -1273,8 +1272,6 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
                     seq_length=seq_length,
                 )
             )
-            logs["loss_md5"] = hashlib.md5(
-                np.array(tr_loss_scalar).tobytes()).hexdigest()

             self._total_loss_scalar += tr_loss_scalar
             self._globalstep_last_logged = self.state.global_step
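The two deleted lines were the only consumer of hashlib in the trainer: they logged an MD5 fingerprint of the step loss so that two runs could be compared for bitwise equality. A minimal standalone sketch of that pattern (the loss value below is made up; in the trainer it is the aggregated tr_loss_scalar):

    import hashlib

    import numpy as np

    # Stand-in for the trainer's aggregated scalar loss at a logging step.
    tr_loss_scalar = 10.57664108

    # np.array(...) of a Python float yields a float64 scalar; hashing its
    # raw bytes gives a digest that matches only if the loss is bitwise
    # identical across runs, stricter than comparing printed rounded values.
    loss_md5 = hashlib.md5(np.array(tr_loss_scalar).tobytes()).hexdigest()
    print(loss_md5)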

paddlenlp/transformers/gpt/modeling_auto.py

Lines changed: 0 additions & 14 deletions
@@ -396,8 +396,6 @@ def forward(self,
         out = self.out_proj(out)
         # if sequence_parallel is true, out shape are [bs * seq_len / n, dim]
         # else their shape are [bs, seq_len, dim], n is mp parallelism.
-        out = dist.reshard(out, get_mesh(self.ipp),
-                           [dist.Shard(0), dist.Replicate()])
         outs = [out]
         if output_attentions:
             outs.append(weights)
@@ -672,11 +670,7 @@ def forward(self,
         if not self.config.use_fused_dropout_add:
             act = self.activation(self.linear1(hidden_states),
                                   approximate=True)
-            act = dist.reshard(act, get_mesh(
-                self.ipp), [dist.Shard(0), dist.Shard(2)])
             l_2 = self.linear2(act)
-            l_2 = dist.reshard(l_2, get_mesh(
-                self.ipp), [dist.Shard(0), dist.Replicate()])
             hidden_states = residual + self.dropout2(l_2)
         else:
             hidden_states = self.fused_dropout_add2(
@@ -769,10 +763,6 @@ def forward(self, input_ids, position_ids=None, inputs_embeddings=None):
         # The 'with' block ensures the correct seed context is used
         with seed_guard_context(current_seed):
             embeddings = self.dropout(embeddings)
-            embeddings = dist.reshard(
-                embeddings, get_mesh(),
-                [dist.Shard(0), dist.Replicate()])
-
         return embeddings


@@ -1338,8 +1328,6 @@ def forward(self, hidden_states, tensor_parallel_output=None):
         y = dist.reshard(self.weight, get_mesh(self.ipp),
                          [dist.Replicate(), dist.Shard(0)])
         logits = paddle.matmul(hidden_states, y, transpose_y=self.transpose_y)
-        logits = dist.reshard(logits, get_mesh(self.ipp),
-                              [dist.Shard(0), dist.Replicate()])
         return logits


@@ -1441,8 +1429,6 @@ def forward(
         else:
             hidden_states = outputs[0]
         logits = self.lm_head(hidden_states)
-        logits = dist.reshard(logits, get_mesh(self.ipp),
-                              [dist.Shard(0), dist.Replicate()])
         return logits

         # NOTE: The following code failed to run from dynamic to static mode
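Every deletion in this file is a dist.reshard call that pinned a fresh intermediate to an explicit placement; the commit drops them, presumably because those layouts can be derived by the auto-parallel machinery (the commit message says only "fix"). For reference, a minimal sketch of the reshard API under assumed conditions: a 1-D two-rank mesh and a plain tensor, rather than the model's pipeline-stage meshes from get_mesh(self.ipp):

    import paddle
    import paddle.distributed as dist

    # Illustrative 1-D mesh over two ranks; run under
    #   python -m paddle.distributed.launch --devices=0,1 demo.py
    mesh = dist.ProcessMesh([0, 1], dim_names=["dp"])

    x = paddle.randn([4, 8])
    # Distribute x with its first dimension sharded across the mesh,
    # i.e. each rank holds a [2, 8] shard.
    x_sharded = dist.shard_tensor(x, mesh, [dist.Shard(0)])

    # reshard() converts one placement into another, inserting whatever
    # communication is needed (here an all-gather) so that every rank
    # ends up with a full replicated copy.
    x_replicated = dist.reshard(x_sharded, mesh, [dist.Replicate()])
    print(x_replicated.shape)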

scripts/distribute/ci_case_auto.sh

Lines changed: 38 additions & 38 deletions
@@ -30,42 +30,42 @@ unset CUDA_VISIBLE_DEVICES

 function gpt_case_list_auto() {
     gpt_auto_recompute_bs16_fp32_DP1-MP1-PP1
-    # gpt_auto_recompute_bs16_fp16_o2_DP1-MP1-PP8
-    # gpt_auto_recompute_bs16_fp16_o2_DP1-MP2-PP4
-    # gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2
-    # gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage1
-    # gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage2
-    # gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage3
-    # gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage1
-    # gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage2
-    # gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage3
-    # gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage1
-    # gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage2
-    # gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage3
-    # gpt_auto_sp_acc_check
+    gpt_auto_recompute_bs16_fp16_o2_DP1-MP1-PP8
+    gpt_auto_recompute_bs16_fp16_o2_DP1-MP2-PP4
+    gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2
+    gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage1
+    gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage2
+    gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage3
+    gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage1
+    gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage2
+    gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage3
+    gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage1
+    gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage2
+    gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage3
+    gpt_auto_sp_acc_check
 }

 function llama_case_list_auto() {
     llama_dygraph_auto_bs8_fp32_DP2
-    # llama_dygraph_auto_bs8_fp32_DP2-MP2
-    # llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
-    # llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2
-
-    # llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1
-    # llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1
-    # llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP1
-    # llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP2
-    # llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP2-VPP2-Sharding2_stage2
-    # llama_static_auto_recompute_bs16_fp16_DP2-MP2-PP2-VPP2-Sharding2_stage2
+    llama_dygraph_auto_bs8_fp32_DP2-MP2
+    llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
+    llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2
+
+    llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1
+    llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1
+    llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP1
+    llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP2
+    llama_static_auto_recompute_bs16_fp32_DP2-MP2-PP2-VPP2-Sharding2_stage2
+    llama_static_auto_recompute_bs16_fp16_DP2-MP2-PP2-VPP2-Sharding2_stage2
 }

 function gpt_case_list_auto_pir() {
     gpt_auto_recompute_bs16_fp16_o2_DP1-MP1-PP8_pir
-    # gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_pir
-    # gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage1_pir
-    # gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage1_pir
-    # gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage2_pir
-    # gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage3_pir
+    gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_pir
+    gpt_auto_recompute_bs16_fp16_o2_DP4-MP2-Sharding4_stage1_pir
+    gpt_auto_recompute_bs16_fp16_o2_DP2-MP1-PP4_Sharding2_stage1_pir
+    gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage2_pir
+    gpt_auto_recompute_bs16_fp16_o2_DP2-MP2-PP2_Sharding2_stage3_pir
 }

 function llm_gpt_case_list_auto() {
@@ -1578,11 +1578,11 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
     ips=-1
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
-    loss_base=10.57663822
-    loss_md5_base=86a59936c56ae83ce556dec0833ca35e
+    loss_base=10.57664108
+    loss_md5_base=0ebf68698887b33b33a46518621cf412
     ips_base=-1
     mem_base=-1
-    check_result $FUNCNAME ${loss_base} ${loss} ${loss_md5_base} ${loss_md5} ${mem_base} ${mem}
+    check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run end ==========="
 }

@@ -1648,11 +1648,11 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2() {
     ips=-1
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
-    loss_base=10.57694435
-    loss_md5_base=42649563a5ae2af87b9322d33f75deb1
+    loss_base=10.57694054
+    loss_md5_base=6df87d01bd08113a92930f6349514b35
     ips_base=-1
     mem_base=-1
-    check_result $FUNCNAME ${loss_base} ${loss} ${loss_md5_base} ${loss_md5} ${mem_base} ${mem}
+    check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run end ==========="
 }

@@ -1718,11 +1718,11 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
     ips=-1
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
-    loss_base=10.57580185
-    loss_md5_base=9751dab0842de5905a8c0b87d1f06d67
+    loss_base=10.5758028
+    loss_md5_base=6cb4e151b35f026190df90ab240d9a95
     ips_base=-1
     mem_base=-1
-    check_result $FUNCNAME ${loss_base} ${loss} ${loss_md5_base} ${loss_md5} ${mem_base} ${mem}
+    check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run end ==========="
 }

@@ -1792,7 +1792,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
     loss_md5_base=e82a1f5668870d18a2d45b3ee0a25386
     ips_base=-1
     mem_base=-1
-    check_result $FUNCNAME ${loss_base} ${loss} ${loss_md5_base} ${loss_md5} ${mem_base} ${mem}
+    check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run end ==========="
 }
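One behavioral note on the last four hunks: since the trainer no longer emits loss_md5, check_result is now handed the ips baseline/measured pair in the positions the md5 pair used to occupy. check_result itself is defined elsewhere in ci_case_auto.sh; a hypothetical Python rendering of the comparison it performs (names and tolerance are assumptions, and -1 conventionally means "not measured"):

    def check_result(case, loss_base, loss, ips_base, ips, mem_base, mem,
                     rtol=1e-6):
        # Hypothetical sketch: compare each observed metric to its baseline,
        # skipping metrics recorded as -1 ("not measured" in the CI script).
        for name, base, got in (("loss", loss_base, loss),
                                ("ips", ips_base, ips),
                                ("mem", mem_base, mem)):
            if float(base) == -1:
                continue
            if abs(float(got) - float(base)) > rtol * max(1.0, abs(float(base))):
                raise AssertionError(f"{case}: {name}={got}, expected {base}")

    # Example mirroring llm_gpt_dygraph_auto_bs8_fp32_DP2 above:
    check_result("llm_gpt_dygraph_auto_bs8_fp32_DP2",
                 10.57664108, 10.57664108, -1, -1, -1, -1)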
