
Commit 14148f2

Merge branch 'develop' into update_distloader
2 parents: 7ea22e8 + d6ac1bd

15 files changed: +520 additions, -53 deletions


model_zoo/bert/run_pretrain.py

Lines changed: 4 additions & 0 deletions
@@ -424,6 +424,10 @@ def do_train(args):
             optimizer.step()
             lr_scheduler.step()
             optimizer.clear_grad()
+
+            # NOTE: For accurate data statistics, please open the comments below,especially when args.logging_steps==1.
+            # if global_step % args.logging_steps == 0:
+            # loss = loss.numpy()
             total_samples += args.batch_size
             train_run_cost = time.time() - batch_start
             train_cost_avg.record(train_run_cost)
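
Note on the commented-out logging snippet above: calling .numpy() on the loss copies it to the host and synchronizes the device, which is why the commit leaves it gated on args.logging_steps instead of running it every step. A minimal, self-contained sketch of that pattern (illustrative values only; the real loop uses the loss, global_step, and args from run_pretrain.py):

import paddle

# Hedged sketch, not part of the commit: fetch the loss for logging only on
# logging steps, since .numpy() forces a device-to-host copy and a sync.
logging_steps = 20
for global_step in range(1, 101):
    loss = paddle.mean(paddle.rand([8]))          # stand-in for the real training loss
    if global_step % logging_steps == 0:
        print(global_step, float(loss.numpy()))   # synchronizes before reading the value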

paddlenlp/trainer/plugins/unified_checkpoint.py

Lines changed: 24 additions & 20 deletions
@@ -30,6 +30,7 @@
 from paddlenlp.transformers.model_utils import (
     PretrainedModel,
     _load_state_dict_into_model,
+    faster_set_state_dict,
     get_parameter_dtype,
     load_state_dict,
     unwrap_model,
@@ -65,9 +66,10 @@
 from paddlenlp.utils.nested import nested_copy, nested_copy_place

 if is_safetensors_available():
-    from safetensors import safe_open
+    # from safetensors import safe_open
     from safetensors.numpy import save_file as safe_save_file

+    from paddlenlp.utils.safetensors import fast_safe_open as safe_open

 FP32_MASTER = "fp32_master_0"
 optimizer_scalar_name = [
@@ -196,7 +198,6 @@ def load_unified_checkpoint(args, model, optimizer, resume_from_checkpoint: str,
     Returns:
         None
     """
-
     if paddle.distributed.get_world_size() <= 1:
         load_single_card_checkpoint(args, model, resume_from_checkpoint)
         return
@@ -222,7 +223,6 @@ def load_unified_checkpoint_locally(args, model, resume_from_checkpoint: str, sa
         pretrained_model_name_or_path=resume_from_checkpoint,
         index_filename=os.path.join(resume_from_checkpoint, index_filename),
     )
-
     loaded_keys = sharded_metadata["all_checkpoint_keys"]

     model_state_dict = get_expected_state_dict(model)
@@ -266,7 +266,9 @@ def _remove_unused_keys(
         else:
             tp_actions = model.get_tensor_parallel_convert_actions(model.config, loaded_keys, ignore_error=True)
         # Here we use expected_keys to optimize weights loading for pipeline model. Only works for safetensors
-        state_dict = load_state_dict(shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys)
+        state_dict = load_state_dict(
+            shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys, device="expected"
+        )

         if not pre_tensor_parallel_split:
             # Since we load all keys but we only need one of pipeline stages
@@ -279,11 +281,12 @@ def _remove_unused_keys(
                 None, model.config, state_dict=state_dict, ignore_error=len(resolved_archive_file) > 1
             )

-        error_msgs += _load_state_dict_into_model(model, state_dict, "")
+        # error_msgs += _load_state_dict_into_model(model, state_dict, "")
+        error_msgs += faster_set_state_dict(model, state_dict, strict_dtype=False)

         # force memory release
         del state_dict
-        gc.collect()
+        # gc.collect()

         if len(error_msgs) > 0:
             error_msg = "\n\t".join(error_msgs)
@@ -337,6 +340,7 @@ def unified_checkpoint_into_shards(
         tp_actions = model_to_save.get_tensor_parallel_convert_actions(
             model_to_save.config, state_dict.keys(), is_split=False, ignore_error=True
         )
+        logger.info("Unified model tensor parallel weights in shards")
         state_dict = merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys)

     # build index json file
@@ -490,6 +494,7 @@ def load_unified_optimizer_locally(args, model, optimizer, resume_from_checkpoin
     # This should always be a list but, just to be sure.
     if not isinstance(resolved_archive_file, list):
         resolved_archive_file = [resolved_archive_file]
+
     if len(resolved_archive_file) > 1:
         resolved_archive_file = tqdm(resolved_archive_file, desc="Loading optimizer shards")

@@ -537,10 +542,10 @@ def load_resolved_archive_file(resolved_archive_file, sharded_metadata, expected
                 tp_actions = mapping_optimizer_tp_actions(tp_actions, expected_keys)

                 # Here we use expected_keys to optimize weights loading for pipeline model. Only works for safetensors
-                state_dict = load_state_dict(shard_file, tp_actions, expected_keys)
+                state_dict = load_state_dict(shard_file, tp_actions, expected_keys, device="expected")
             else:
                 # for pipeline model, we don't need to use tp_actions
-                state_dict = load_state_dict(shard_file, None, expected_keys)
+                state_dict = load_state_dict(shard_file, None, expected_keys, device="expected")

             returned_state_dict.update(state_dict)
             # force memory release
@@ -553,7 +558,6 @@ def load_resolved_archive_file(resolved_archive_file, sharded_metadata, expected
         state_dict_master_weight = load_resolved_archive_file(
             resolved_archive_file_mw, sharded_metadata_mw, expected_keys_mw, is_master_weights=True
         )
-
     # rename optimizer param
     for key in list(state_dict_optim.keys()):
         key_name = key.split("/")
@@ -562,13 +566,13 @@ def load_resolved_archive_file(resolved_archive_file, sharded_metadata, expected
             key_name = "_".join([static_name, FP32_MASTER, key_name[1]])
         else:
             key_name = "_".join([static_name, key_name[1]])
-        returned_optim_state_dict[key_name] = state_dict_optim[key]
+        returned_optim_state_dict[key_name] = state_dict_optim.pop(key)
         returned_optim_state_dict[key_name].name = key_name

     if has_master_weights:
         for key in list(state_dict_master_weight.keys()):
             static_name = struct2static_name_mappings[key]
-            returned_optim_state_dict["master_weights"][static_name] = state_dict_master_weight[key]
+            returned_optim_state_dict["master_weights"][static_name] = state_dict_master_weight.pop(key)
             returned_optim_state_dict["master_weights"][static_name].name = "_".join([static_name, FP32_MASTER])

     returned_optim_state_dict = nested_copy_place(
@@ -640,6 +644,7 @@ def unified_optimizer_into_shards(
         tp_actions = model.get_tensor_parallel_convert_actions(
             model.config, model_keys, is_split=False, ignore_error=True
         )
+        logger.info("Unified optimizer tensor parallel in shards")
         optim_state_dict = merge_tensor_parallel_for_optimizer(
             optim_state_dict,
             tp_actions,
@@ -648,6 +653,7 @@
         paddle.device.cuda.empty_cache()

         if master_weights is not None:
+            logger.info("Unified master weight tensor parallel in shards")
             master_weights = merge_tensor_parallel_for_optimizer(
                 master_weights,
                 tp_actions,
@@ -703,7 +709,6 @@ def unified_optimizer_into_shards(
 def check_unified_checkpoint(args, model, resume_from_checkpoint, safe_serialization=False):
     index_filename = select_model_weight_index(args, model, resume_from_checkpoint, safe_serialization, local=False)
     index_filename = os.path.join(resume_from_checkpoint, index_filename)
-
     # Find index json file and distribute this file in global group.
     if distributed_isfile(index_filename):
         distributed_file(index_filename)
@@ -1605,7 +1610,9 @@ def gather_sharded_object(index_file, total_size, is_optimizer=False):
     tp_group = hcg.get_model_parallel_group()
     pp_group = hcg.get_pipe_parallel_group()

-    logger.info("Unified checkpoint generating sharded_index json files.")
+    logger.info(
+        f"Unified checkpoint: generating sharded_index json files for {'optimizer or master weight' if is_optimizer else 'model weight'}."
+    )

     if tp_group.nranks > 1:
         dist.all_gather_object(index_file_list, index_file, tp_group)
@@ -1714,8 +1721,6 @@ def filter_params(model_to_save, state_dict, is_optimizer=False):


 def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys):
-    logger.info("Unified checkpoint merge tensor parallel in shards")
-
     hcg = fleet.get_hybrid_communicate_group()
     tp_group = hcg.get_model_parallel_group()
     tp_rank = tp_group.rank
@@ -1741,7 +1746,7 @@ def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys):
                 action = tp_actions.pop(key)
                 tensor = action(ret) if is_dst else None
             else:
-                tensor = tensor._copy_to(paddle.CPUPlace(), False) if is_dst else None
+                tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) if is_dst else None

             if is_dst:
                 state_dict_to_save[key] = tensor
@@ -1754,8 +1759,7 @@ def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys):


 def merge_tensor_parallel_for_optimizer(state_dict, tp_actions, all_filter_keys):
-    logger.info("Unified optimizer tensor parallel in shards")
-
+    # Core function for UC
     hcg = fleet.get_hybrid_communicate_group()
     tp_group = hcg.get_model_parallel_group()
     tp_rank = tp_group.rank
@@ -1774,14 +1778,14 @@ def merge_tensor_parallel_for_optimizer(state_dict, tp_actions, all_filter_keys)
                 # for example: beta1, beta2
                 if tensor.numel().item() == 1:
                     tensor = (
-                        tensor._copy_to(paddle.CPUPlace(), False) if is_dst else None
+                        tensor._copy_to(paddle.CUDAPinnedPlace(), False) if is_dst else None
                     )  # Need broadcast when loaded
                 else:
                     ret = distributed_gather(tensor, dst=j, group=tp_group, offload=False)
                     action = tp_actions[model_key]
                     tensor = action(ret) if is_dst else None
             else:
-                tensor = tensor._copy_to(paddle.CPUPlace(), False) if is_dst else None
+                tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) if is_dst else None

             if is_dst:
                 state_dict_to_save[filter_keys[i]] = tensor
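
A recurring detail in the optimizer-loading changes above is re-keying entries with dict.pop() (state_dict_optim.pop(key), state_dict_master_weight.pop(key)) instead of plain indexing, so the source dict drops its reference as soon as the tensor is filed under its new name. A small, self-contained sketch of that pattern, with plain Python objects standing in for tensors:

# Hedged sketch, not part of the commit: re-key entries with dict.pop() so the
# old mapping releases each value immediately instead of keeping a second reference.
def rekey_with_pop(src, rename):
    dst = {}
    for key in list(src.keys()):          # snapshot keys; src is mutated while iterating
        dst[rename(key)] = src.pop(key)
    return dst

optim = {"embedding.weight/moment1_0": "tensor-A", "linear.weight/moment1_0": "tensor-B"}
renamed = rekey_with_pop(optim, lambda k: k.replace("/", "_"))
print(renamed)   # {'embedding.weight_moment1_0': 'tensor-A', 'linear.weight_moment1_0': 'tensor-B'}
print(optim)     # {} -- the source dict no longer holds references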

paddlenlp/trainer/trainer.py

Lines changed: 8 additions & 3 deletions
@@ -2436,6 +2436,7 @@ def _load_optimizer_and_scheduler(self, checkpoint):
             self.runtime_timer.stop()
             return

+        logger.info("Loading optimizer and scheduler...")
         if (not self.args.should_load_sharding_stage1_model) and self.args.ignore_load_lr_and_optim:
             self.runtime_timer.stop()
             return
@@ -2765,11 +2766,15 @@ def evaluation_loop(
         # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of
         # samplers has been rounded to a multiple of batch_size, so we truncate.
         if all_losses is not None:
-            all_losses = all_losses[:num_samples]
+            all_losses = all_losses[: num_samples * int(self.args.world_size / self.args.dataset_world_size)]
         if all_preds is not None:
-            all_preds = nested_truncate(all_preds, num_samples)
+            all_preds = nested_truncate(
+                all_preds, num_samples * int(self.args.world_size / self.args.dataset_world_size)
+            )
         if all_labels is not None:
-            all_labels = nested_truncate(all_labels, num_samples)
+            all_labels = nested_truncate(
+                all_labels, num_samples * int(self.args.world_size / self.args.dataset_world_size)
+            )

         model.train()
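
The evaluation_loop change scales the truncation bound by world_size / dataset_world_size. A plausible reading (an assumption, not stated in the commit): when only dataset_world_size ranks hold distinct data shards, the gathered losses, predictions, and labels contain each sample world_size / dataset_world_size times, so truncating to plain num_samples would cut off real entries. With assumed numbers:

# Illustrative numbers only (assumed, not from the commit).
world_size = 8             # total ranks
dataset_world_size = 4     # ranks that hold distinct data shards
num_samples = 1000         # samples in the eval dataset

duplication = int(world_size / dataset_world_size)    # each sample gathered 2 times
truncate_to = num_samples * duplication               # keep 2000 gathered entries
print(duplication, truncate_to)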

paddlenlp/transformers/conversion_utils.py

Lines changed: 13 additions & 3 deletions
@@ -285,8 +285,12 @@ def naive_fuse_merge_tp(weight_list, is_column=True, fuse_tensor_parts=2):

     if isinstance(weight_list[0], np.ndarray):
         return np.concatenate([reorder[i] for i in index], axis=axis)
+    else:
+        tensor = paddle.concat([reorder[i] for i in index], axis=axis)

-    return paddle.concat([reorder[i] for i in index], axis=axis)._copy_to(paddle.CPUPlace(), False)
+    if tensor.place.is_gpu_place():
+        tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False)
+    return tensor


 def naive_fuse_split_tp(
@@ -361,12 +365,18 @@ def normal_fuse_merge_tp(weight_list, is_column=True):
         if isinstance(weight_list[0], np.ndarray):
             return np.concatenate(weight_list, axis=-1)
         else:
-            return paddle.concat(weight_list, axis=-1)._copy_to(paddle.CPUPlace(), False)
+            tensor = paddle.concat(weight_list, axis=-1)
+            if tensor.place.is_gpu_place():
+                tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False)
+            return tensor
     else:
         if isinstance(weight_list[0], np.ndarray):
             return np.concatenate(weight_list, axis=0)
         else:
-            return paddle.concat(weight_list, axis=0)._copy_to(paddle.CPUPlace(), False)
+            tensor = paddle.concat(weight_list, axis=0)
+            if tensor.place.is_gpu_place():
+                tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False)
+            return tensor


 def normal_fuse_split_tp(weight, tensor_parallel_degree, tensor_parallel_rank=None, is_column=True):
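
All three branches above use the same guarded copy: concatenate first, then move the result into CUDA pinned host memory only if it actually lives on the GPU. A minimal, self-contained sketch of that guard (the helper name is hypothetical; on a CPU-only Paddle build the guard is simply a no-op):

import paddle

# Hedged sketch of the guarded copy used above: move a tensor into pinned host
# memory only when it is on the GPU; otherwise leave it where it is.
def to_pinned_if_on_gpu(tensor):
    if tensor.place.is_gpu_place():
        return tensor._copy_to(paddle.CUDAPinnedPlace(), False)   # non-blocking copy
    return tensor

x = paddle.concat([paddle.rand([2, 4]), paddle.rand([2, 4])], axis=0)
x = to_pinned_if_on_gpu(x)
print(x.place)   # pinned place on a GPU build, CPU place otherwise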

paddlenlp/transformers/llama/modeling.py

Lines changed: 9 additions & 8 deletions
@@ -96,9 +96,11 @@ def swiglu(x, y=None):
     "LlamaForCausalLM",
     "LlamaPretrainingCriterion",
 ]
-global npu_is_casual
+
+
 npu_is_casual = False

+
 def _get_interleave(n):
     def _get_interleave_power_of_2(n):
         start = 2 ** (-(2 ** -(math.log2(n) - 3)))
@@ -213,7 +215,7 @@ def scaled_dot_product_attention(
 ):
     bsz, q_len, num_heads, head_dim = query_states.shape
     _, kv_seq_len, _, _ = value_states.shape
-    global npu_is_casual
+
     if config.use_flash_attention and flash_attention:
         # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim]
         # Torch Flash Attention input [ bz, nhead, seqlen, head_dim]
@@ -1119,7 +1121,6 @@ def __init__(self, config, layerwise_recompute: bool = False):
         self.layerwise_recompute = layerwise_recompute
         self.recompute_granularity = config.recompute_granularity

-
     def forward(
         self,
         hidden_states: paddle.Tensor,
@@ -1613,14 +1614,12 @@ def forward(
         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype
         )  # [bs, 1, seq_len, seq_len]
-        global npu_is_casual
         if self.config.use_flash_attention:
             is_casual = is_casual_mask(attention_mask)
             if get_env_device() != "npu":
                 if is_casual and alibi is None:
                     attention_mask = None
             else:
-                npu_is_casual = is_casual
                 attention_mask = attention_mask.astype("bool")
         hidden_states = inputs_embeds
         # decoder layers
@@ -1728,10 +1727,12 @@ def forward(self, prediction_scores, masked_lm_labels):
         # skip ignore_index which loss == 0
         # masked_lm_loss = masked_lm_loss[masked_lm_loss > 0]
         # loss = paddle.mean(masked_lm_loss)
-        binary_sequence = paddle.where(masked_lm_loss > 0, paddle.ones_like(masked_lm_loss), paddle.zeros_like(masked_lm_loss))
+        binary_sequence = paddle.where(
+            masked_lm_loss > 0, paddle.ones_like(masked_lm_loss), paddle.zeros_like(masked_lm_loss)
+        )
         sum_ = paddle.sum(binary_sequence)
-        loss = 0 if sum_ == 0 else paddle.sum(masked_lm_loss * binary_sequence) / sum_
-
+        loss = 0 if sum_ == 0 else paddle.sum(masked_lm_loss * binary_sequence) / sum_
+
         return loss
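
The reformatted LlamaPretrainingCriterion above averages the loss over only the tokens that were not ignored: a 0/1 mask marks positions with positive loss, and the masked sum is divided by the mask count, falling back to 0 when every token is ignored. A small standalone sketch with made-up per-token losses:

import paddle

# Hedged sketch of the masked mean computed in LlamaPretrainingCriterion:
# average the per-token loss over tokens whose loss is > 0 (i.e. not ignored).
masked_lm_loss = paddle.to_tensor([0.0, 2.0, 0.0, 4.0])   # made-up per-token losses

binary_sequence = paddle.where(
    masked_lm_loss > 0, paddle.ones_like(masked_lm_loss), paddle.zeros_like(masked_lm_loss)
)
sum_ = paddle.sum(binary_sequence)
loss = 0 if sum_ == 0 else paddle.sum(masked_lm_loss * binary_sequence) / sum_
print(float(loss))   # (2.0 + 4.0) / 2 = 3.0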
