     to_placements,
 )
 from .random import determinate_rng, rng_state
-from .sharding import ShardingOptimizerStage1
+from .sharding import ShardingOptimizerStage1, get_placement_with_sharding
 
 if TYPE_CHECKING:
     from collections.abc import Callable, Sequence
@@ -992,30 +992,6 @@ def replicate_layer_params_and_buffers(
     )
 
 
-def get_placement_with_sharding(param, sharding_axis):
-    shard_axis = -1
-    for placement in param.placements:
-        if isinstance(placement, dist.Shard):
-            # the parameter can't be shard twice with sharding on different mesh now
-            # for example, [Shard(0), Shard(1)], assert here in case
-            assert (
-                shard_axis == -1
-            ), "The parameter can't be shard twice with sharding strategy even in different mesh now."
-            shard_axis = placement.get_dim()
-
-    placement_with_sharding = None
-    for dim in range(param.ndim):
-        if dim != shard_axis:
-            placement_with_sharding = dist.Shard(dim)
-            break
-
-    new_placements = param.placements
-    if placement_with_sharding is not None:
-        new_placements[sharding_axis] = placement_with_sharding
-
-    return new_placements
-
-
 class _ShardOptimizer(Optimizer):
     def __init__(self, optimizer, shard_fn=None, gradient_accumulation_steps=1):
         assert (
@@ -2548,6 +2524,7 @@ def state_dict(
         self,
         mode: Literal['opt', 'param', 'all'] = "all",
         split_fusion: bool = True,
+        load_sharded_model: bool = True,
     ) -> dict[str, Tensor]:
         """
         Get the state dict of model and optimizer.
@@ -2559,7 +2536,6 @@ def state_dict(
                 'all' : The return value contains the variable in the network and optimizer.
                 Default: 'all'
         """
-
         if use_pir_api():
             scope = paddle.static.global_scope()
             local_state_dict = self.dist_main_program(
@@ -2626,6 +2602,21 @@ def state_dict(
                 ] = dist_tensor
                 dist_state_dict.pop(param)
 
+        # when tensor-fusion is enabled, the optimizer parameters are unbalanced
+        # in their sharding. We need to process the optimizer parameters to make
+        # them evenly balanced
+        if self._engine._optimizer is not None and load_sharded_model:
+            optimizer = self._engine._optimizer
+            if isinstance(
+                optimizer,
+                paddle.static.amp.decorator.OptimizerWithMixedPrecision,
+            ):
+                optimizer = optimizer._optimizer
+            if isinstance(optimizer, ShardingOptimizerStage1):
+                optimizer.convert_state_dict_without_tensor_fusion_param(
+                    dist_state_dict
+                )
+
         mapping_names = [
             (
                 self._parameter_to_structured_name[k]
@@ -2669,7 +2660,8 @@ def build_distributed_tensor(local_tensor, dist_attr):
             mesh = ProcessMesh(
                 np.array(dist_attr["process_group"]).reshape(
                     dist_attr["process_shape"]
-                )
+                ),
+                dim_names=dist_attr["dim_names"],
             )
             placements = to_placements(dist_attr["dims_mapping"], mesh)
             dist_tensor = dtensor_from_local(local_tensor, mesh, placements)
@@ -2693,7 +2685,25 @@ def build_distributed_tensor(local_tensor, dist_attr):
     def set_state_dict(self, state_dict: dict[str, Tensor]) -> None:
         local_state_dict = {}
         dist_main_program = self.dist_main_program(mode=self._engine._mode)
-        cur_state_dict = self.state_dict(split_fusion=False)
+        cur_state_dict = self.state_dict(
+            split_fusion=False, load_sharded_model=False
+        )
+
+        # For sharding with tensor-fusion, we need to convert the state_dict
+        # to include tensor-fusion parameters before calling set_state_dict,
+        # as stored parameters are processed as if tensor-fusion is not applied
+        if self._engine._optimizer is not None:
+            optimizer = self._engine._optimizer
+            if isinstance(
+                optimizer,
+                paddle.static.amp.decorator.OptimizerWithMixedPrecision,
+            ):
+                optimizer = optimizer._optimizer
+            if isinstance(optimizer, ShardingOptimizerStage1):
+                optimizer.convert_state_dict_with_tensor_fusion_param(
+                    state_dict
+                )
+
         for k, v in state_dict.items():
             assert v.is_dist(), f"key {k} value:{v} is not a dist tensor."
             if k in cur_state_dict:
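
For reference, below is a minimal, framework-free sketch of the placement-selection logic implemented by get_placement_with_sharding (removed above and now imported from .sharding). The Shard/Replicate classes and the pick_sharding_placement name are simplified stand-ins for illustration only, not the paddle.distributed placement types or the helper's real signature.

from dataclasses import dataclass


@dataclass
class Shard:
    dim: int

    def get_dim(self):
        return self.dim


class Replicate:
    pass


def pick_sharding_placement(placements, ndim, sharding_axis):
    # Find the tensor dim that is already sharded; the helper assumes at most one.
    shard_axis = -1
    for p in placements:
        if isinstance(p, Shard):
            assert shard_axis == -1, "parameter already sharded on more than one dim"
            shard_axis = p.get_dim()
    # Put the sharding mesh axis on the first tensor dim that is not already sharded.
    new_placements = list(placements)
    for dim in range(ndim):
        if dim != shard_axis:
            new_placements[sharding_axis] = Shard(dim)
            break
    return new_placements


# A 2-D parameter already sharded on dim 0 by mesh axis 0: the sharding mesh axis
# (axis 1) is placed on dim 1 instead of double-sharding dim 0.
print(pick_sharding_placement([Shard(0), Replicate()], ndim=2, sharding_axis=1))
# -> [Shard(dim=0), Shard(dim=1)]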