Commit 6aa71b4

[Auto Parallel] fix sharding bug && automatically mark split points (#69426)
1 parent a8aca59 commit 6aa71b4

6 files changed, +134 -33 lines changed


python/paddle/distributed/auto_parallel/api.py

Lines changed: 1 addition & 0 deletions
@@ -1055,6 +1055,7 @@ def _set_and_check_sharding_prop_from_param(self):
             'dp' in self._shard_fn._mesh.dim_names
         ):
             self._sharding_degree = self._shard_fn._mesh.get_dim_size('dp')
+            self._sharding_mesh_axis = 0
         else:
             param_list = self._inner_opt._parameter_list
             for param in param_list:
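
The added line records which mesh axis the 'dp' dimension occupies, so that downstream sharding logic knows which axis to shard along. Below is a minimal sketch of the two values this branch resolves, assuming an illustrative 2x2 ProcessMesh with dim_names ["dp", "mp"] (the mesh shape and names are not taken from the commit):

import paddle.distributed as dist

# Illustrative 2x2 mesh: 'dp' is the first dimension, 'mp' the second.
mesh = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=["dp", "mp"])

if "dp" in mesh.dim_names:
    sharding_degree = mesh.get_dim_size("dp")  # 2: number of ranks along 'dp'
    sharding_mesh_axis = 0                     # mirrors the axis hard-coded by the fix

Here the sharding degree (2) and the mesh axis (0) are exactly the two attributes the optimizer wrapper stores; before this commit only the degree was set in this branch.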

python/paddle/distributed/auto_parallel/intermediate/pipeline_parallel.py

Lines changed: 43 additions & 2 deletions
@@ -13,14 +13,18 @@
 # limitations under the License.
 
 import itertools
+import re
 from collections import OrderedDict
 from enum import Enum
 
 import paddle.distributed as dist
 from paddle.distributed import fleet
+from paddle.distributed.utils.log_utils import get_logger
 
 from .parallel_base import ParallelModel, ParallelOptimizer, is_tensor
 
+logger = get_logger("INFO", __name__)
+
 
 class SplitPoint(Enum):
     BEGINNING = 0
@@ -137,7 +141,11 @@ def pipeline_parallel(model, optimizer, split_spec, mesh=None, dimension=None):
     Args:
         model (paddle.nn.Layer): A single card model to be distributed
         optimizer (paddle.optimizer.Optimizer): An optimizer to be distributed
-        split_spec (OrderedDict): Pipeline parallel split point, the order of the keys is the order of the pipeline stage
+        split_spec (OrderedDict|dict|str): The pipeline parallel split points.
+            If split_spec is a string such as "llama.layer", the sublayers whose names share that prefix are divided equally across stages according to the pipeline degree.
+            If split_spec is an OrderedDict or dict, each key is a layer name and each value is the split position, which can be SplitPoint.BEGINNING or SplitPoint.END.
+            NOTE: dict preserves insertion order since Python 3.7, so a plain dict can be used here as well.
+            The order of the keys is the order of the pipeline stages.
         mesh (ProcessMesh): A ProcessMesh Object.
         dimension (int|str): The mesh dimension to pipeline the model.
 
@@ -158,7 +166,40 @@ def pipeline_parallel(model, optimizer, split_spec, mesh=None, dimension=None):
             "Specifying a custom mesh is not supported currently"
         )
 
-    model = PipelineParallel(model, split_spec)
+    if isinstance(split_spec, str):
+        # Match layer names made of split_spec followed by a dot and digits and nothing else,
+        # e.g. with split_spec = "llama.layer", "llama.layer.0" matches but "llama.layer.0.mlp" does not.
+        pattern = rf"{split_spec}\.\d+$"
+        matched_layer_name = [
+            name
+            for name, _ in model.named_sublayers()
+            if re.match(pattern, name)
+        ]
+
+        pp_size = mesh.get_dim_size("pp")
+        layer_num = len(matched_layer_name)
+        assert (
+            layer_num > 0
+        ), "No layer matches the split_spec, please check its correctness"
+        assert (
+            layer_num % pp_size == 0
+        ), f"The number of layers must be divisible by the pp size, but got {layer_num} and {pp_size}"
+        layers_per_rank = layer_num // pp_size
+        split_spec_dict = OrderedDict(
+            [
+                (
+                    f"{split_spec}.{i * layers_per_rank - 1}",
+                    SplitPoint.END,
+                )
+                for i in range(1, pp_size)
+            ]
+        )
+    else:
+        split_spec_dict = split_spec
+
+    logger.info(f"split_spec_dict: {split_spec_dict}")
+
+    model = PipelineParallel(model, split_spec_dict)
     if optimizer is not None:
         optimizer = ParallelOptimizer(optimizer)
 
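
To make the new string form of split_spec concrete, here is a small, self-contained sketch of the derivation above, run on hand-written layer names instead of model.named_sublayers() and with an assumed pipeline degree of 2 (both are illustrative; the real code reads the degree from mesh.get_dim_size("pp")):

import re
from collections import OrderedDict

from paddle.distributed.auto_parallel.intermediate.pipeline_parallel import SplitPoint

split_spec = "llama.layers"
pp_size = 2  # assumed pipeline degree

# Stand-ins for the names returned by model.named_sublayers(); only
# "<prefix>.<index>" names match, so "llama.layers.0.mlp" is excluded.
layer_names = [
    "llama.layers.0",
    "llama.layers.0.mlp",
    "llama.layers.1",
    "llama.layers.2",
    "llama.layers.3",
]

pattern = rf"{split_spec}\.\d+$"
matched = [name for name in layer_names if re.match(pattern, name)]  # 4 decoder layers

layers_per_rank = len(matched) // pp_size  # 2 layers per pipeline stage
split_spec_dict = OrderedDict(
    (f"{split_spec}.{i * layers_per_rank - 1}", SplitPoint.END)
    for i in range(1, pp_size)
)
print(split_spec_dict)  # one split point: stage 0 ends after "llama.layers.1"

With four matched decoder layers and a pipeline degree of 2, the model is cut after llama.layers.1, so layers 0-1 form stage 0 and layers 2-3 form stage 1.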

test/auto_parallel/hybrid_strategy/parallel_api.py

Lines changed: 10 additions & 15 deletions
@@ -14,7 +14,6 @@
 import logging
 import os
 import random
-from collections import OrderedDict
 from functools import reduce
 
 import numpy as np
@@ -26,9 +25,6 @@
 from paddle.distributed.auto_parallel.intermediate.parallelize import (
     parallelize,
 )
-from paddle.distributed.auto_parallel.intermediate.pipeline_parallel import (
-    SplitPoint,
-)
 from paddle.distributed.auto_parallel.intermediate.tensor_parallel import (
     ColWiseParallel,
     RowWiseParallel,
@@ -145,6 +141,10 @@ def __init__(self):
         if os.getenv("prepare_input_output") == "true":
             self.sequence_parallel = True
 
+        num_hidden_layers = os.getenv("num_hidden_layers")
+        if num_hidden_layers:
+            self.config.num_hidden_layers = int(num_hidden_layers)
+
         seed = int(os.getenv("seed", 2024))
         np.random.seed(seed)
         random.seed(seed)
@@ -204,17 +204,12 @@ def parallel_model(self, layer, optimizer=None):
         mp_config = None
         pp_config = None
         if self.pp > 1:
-            decoders_per_rank = self.config.num_hidden_layers // self.pp
-            split_spec = OrderedDict(
-                [
-                    (
-                        f"llama.layers.{i * decoders_per_rank - 1}",
-                        SplitPoint.END,
-                    )
-                    for i in range(1, self.pp)
-                ]
-            )
-            pp_config = {'split_spec': split_spec}
+            # decoders_per_rank = self.config.num_hidden_layers // self.pp
+            # split_spec = {
+            #     f"llama.layers.{i * decoders_per_rank - 1}": SplitPoint.END
+            #     for i in range(1, self.pp)
+            # }
+            pp_config = {'split_spec': "llama.layers"}
         if self.dp > 1:
             dp_config = {'sharding_level': self.level}
         if self.mp > 1:
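
For the configuration this test used to build by hand, the new string form is equivalent to the explicit spec in the commented-out lines. A hedged illustration with example values num_hidden_layers=4 and pp=2 (matching the num_hidden_layers env added above):

from paddle.distributed.auto_parallel.intermediate.pipeline_parallel import SplitPoint

num_hidden_layers, pp = 4, 2  # example values
decoders_per_rank = num_hidden_layers // pp

# Old style: spell out every split point explicitly.
pp_config = {
    'split_spec': {
        f"llama.layers.{i * decoders_per_rank - 1}": SplitPoint.END
        for i in range(1, pp)
    }  # == {"llama.layers.1": SplitPoint.END}
}

# New style: pass the prefix and let pipeline_parallel derive the same split points.
pp_config = {'split_spec': "llama.layers"}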

test/auto_parallel/hybrid_strategy/test_parallel_api_with_llama_1d.py

Lines changed: 6 additions & 15 deletions
@@ -33,15 +33,9 @@ def setUp(self):
             "backend": ["gpu"],
             "amp": ["true"],
             "amp_level": ["O2"],
-            "amp_dtype": [
-                "bfloat16",
-            ],
-            "amp_master_grad": [
-                "False",
-            ],
-            "sharding_stage": [
-                "1",
-            ],
+            "amp_dtype": ["bfloat16"],
+            "amp_master_grad": ["False"],
+            "sharding_stage": ["0", "1"],
         }
 
     def test_simple_net_dp2(self):
@@ -73,12 +67,9 @@ def setUp(self):
             "backend": ["gpu"],
             "amp": ["true"],
             "amp_level": ["O2"],
-            "amp_dtype": [
-                "bfloat16",
-            ],
-            "amp_master_grad": [
-                "False",
-            ],
+            "amp_dtype": ["bfloat16"],
+            "amp_master_grad": ["False"],
+            "num_hidden_layers": ["2", "4"],
         }
 
     def test_simple_net_pp2(self):

test/auto_parallel/hybrid_strategy/test_parallel_api_with_llama_2d.py

Lines changed: 73 additions & 0 deletions
@@ -54,5 +54,78 @@ def test_simple_net_mp2_pp2(self):
             ckpt_path.cleanup()
 
 
+class TestDPPPAPI(test_base.CommunicationTestDistBase):
+    def setUp(self):
+        super().setUp(num_of_devices=4, timeout=120, nnode=1)
+        self._default_envs = {
+            "dtype": "float32",
+            "seed": "2023",
+            "dp": "2",
+            "mp": "1",
+            "pp": "2",
+            "acc_step": "2",
+        }
+        self._changeable_envs = {
+            "backend": ["gpu"],
+            "amp": ["true"],
+            "amp_level": ["O2"],
+            "amp_dtype": ["bfloat16"],
+            "amp_master_grad": ["true"],
+            "use_lazy_init": ["true"],
+            "num_hidden_layers": ["2", "4"],
+            "sharding_stage": ["0"],
+        }
+
+    def test_simple_net_dp2_pp2(self):
+        envs_list = test_base.gen_product_envs_list(
+            self._default_envs, self._changeable_envs
+        )
+        for envs in envs_list:
+            ckpt_path = tempfile.TemporaryDirectory()
+            envs["ckpt_path"] = ckpt_path.name
+            self.run_test_case(
+                "parallel_api.py",
+                user_defined_envs=envs,
+            )
+            ckpt_path.cleanup()
+
+
+class TestDPMPAPI(test_base.CommunicationTestDistBase):
+    def setUp(self):
+        super().setUp(num_of_devices=4, timeout=120, nnode=1)
+        self._default_envs = {
+            "dtype": "float32",
+            "seed": "2023",
+            "dp": "2",
+            "mp": "2",
+            "pp": "1",
+            "acc_step": "2",
+        }
+        self._changeable_envs = {
+            "backend": ["gpu"],
+            "amp": ["true"],
+            "amp_level": ["O2"],
+            "amp_dtype": ["bfloat16"],
+            "amp_master_grad": ["true"],
+            "use_lazy_init": ["true"],
+            "sequence_parallel": ["true"],
+            "prepare_input_output": ["false"],
+            "sharding_stage": ["0"],
+        }
+
+    def test_simple_net_mp2_pp2(self):
+        envs_list = test_base.gen_product_envs_list(
+            self._default_envs, self._changeable_envs
+        )
+        for envs in envs_list:
+            ckpt_path = tempfile.TemporaryDirectory()
+            envs["ckpt_path"] = ckpt_path.name
+            self.run_test_case(
+                "parallel_api.py",
+                user_defined_envs=envs,
+            )
+            ckpt_path.cleanup()
+
+
 if __name__ == "__main__":
     unittest.main()
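
The _changeable_envs matrices above presumably expand into one test run per combination of values via test_base.gen_product_envs_list. A standalone sketch of that expansion using itertools.product (an assumption about its behavior, not the actual test_base implementation):

import itertools

def product_envs(default_envs, changeable_envs):
    # Expand {"num_hidden_layers": ["2", "4"], ...} into one env dict per
    # combination, merged on top of the defaults.
    keys = list(changeable_envs)
    for values in itertools.product(*(changeable_envs[k] for k in keys)):
        envs = dict(default_envs)
        envs.update(zip(keys, values))
        yield envs

# In TestDPPPAPI only num_hidden_layers has two values, so the matrix expands
# into two runs: one with num_hidden_layers=2 and one with num_hidden_layers=4.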

test/auto_parallel/hybrid_strategy/test_parallel_api_with_llama_3d.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def setUp(self):
             "use_lazy_init": ["true"],
             "sequence_parallel": ["true"],
             "prepare_input_output": ["false"],
-            "sharding_stage": ["0"],
+            "sharding_stage": ["0", "1"],
         }
 
     def test_simple_net_dp2_mp2_pp2(self):
