 )
 from huggingface_hub.utils import EntryNotFoundError
 from paddle import Tensor
-from paddle.distributed import fleet
 from paddle.distributed.fleet.meta_parallel.parallel_layers import (
     PipelineLayer,
     SharedLayerDesc,
@@ -59,8 +58,6 @@
     ASYMMETRY_QUANT_SCALE_MIN,
     CONFIG_NAME,
     LEGACY_CONFIG_NAME,
-    MOMENT1_KEYNAME,
-    MOMENT2_KEYNAME,
     PADDLE_WEIGHTS_INDEX_NAME,
     PADDLE_WEIGHTS_NAME,
     PYTORCH_WEIGHTS_INDEX_NAME,
@@ -74,12 +71,7 @@
 from paddlenlp.utils.log import logger
 
 from ..generation import GenerationConfig, GenerationMixin
-from ..quantization.checkpoint_quantization_utils import (
-    asymmetry_qdq_weight,
-    group_wise_quant_dequant,
-    qdq_weight,
-    split_int8,
-)
+from ..quantization.unified_checkpoint_quantization import dequant_unified_optimizer
 from ..utils import device_guard
 from ..utils.download import resolve_file_path
 from .configuration_utils import PretrainedConfig
@@ -332,100 +324,6 @@ def get_parameter_dtype(parameter: nn.Layer) -> paddle.dtype:
     return last_dtype
 
 
-def dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict):
-    rank, world_size = -1, 1
-    if paddle.distributed.get_world_size() > 1:
-        hcg = fleet.get_hybrid_communicate_group()
-        tp_group = hcg.get_model_parallel_group()
-        rank, world_size = tp_group.rank, tp_group.nranks
-
-    if ckpt_quant_stage == "O1":
-        # set eps
-        eps = 1e-8
-        for quant_key in state_dict.keys():
-            is_moment1 = MOMENT1_KEYNAME in quant_key
-            is_moment2 = MOMENT2_KEYNAME in quant_key
-            if is_moment1:
-                # dequant m1
-                scale_key = quant_key + SYMMETRY_QUANT_SCALE
-                weight = state_dict[quant_key]
-                scales = scale_dict[scale_key]
-                weight, _ = qdq_weight(
-                    weight,
-                    scales=scales,
-                    quant_bit=8,
-                    dequant=True,
-                    rank=rank,
-                    world_size=world_size,
-                    use_pd=True,
-                )
-                state_dict[quant_key] = weight
-            elif is_moment2:
-                # dequant ratio
-                weight = state_dict[quant_key]
-                min_scale_key = quant_key + ASYMMETRY_QUANT_SCALE_MIN
-                max_scale_key = quant_key + ASYMMETRY_QUANT_SCALE_MAX
-                mins, maxs = scale_dict[min_scale_key], scale_dict[max_scale_key]
-                weight, _ = asymmetry_qdq_weight(
-                    weight,
-                    mins=mins,
-                    maxs=maxs,
-                    quant_bit=8,
-                    dequant=True,
-                    rank=rank,
-                    world_size=world_size,
-                    use_pd=True,
-                )
-                # cal m2
-                weight = paddle.square(1.0 / weight - eps)
-                state_dict[quant_key] = weight
-    elif ckpt_quant_stage == "O2":
-        # set eps
-        eps = 1e-8
-        m1_state_dict = {}
-        for quant_key in state_dict.keys():
-            if state_dict[quant_key].dtype != paddle.int8:
-                logger.info(f"{quant_key} skip.")
-                continue
-            # split int8
-            weight = state_dict[quant_key]
-            m1_quant, ratio_quant = split_int8(weight.numpy())
-            # dequant ratio
-            ratio_min_scale_key = quant_key + ASYMMETRY_QUANT_SCALE_MIN
-            ratio_max_scale_key = quant_key + ASYMMETRY_QUANT_SCALE_MAX
-            m1_scale_key = quant_key[: -len(MOMENT2_KEYNAME)] + MOMENT1_KEYNAME + SYMMETRY_QUANT_SCALE
-            m1_codebook = scale_dict[m1_scale_key]
-            ratio_mins, ratio_maxs = scale_dict[ratio_min_scale_key], scale_dict[ratio_max_scale_key]
-            m1_weight = group_wise_quant_dequant(
-                m1_quant,
-                mins=m1_codebook,
-                maxs=None,
-                quant_bits=4,
-                quant=False,
-                rank=rank,
-                world_size=world_size,
-                use_pd=True,
-                symmetry=True,
-            )
-            ratio_weight = group_wise_quant_dequant(
-                ratio_quant,
-                mins=ratio_mins,
-                maxs=ratio_maxs,
-                quant_bits=4,
-                quant=False,
-                rank=rank,
-                world_size=world_size,
-                use_pd=True,
-            )
-
-            ratio_weight = paddle.square(1.0 / ratio_weight - eps)
-            state_dict[quant_key] = ratio_weight
-            m1_state_dict[quant_key[: -len(MOMENT2_KEYNAME)] + MOMENT1_KEYNAME] = m1_weight
-        state_dict.update(m1_state_dict)
-
-    return state_dict
-
-
 def _split_keys_evenly(keys: list, n: int) -> list:
     """Split a list into n lists with an equal number of elements.
 
@@ -471,9 +369,18 @@ def _load_part_state_dict(
     scale_dict = {}
     with safe_open(checkpoint_file, framework="np") as f:
         for key in keys:
-            # non merge ckpt loading dont have filter key.
-            if key.endswith(SYMMETRY_QUANT_SCALE) or (fliter_dict_keys is not None and key not in fliter_dict_keys):
+            # 1. non-merge ckpt loading dont have filter key.
+            # 2. merge ckpt will skip quant scale by `fliter_dict_keys`
+            if (
+                key.endswith(SYMMETRY_QUANT_SCALE)
+                or key.endswith(ASYMMETRY_QUANT_SCALE_MIN)
+                or key.endswith(ASYMMETRY_QUANT_SCALE_MAX)
+            ):
                 continue
+
+            if fliter_dict_keys is not None and key not in fliter_dict_keys:
+                continue
+
             py_safe_slice_ = f.get_slice(key)
             if key in tensor_parallel_split_mapping:
                 weight = tensor_parallel_split_mapping[key](py_safe_slice_)
@@ -485,7 +392,11 @@ def _load_part_state_dict(
                 weight = weight._copy_to(paddle.framework._current_expected_place(), False)
             part_state_dict[key] = weight
         for key in keys:
-            if key.endswith(SYMMETRY_QUANT_SCALE):
+            if (
+                key.endswith(SYMMETRY_QUANT_SCALE)
+                or key.endswith(ASYMMETRY_QUANT_SCALE_MIN)
+                or key.endswith(ASYMMETRY_QUANT_SCALE_MAX)
+            ):
                 scale = f.get_tensor(key)
                 with device_guard():
                     scale = paddle.Tensor(scale, zero_copy=True)
@@ -504,9 +415,6 @@ def load_state_dict(
504415 """
505416 Reads a PaddlePaddle checkpoint file, returning properly formatted errors if they arise.
506417 """
507- quant = False
508- if ckpt_quant_stage != "O0" :
509- quant = "optimizer" in checkpoint_file
510418
511419 if tensor_parallel_split_mapping is None :
512420 tensor_parallel_split_mapping = {}
@@ -562,7 +470,9 @@ def load_state_dict(
                     with device_guard():
                         state_dict[k] = paddle.Tensor(state_dict.pop(k), zero_copy=True)
 
-            if quant:
+            if len(scale_dict) != 0:
+                if ckpt_quant_stage == "O0":
+                    raise ValueError('optimizer weight has quantization scales but `ckpt_quant_stage` is set to "O0"')
                 state_dict = dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict)
 
         return state_dict