Commit a6b2236

add comment

1 parent: 3eaebbb

File tree: 2 files changed, +72 -67 lines changed

paddlenlp/quantization/checkpoint_quantization_utils.py

Lines changed: 59 additions & 33 deletions
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 import paddle


-# cal adam update ratio
+# cal part adam update ratio
 def cal_ratio(m, v, eps=1e-8):
     return 1 / (np.sqrt(v) + eps)

@@ -29,8 +29,8 @@ def group_wise_quant_dequant(
     quant_bits=4,
     group_size=32,
     quant=True,
-    rank=-1,
-    world_size=1,
+    tp_rank=-1,
+    tp_degree=1,
     use_pd=False,
     symmetry=False,
 ):
@@ -49,10 +49,10 @@ def group_wise_quant_dequant(
             Group size of group-wise quantization.
         quant (`bool`):
             True when quantization, False in dequantization.
-        rank (`int`):
-            Model parallel rank.
-        world_size (`int`):
-            Model parallel world size.
+        tp_rank (`int`):
+            Tensor parallel rank.
+        tp_degree (`int`):
+            Tensor parallel world size.
         use_pd (`bool`):
             Whether to use paddle caculation. If False will use numpy.
         symmetry (`bool`):
@@ -92,21 +92,28 @@ def group_wise_quant_dequant(
         else:
             new_scales = np.repeat(scales, repeats=group_size, axis=0)

-        if rank == -1:
+        if tp_rank == -1:
             dequant_tensor = inputs.astype("float32") * new_scales / bnt
         elif len(new_scales.shape) == 0 or inputs.shape[-1] == new_scales.shape[-1]:
+            # input tensor was row parallel in tp.
             dequant_tensor = (
                 inputs.astype("float32")
                 * new_scales[
-                    rank * new_scales.shape[0] // world_size : (rank + 1) * new_scales.shape[0] // world_size
+                    tp_rank * new_scales.shape[0] // tp_degree : (tp_rank + 1) * new_scales.shape[0] // tp_degree
                 ]
                 / bnt
             )
         else:
+            # input tensor was column parallel in tp.
             dequant_tensor = (
                 inputs.astype("float32")
                 * new_scales[
-                    :, rank * new_scales.shape[-1] // world_size : (rank + 1) * new_scales.shape[-1] // world_size
+                    :,
+                    tp_rank
+                    * new_scales.shape[-1]
+                    // tp_degree : (tp_rank + 1)
+                    * new_scales.shape[-1]
+                    // tp_degree,
                 ]
                 / bnt
             )
@@ -120,22 +127,28 @@ def group_wise_quant_dequant(
             new_scales = np.repeat(scales, repeats=group_size, axis=0)
             new_mins = np.repeat(mins, repeats=group_size, axis=0)

-        if rank == -1:
+        if tp_rank == -1:
             dequant_tensor = (inputs.astype("float32") / qmax * new_scales) + new_mins
         elif len(new_scales.shape) == 0 or inputs.shape[-1] == new_scales.shape[-1]:
+            # input tensor was row parallel in tp.
             dequant_tensor = (
                 inputs.astype("float32")
                 / qmax
-                * new_scales[rank * new_scales.shape[0] // world_size : (rank + 1) * new_scales.shape[0] // world_size]
-            ) + new_mins[rank * new_mins.shape[0] // world_size : (rank + 1) * new_mins.shape[0] // world_size]
+                * new_scales[
+                    tp_rank * new_scales.shape[0] // tp_degree : (tp_rank + 1) * new_scales.shape[0] // tp_degree
+                ]
+            ) + new_mins[tp_rank * new_mins.shape[0] // tp_degree : (tp_rank + 1) * new_mins.shape[0] // tp_degree]
         else:
+            # input tensor was column parallel in tp.
             dequant_tensor = (
                 inputs.astype("float32")
                 / qmax
                 * new_scales[
-                    :, rank * new_scales.shape[-1] // world_size : (rank + 1) * new_scales.shape[-1] // world_size
+                    :, tp_rank * new_scales.shape[-1] // tp_degree : (tp_rank + 1) * new_scales.shape[-1] // tp_degree
                 ]
-            ) + new_mins[:, rank * new_mins.shape[-1] // world_size : (rank + 1) * new_mins.shape[-1] // world_size]
+            ) + new_mins[
+                :, tp_rank * new_mins.shape[-1] // tp_degree : (tp_rank + 1) * new_mins.shape[-1] // tp_degree
+            ]
         return dequant_tensor

@@ -154,28 +167,29 @@ def split_int8(final):

     int4_high = np.where(int4_high > 8, int4_high - 16, int4_high)

-    high_tensor = paddle.Tensor(int4_high, zero_copy=True)
-    low_tensor = paddle.Tensor(int4_low, zero_copy=True)
+    high_tensor = paddle.Tensor(int4_high)
+    low_tensor = paddle.Tensor(int4_low)

     return high_tensor, low_tensor


 # channel-wise min max scales calculation
 def cal_abs_min_max_channel(inputs, quant_axis=1):
+    eps = 1e-8
     reduce_axis = tuple([i for i in range(len(inputs.shape)) if i != quant_axis])
     abs_max_values = np.max(inputs, axis=reduce_axis)
     abs_min_values = np.min(inputs, axis=reduce_axis)
     abs_max_values = np.where(
-        abs_max_values == np.array(0, dtype=inputs.dtype), np.array(1e-8, dtype=inputs.dtype), abs_max_values
+        abs_max_values == np.array(0, dtype=inputs.dtype), np.array(eps, dtype=inputs.dtype), abs_max_values
     )
     abs_min_values = np.where(
-        abs_min_values == np.array(0, dtype=inputs.dtype), np.array(1e-8, dtype=inputs.dtype), abs_min_values
+        abs_min_values == np.array(0, dtype=inputs.dtype), np.array(eps, dtype=inputs.dtype), abs_min_values
     )
     return abs_max_values, abs_min_values


 def asymmetry_qdq_weight(
-    x, quant_bit=8, quant_axis=-1, mins=None, maxs=None, dequant=False, rank=-1, world_size=1, use_pd=False
+    x, quant_bit=8, quant_axis=-1, mins=None, maxs=None, dequant=False, tp_rank=-1, tp_degree=1, use_pd=False
 ):
     """
     channel-wise asymmetry quantization
@@ -192,9 +206,9 @@ def asymmetry_qdq_weight(
             Max scales tensor in asymmetry quantization.
         dequant (`bool`):
             True when dequantization, False in quantization.
-        rank (`int`):
+        tp_rank (`int`):
             Model parallel rank.
-        world_size (`int`):
+        tp_degree (`int`):
             Model parallel world size.
         use_pd (`bool`):
             Whether to use paddle caculation. If False will use numpy.
@@ -213,39 +227,47 @@ def asymmetry_qdq_weight(
         # dequant
         if not use_pd:
             if len(scales.shape) == 0 or quant_x.shape[-1] == scales.shape[-1]:
+                # input tensor was row parallel in tp.
                 qdq_x = (quant_x / bnt * scales) + mins
             else:
+                # input tensor was column parallel in tp.
                 qdq_x = (
                     quant_x
                     / bnt
-                    * scales[rank * scales.shape[0] // world_size : (rank + 1) * scales.shape[0] // world_size]
-                ) + mins[rank * mins.shape[0] // world_size : (rank + 1) * mins.shape[0] // world_size]
+                    * scales[tp_rank * scales.shape[0] // tp_degree : (tp_rank + 1) * scales.shape[0] // tp_degree]
+                ) + mins[tp_rank * mins.shape[0] // tp_degree : (tp_rank + 1) * mins.shape[0] // tp_degree]
             return qdq_x.astype(np.float32), scales
         else:
             if len(scales.shape) == 0 or quant_x.shape[-1] == scales.shape[-1]:
+                # input tensor was row parallel in tp.
                 qdq_x = (quant_x / bnt * scales.unsqueeze(0).expand(quant_x.shape)) + mins
             else:
+                # input tensor was column parallel in tp.
                 qdq_x = (
                     quant_x
                     / bnt
-                    * scales[rank * scales.shape[0] // world_size : (rank + 1) * scales.shape[0] // world_size]
+                    * scales[tp_rank * scales.shape[0] // tp_degree : (tp_rank + 1) * scales.shape[0] // tp_degree]
                     .unsqueeze(0)
                     .expand(quant_x.shape)
-                ) + mins[rank * mins.shape[0] // world_size : (rank + 1) * mins.shape[0] // world_size]
+                ) + mins[tp_rank * mins.shape[0] // tp_degree : (tp_rank + 1) * mins.shape[0] // tp_degree]
             return qdq_x.astype(paddle.float32), scales


 # channel-wise abs max calculation
 def cal_abs_max_channel(inputs, quant_axis=1):
+    epsilon = 1e-8
     reduce_axis = tuple([i for i in range(len(inputs.shape)) if i != quant_axis])
     abs_max_values = np.max(np.abs(inputs), axis=reduce_axis)
+    # maybe all elements are zero in one group,
+    # so set the scales from those group to an actual number
+    # from divide 0.
     abs_max_values = np.where(
-        abs_max_values == np.array(0, dtype=inputs.dtype), np.array(1e-8, dtype=inputs.dtype), abs_max_values
+        abs_max_values == np.array(0, dtype=inputs.dtype), np.array(epsilon, dtype=inputs.dtype), abs_max_values
     )
     return abs_max_values


-def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False, rank=-1, world_size=1, use_pd=False):
+def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False, tp_rank=-1, tp_degree=1, use_pd=False):
     """
     channel-wise symmetry quantization
     Args:
@@ -259,9 +281,9 @@ def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False, rank=-
             Abs max scales tensor in symmetry quantization.
         dequant (`bool`):
             True when dequantization, False in quantization.
-        rank (`int`):
+        tp_rank (`int`):
             Model parallel rank.
-        world_size (`int`):
+        tp_degree (`int`):
             Model parallel world size.
         use_pd (`bool`):
             Whether to use paddle caculation. If False will use numpy.
@@ -279,23 +301,27 @@ def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False, rank=-
         # dequant
         if not use_pd:
             if len(scales.shape) == 0 or quant_x.shape[-1] == scales.shape[-1]:
+                # input tensor was row parallel in tp.
                 qdq_x = quant_x / bnt * scales
             else:
+                # input tensor was column parallel in tp.
                 qdq_x = (
                     quant_x
                     / bnt
-                    * scales[rank * scales.shape[0] // world_size : (rank + 1) * scales.shape[0] // world_size]
+                    * scales[tp_rank * scales.shape[0] // tp_degree : (tp_rank + 1) * scales.shape[0] // tp_degree]
                 )
             # fp32 , int8, int, fp32 or fp64
             return qdq_x.astype(np.float32), scales
         else:
             if len(scales.shape) == 0 or quant_x.shape[-1] == scales.shape[-1]:
+                # input tensor was row parallel in tp.
                 qdq_x = quant_x / bnt * scales.unsqueeze(0).expand(quant_x.shape)
             else:
+                # input tensor was column parallel in tp.
                 qdq_x = (
                     quant_x
                     / bnt
-                    * scales[rank * scales.shape[0] // world_size : (rank + 1) * scales.shape[0] // world_size]
+                    * scales[tp_rank * scales.shape[0] // tp_degree : (tp_rank + 1) * scales.shape[0] // tp_degree]
                     .unsqueeze(0)
                     .expand(quant_x.shape)
                 )
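
The renamed tp_rank / tp_degree arguments only matter on the dequantization path, where a tensor-parallel shard has to pick out its own slice of the channel-wise scales. Below is a minimal illustrative sketch of that flow; it is not taken from the diff, and the shapes, variable names, and the assumption of numpy inputs with use_pd=False are mine:

import numpy as np

from paddlenlp.quantization.checkpoint_quantization_utils import qdq_weight

# Hypothetical full (unsharded) 2-D weight; the shape is illustrative only.
w = np.random.randn(64, 32).astype("float32")

# Quantize the full tensor: returns the quantized values plus per-channel scales.
quant_w, scales = qdq_weight(w, quant_bit=8)

# Dequantize the shard held by tensor-parallel rank 0 of a 2-way group.
# The shard's last dim no longer matches the scales (the column-parallel case),
# so qdq_weight applies only rank 0's slice of the scales.
shard = quant_w[:, :16]
shard_fp32, _ = qdq_weight(shard, quant_bit=8, scales=scales, dequant=True, tp_rank=0, tp_degree=2)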

paddlenlp/quantization/unified_checkpoint_quantization.py

Lines changed: 13 additions & 34 deletions
@@ -96,11 +96,11 @@ def dequant_unified_optimizer(state_dict, ckpt_quant_stage, scale_dict):
             ratio_min_scale_key = quant_key + ASYMMETRY_QUANT_SCALE_MIN
             ratio_max_scale_key = quant_key + ASYMMETRY_QUANT_SCALE_MAX
             m1_scale_key = quant_key[: -len(MOMENT2_KEYNAME)] + MOMENT1_KEYNAME + SYMMETRY_QUANT_SCALE
-            m1_codebook = scale_dict[m1_scale_key]
+            m1_scales = scale_dict[m1_scale_key]
             ratio_mins, ratio_maxs = scale_dict[ratio_min_scale_key], scale_dict[ratio_max_scale_key]
             m1_weight = group_wise_quant_dequant(
                 m1_quant,
-                mins=m1_codebook,
+                mins=m1_scales,
                 maxs=None,
                 quant_bits=4,
                 quant=False,
@@ -134,16 +134,11 @@ def quant_unified_optimizer(state_dict, state_dict_type, ckpt_quant_stage, async
     quant = True
     del_key = []
     if quant and state_dict_type == "optimizer_weight":
-        codebook_dict = {}
+        scales_dict = {}
         opt_keys = state_dict.keys()
-        if not async_save:
-            all_bits, quant_bits = paddle.to_tensor(0.0), paddle.to_tensor(0.0)
         for k in opt_keys:
             momentum1 = k.endswith(MOMENT1_KEYNAME)
             momentum2 = k.endswith(MOMENT2_KEYNAME)
-            k_size = state_dict[k].size
-            if not async_save and (momentum1 or momentum2):
-                all_bits += k_size * 4

             quant_weight = None

@@ -153,28 +148,29 @@ def quant_unified_optimizer(state_dict, state_dict_type, ckpt_quant_stage, async
                     # m1: m1_quant_weight, m2: ratio
                     m1_key = k.split("/")[0] + "/" + MOMENT1_KEYNAME
                     ratio = cal_ratio(state_dict[m1_key], state_dict[k])
-                    m1_quant, codebook = qdq_weight(state_dict[m1_key], quant_bit=8)
+                    m1_quant, scales = qdq_weight(state_dict[m1_key], quant_bit=8)
                     quant_weight, mins, maxs = asymmetry_qdq_weight(ratio, quant_bit=8)
                     state_dict[m1_key] = m1_quant
-                    codebook_dict[m1_key + SYMMETRY_QUANT_SCALE] = codebook
-                    codebook_dict[k + ASYMMETRY_QUANT_SCALE_MIN] = mins
-                    codebook_dict[k + ASYMMETRY_QUANT_SCALE_MAX] = maxs
+                    scales_dict[m1_key + SYMMETRY_QUANT_SCALE] = scales
+                    scales_dict[k + ASYMMETRY_QUANT_SCALE_MIN] = mins
+                    scales_dict[k + ASYMMETRY_QUANT_SCALE_MAX] = maxs
                 elif not momentum1:
                     quant_weight = state_dict[k]
             elif ckpt_quant_stage == "O2":
                 # m1: bw-wint4, 1/(sqrt(m2)+eps): bw-wint4
                 if momentum2:
+                    # skip norm-like parameters
                     if len(state_dict[k].shape) < 2:
                         continue
                     # m1: m1_quant_weight, m2: ratio
                     m1_key = k.split("/")[0] + "/" + MOMENT1_KEYNAME
                     ratio = cal_ratio(state_dict[m1_key], state_dict[k])
-                    m1_quant, m1_codebook = group_wise_quant_dequant(state_dict[m1_key], quant_bits=4, symmetry=True)
+                    m1_quant, m1_scales = group_wise_quant_dequant(state_dict[m1_key], quant_bits=4, symmetry=True)
                     quant_weight, r_mins, r_maxs = group_wise_quant_dequant(ratio, quant_bits=4)
                     quant_weight = merge_int4(m1_quant, quant_weight)
-                    codebook_dict[m1_key + SYMMETRY_QUANT_SCALE] = m1_codebook
-                    codebook_dict[k + ASYMMETRY_QUANT_SCALE_MIN] = r_mins
-                    codebook_dict[k + ASYMMETRY_QUANT_SCALE_MAX] = r_maxs
+                    scales_dict[m1_key + SYMMETRY_QUANT_SCALE] = m1_scales
+                    scales_dict[k + ASYMMETRY_QUANT_SCALE_MIN] = r_mins
+                    scales_dict[k + ASYMMETRY_QUANT_SCALE_MAX] = r_maxs
                     del_key.append(m1_key)
                 elif not momentum1:
                     quant_weight = state_dict[k]
@@ -185,23 +181,6 @@ def quant_unified_optimizer(state_dict, state_dict_type, ckpt_quant_stage, async
        for k in del_key:
            state_dict.pop(k, None)

-        state_dict.update(codebook_dict)
-
-        if not async_save:
-            if paddle.distributed.get_world_size() > 1:
-                dist.all_reduce(all_bits)
-                dist.all_reduce(quant_bits)
-
-            model_numel = all_bits / 4
-            all_bits = model_numel * 7.0
-            quant_bits_mw = quant_bits + model_numel * 6.0
-            quant_bits = quant_bits + model_numel * 2.0
-            logger.info(
-                f"all bits: {all_bits.item()}, quant bits: {quant_bits.item()}, quant bits mw: {quant_bits_mw.item()}"
-            )
-            logger.info(f"quant ratio (w/o Master Weight): {(all_bits.item() - quant_bits.item()) / all_bits.item()}")
-            logger.info(
-                f"quant ratio (w/ Master Weight): {(all_bits.item() - quant_bits_mw.item()) / all_bits.item()}"
-            )
+        state_dict.update(scales_dict)

    return state_dict