GPTQ Activation Ordering #94
Merged · 64 commits
012138a actorder (horheynm)
f88c84e g_idx fix (horheynm)
3211fe1 fix (horheynm)
bbbf564 lint (horheynm)
8d29f0d propagate g_idx with perm (horheynm)
89224e9 scratch (horheynm)
cb8446d GPTQ - move calibration of quantization params to after hessian calibr…
d7029a0 no recompute (horheynm)
eeff533 clean up (horheynm)
842b150 remove unwanted code (horheynm)
240c39d draft (horheynm)
820d08a draft (horheynm)
564845e draft (horheynm)
6f54737 mimic gptq (horheynm)
2cc99bb permutation seems to be working (kylesayrs)
6fe537d WIP: fails on non-square weights (kylesayrs)
6611073 pass perm into quant params calculation (kylesayrs)
9077969 works on vllm and loading with identity permutation (kylesayrs)
6a1565e WIP: working pytorch with actorder (kylesayrs)
1940df4 able to inference with script and reload, needed to set (kylesayrs)
11beac1 remove testing comments (kylesayrs)
9456698 remove scripts (kylesayrs)
0c773e6 remove dregs (kylesayrs)
b6bebc2 merge actorder and group cases (kylesayrs)
3bde194 code structuring and cleanup (kylesayrs)
758c495 use `refresh_layer_weight_quant_params` (kylesayrs)
85fb1ff update_layer_weight_quant_params reuse (kylesayrs)
5b52e9d deep copy H to allow for future reuse (kylesayrs)
9e2cef9 hoist group_size (kylesayrs)
e725cc7 remove footer note (kylesayrs)
2392b83 apply style (kylesayrs)
a5a30e1 fix rebase dregs (kylesayrs)
ca6fc6e remove extra line (kylesayrs)
6f99634 move lines for better grouping (kylesayrs)
b726bd6 move for better diff (kylesayrs)
2002761 remove extra lines (kylesayrs)
0ef0c5b use getattr to avoid pr dep (kylesayrs)
476aed0 Revert "use getattr to avoid pr dep" (kylesayrs)
ffb809c add actorder to docstring (kylesayrs)
edc02d4 Merge remote-tracking branch 'origin' into kylesayrs/activation-ordering (kylesayrs)
bc49946 do not clone hessian (kylesayrs)
99f2286 apply style (kylesayrs)
48b36c2 avoid unset g_idx parameter by observing directly (kylesayrs)
9550f14 use update_layer_weight_quant_params (kylesayrs)
d22ff2e Merge remote-tracking branch 'origin/main' into kylesayrs/activation-… (kylesayrs)
72d919f Merge branch 'main' into kylesayrs/activation-ordering (kylesayrs)
e4d37a6 indent for when quantization_scheme is missing (kylesayrs)
cdc8bcd add actorder e2e test (kylesayrs)
1fe188b do not freeze if initialized from gptq (kylesayrs)
b06a103 add get_attr_chain helper function (kylesayrs)
f293efd cleanup and clarify logic (kylesayrs)
a99e0da apply style (kylesayrs)
bf915d4 rename to getattr_chain, handle no default case (kylesayrs)
66ef96b out of place type conversion (kylesayrs)
98aaf88 Merge remote-tracking branch 'origin/gptq-cleanup' into kylesayrs/act… (kylesayrs)
91c877a account for extra case (kylesayrs)
b711e14 remove freeze_quantization argument (kylesayrs)
974dbc7 remove fake_quantization case, update debug message (kylesayrs)
094e429 remove todo (kylesayrs)
582c179 Merge remote-tracking branch 'origin/gptq-cleanup' into kylesayrs/act… (kylesayrs)
febb741 correct name (kylesayrs)
83a1d93 Merge remote-tracking branch 'origin/main' into kylesayrs/activation-… (kylesayrs)
a1646e5 Merge remote-tracking branch 'origin/main' into kylesayrs/activation-… (kylesayrs)
eef6bab change to false in docstring (kylesayrs)
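The commit trail above converges on GPTQ activation ordering: before quantizing, weight columns are permuted by decreasing Hessian diagonal so the most activation-sensitive columns are quantized first, and a `g_idx` tensor records each original column's quantization group so scales and zero-points resolve correctly at inference. A minimal sketch of that bookkeeping, assuming a PyTorch-style calibration Hessian (an illustration, not the PR's code):

```python
import torch

def actorder_permutation(H: torch.Tensor) -> torch.Tensor:
    # H is the (in_features x in_features) Hessian accumulated from
    # calibration activations; its diagonal estimates column importance.
    return torch.argsort(torch.diag(H), descending=True)

def make_g_idx(perm: torch.Tensor, group_size: int) -> torch.Tensor:
    # g_idx[j] = quantization group of original column j. Column perm[i]
    # sits at position i after permutation, so its group is i // group_size.
    num_cols = perm.numel()
    g_idx = torch.empty(num_cols, dtype=torch.long)
    g_idx[perm] = torch.arange(num_cols) // group_size
    return g_idx

# Sketch of use: permute W and H, quantize group-wise in permuted order,
# then un-permute the quantized weights and persist g_idx alongside them:
#   W_perm = W[:, perm]; H_perm = H[perm][:, perm]
#   W_q = quantize_groupwise(W_perm)[:, torch.argsort(perm)]
```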
5 changes: 5 additions & 0 deletions
tests/llmcompressor/transformers/compression/configs/actorder_1.1b.yaml
@@ -0,0 +1,5 @@
cadence: "nightly"
test_type: "regression"
model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_actorder.yaml"
ppl_threshold: 20
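This nightly regression config compresses the TinyLlama stub with the recipe below and gates the result on perplexity staying within `ppl_threshold`. A hypothetical sketch of how such a config could be consumed (not the repo's actual test harness):

```python
import yaml

with open(
    "tests/llmcompressor/transformers/compression/configs/actorder_1.1b.yaml"
) as f:
    cfg = yaml.safe_load(f)

# Apply cfg["new_recipe"] to cfg["model_stub"], measure perplexity,
# then gate the test on the configured ceiling:
# assert measured_ppl <= cfg["ppl_threshold"]  # here, 20
```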
19 changes: 19 additions & 0 deletions
tests/llmcompressor/transformers/compression/recipes/new_quant_actorder.yaml
@@ -0,0 +1,19 @@
test_stage:
  quant_modifiers:
    QuantizationModifier:
      ignore: ["lm_head", "model.layers.0.mlp.down_proj"]
      config_groups:
        group_0:
          weights:
            num_bits: 4
            type: "int"
            symmetric: False
            strategy: "group"
            group_size: 128
            actorder: True
          input_activations: null
          output_activations: null
          targets: ["Linear"]
    GPTQModifier:
      block_size: 128
      sequential_update: False
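The recipe quantizes Linear weights to 4-bit asymmetric ints in groups of 128 with activation ordering enabled, then applies GPTQ error compensation. A hedged usage sketch with llm-compressor's one-shot entrypoint; the dataset name, sample count, sequence length, and output path are placeholders, not values from this PR:

```python
from llmcompressor.transformers import oneshot

oneshot(
    model="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    dataset="open_platypus",  # placeholder calibration dataset
    recipe="tests/llmcompressor/transformers/compression/recipes/new_quant_actorder.yaml",
    max_seq_length=512,  # placeholder
    num_calibration_samples=64,  # placeholder
    output_dir="./tinyllama-w4-actorder",
)
```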