
Commit ee44252

kylesayrs and dsikka committed
GPTQ: Depreciate non-sequential update option (#762)
* remove from gptq, apply style
* remove instances of sequential_update argument in GPTQ tests
* update examples
* update example tests
* documentation, remove from example
* apply style
* revert back to auto type
* apply style

Co-authored-by: Dipika Sikka <[email protected]>
Signed-off-by: Kyle Sayers <[email protected]>
1 parent e47bfa8 commit ee44252

28 files changed: +44, -88 lines changed

examples/big_models_with_accelerate/README.md

Lines changed: 3 additions & 8 deletions
@@ -29,8 +29,8 @@ will work properly out of the box for basic quantization with `QuantizationModif
 even for CPU offloaded models.
 
 To enable CPU offloading for second-order quantization methods such as GPTQ, we need to
-allocate additional memory upfront when computing the device map. Note that this
-device map will only compatible with `GPTQModifier(sequential_update=True, ...)`
+allocate additional memory upfront when computing the device map. Not doing so risks
+potentially going out-of-memory.
 
 ```python
 from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
@@ -48,12 +48,7 @@ model = SparseAutoModelForCausalLM.from_pretrained(
 
 ### Practical Advice
 
-When working with `accelerate`, it is important to keep in mind that CPU offloading and naive pipeline-parallelism will slow down forward passes through the model. As a result, we need to take care to ensure that the quantization methods used fit well with the offloading scheme as methods that require many forward passes though the model will be slowed down.
-
-General rules of thumb:
-- CPU offloading is best used with data-free quantization methods (e.g. PTQ with `FP8_DYNAMIC`)
-- Multi-GPU is fast enough to be used with calibration data-based methods with `sequential_update=False`
-- It is possible to use Multi-GPU with `sequential_update=True` to save GPU memory, but the runtime will be slower
+When working with `accelerate`, it is important to keep in mind that CPU offloading and naive pipeline-parallelism will slow down forward passes through the model. As a result, we need to take care to ensure that the quantization methods used fit well with the offloading scheme as methods that require many forward passes though the model will be slowed down. If more gpu memory is not available, consider reducing the precision of the loaded model to a lower-width dtype such as `torch.bfloat16`.
 
 ## Examples

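For reference, the workflow the updated README describes looks roughly like the following sketch, assembled from the example scripts touched in this commit. The model ID, GPU count, and import paths follow those examples and are illustrative rather than prescriptive:

```python
import torch

from llmcompressor.transformers import SparseAutoModelForCausalLM
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# placeholder checkpoint; substitute the model you are quantizing
MODEL_ID = "mistralai/Mistral-Nemo-Instruct-2407"

# reserve extra GPU memory for the GPTQ/SparseGPT hessians when building the
# device map, and load the weights in bfloat16 to shrink their footprint
device_map = calculate_offload_device_map(
    MODEL_ID, num_gpus=2, reserve_for_hessians=True, torch_dtype=torch.bfloat16
)

model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16
)
```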
examples/big_models_with_accelerate/multi_gpu_int8_sequential_update.py renamed to examples/big_models_with_accelerate/mult_gpus_int8_device_map.py

Lines changed: 6 additions & 2 deletions
@@ -10,8 +10,10 @@
 MODEL_ID = "mistralai/Mistral-Nemo-Instruct-2407"
 
 # adjust based off number of desired GPUs
+# reserve_for_hessians=True reserves memory which is required by
+# GPTQModifier and SparseGPTModifier
 device_map = calculate_offload_device_map(
-    MODEL_ID, reserve_for_hessians=True, num_gpus=2, torch_dtype=torch.bfloat16
+    MODEL_ID, num_gpus=2, reserve_for_hessians=True, torch_dtype=torch.bfloat16
 )
 
 model = SparseAutoModelForCausalLM.from_pretrained(
@@ -60,7 +62,9 @@ def tokenize(sample):
 recipe = [
     SmoothQuantModifier(smoothing_strength=0.8),
     GPTQModifier(
-        targets="Linear", scheme="W8A8", ignore=["lm_head"], sequential_update=True
+        targets="Linear",
+        scheme="W8A8",
+        ignore=["lm_head"],
     ),
 ]

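With `sequential_update` gone from the recipe, the quantization step of this example reduces to the shape below. This is a sketch that assumes `model` and a tokenized calibration dataset `ds` were prepared earlier in the script, as in the rest of the example; the calibration parameters are illustrative:

```python
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot

# sequential_update no longer needs to be passed; GPTQ always updates layer by layer
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

# `model` and `ds` (tokenized calibration data) are assumed to exist already
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)
```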
examples/big_models_with_accelerate/multi_gpu_int8.py

Lines changed: 3 additions & 4 deletions
@@ -58,14 +58,13 @@ def tokenize(sample):
 # 3) Configure algorithms. In this case, we:
 # * quantize the weights to int8 with GPTQ (static per channel)
 # * quantize the activations to int8 (dynamic per token)
-# * run non-sequentially (for seq update, see multi_gpu_int8_sequential_update.py)
 recipe = [
-    GPTQModifier(
-        targets="Linear", scheme="W8A8", ignore=["lm_head"], sequential_update=False
-    ),
+    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
 ]
 
 # 4) Apply algorithms and save in `compressed-tensors` format.
+# if you encounter GPU out-of-memory issues, consider using an explicit
+# device map (see multi_gpus_int8_device_map.py)
 oneshot(
     model=model,
     tokenizer=tokenizer,

examples/quantization_24_sparse_w4a16/2:4_w4a16_group-128_recipe.yaml

Lines changed: 0 additions & 1 deletion
@@ -23,7 +23,6 @@ quantization_stage:
   run_type: oneshot
   quantization_modifiers:
     GPTQModifier:
-      sequential_update: true
       ignore: ["lm_head"]
       config_groups:
         group_0:

examples/quantization_24_sparse_w4a16/2:4_w4a16_recipe.yaml

Lines changed: 0 additions & 1 deletion
@@ -23,7 +23,6 @@ quantization_stage:
   run_type: oneshot
   quantization_modifiers:
     GPTQModifier:
-      sequential_update: true
       ignore: ["lm_head"]
       config_groups:
         group_0:

examples/quantization_w8a8_int8/gemma2_example.py

Lines changed: 0 additions & 1 deletion
@@ -55,7 +55,6 @@ def tokenize(sample):
 # 3) Select quantization algorithms. In this case, we:
 # * quantize the weights to int8 with GPTQ (static per channel)
 # * quantize the activations to int8 (dynamic per token)
-# Note: set sequential_update: true in the recipe to reduce memory
 recipe = GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"])
 
 # 4) Apply quantization and save to disk compressed.

examples/quantization_w8a8_int8/llama3_example.py

Lines changed: 0 additions & 1 deletion
@@ -57,7 +57,6 @@ def tokenize(sample):
 # * apply SmoothQuant to make the activations easier to quantize
 # * quantize the weights to int8 with GPTQ (static per channel)
 # * quantize the activations to int8 (dynamic per token)
-# Note: set sequential_update: true in the recipe to reduce memory
 recipe = [
     SmoothQuantModifier(smoothing_strength=0.8),
     GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),

examples/quantizing_moe/deepseek_moe_w8a8_int8.py

Lines changed: 0 additions & 1 deletion
@@ -70,7 +70,6 @@ def tokenize(sample):
         targets="Linear",
         scheme="W8A8",
         ignore=["lm_head", "re:.*mlp.gate$"],
-        sequential_update=True,
     ),
 ]

examples/quantizing_moe/deepseek_recipe_w4a16.yaml

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 quant_stage:
   quant_modifiers:
     GPTQModifier:
-      sequential_update: true
       ignore: [lm_head, "re:.*mlp.gate$"]
       config_groups:
         group_0:

src/llmcompressor/modifiers/quantization/gptq/base.py

Lines changed: 26 additions & 44 deletions
@@ -1,4 +1,4 @@
-import gc
+import warnings
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import torch
@@ -49,7 +49,6 @@ class GPTQModifier(Modifier):
     |    test_stage:
     |       obcq_modifiers:
     |          GPTQModifier:
-    |             sequential_update: true
     |             dampening_frac: 0.001
     |             block_size: 128
     |             config_groups:
@@ -67,8 +66,8 @@
     |                      actorder: False
 
 
-    :param sequential_update: Whether or not to update weights sequentially by layer,
-        True saves on GPU memory, default is True
+    :param sequential_update: Whether or not to update weights sequentially by layer.
+        This option is depreciated and setting to False is no longer supported
     :param targets: list of layer names to compress during GPTQ, or '__ALL__'
         to compress every layer in the model
     :param block_size: Used to determine number of columns to compress in one pass
@@ -98,7 +97,7 @@
     and activation 8 bit quantization on the Linear layers.
     """
 
-    sequential_update: bool = True
+    sequential_update: bool = True  # DEPRECIATED
     targets: Union[str, List[str], None] = None
     sequential_targets: Union[str, List[str], None] = None
     block_size: int = 128
@@ -118,13 +117,13 @@
     @field_validator("sequential_update", mode="before")
     def validate_sequential_update(cls, value: bool) -> bool:
         if not value:
-            logger.warning(
-                "Not using sequential_update requires allocating all hessians in "
-                "GPU memory. If you are running into GPU memory issues, consider "
-                "using sequential_update=True"
+            warnings.warn(
+                "`sequential_update=False` is no longer supported, setting "
+                "sequential_update=True",
+                DeprecationWarning,
             )
 
-        return value
+        return True
 
     def on_initialize_structure(self, state: State, **kwargs):
         """
@@ -246,7 +245,7 @@ def initialize_compression(
         compressible layers of model, and sets the device
 
         :param model: model to initialize for compression
-        :param dataloader: calibration data for GPTQ
+        :param dataloader: calibration data, not used by GPTQ in this function
        """
        self.model = model
        self.compressible_layers_ = self.compressible_layers()
@@ -258,16 +257,12 @@
            args = self._pruning_arguments()
            comp_cls = self._compression_class()
            compressor = LayerCompressor(comp_cls, self.model, layer, idx, name, args)
-
-            # if running sequentially, allocate all hessians now
-            if not self.sequential_update:
-                compressor.pre_compress()
-
            self.layer_compressors_.append(compressor)
 
-        if self.sequential_update:
-            first_layer_compressor = self.layer_compressors_[0]
-            first_layer_compressor.set_early_stop()
+        # for the initial forward data pass, add an early stop exception in order
+        # to capture inputs right before being compressed by first module
+        first_layer_compressor = self.layer_compressors_[0]
+        first_layer_compressor.set_early_stop()
 
     @torch.no_grad()
     def apply_compression(
@@ -288,45 +283,32 @@ def apply_compression(
         self.model.apply(disable_quantization)
 
         with DisableKVCache(self.model):
-            # in non-sequential mode we run calibration through the full model
-            # in sequential mode we run calibration up to the first transformer target
+            # run_calibration_forward uses the early stop exception to capture values
+            # as intermediates right before the forward pass of the first module
             intermediates = run_calibration_forward(
                 self.model, dataloader, mask_padding=True
            )
            self.layer_compressors_[0].clear_early_stop()
 
-            # empty cache if not using sequential update
-            if not self.sequential_update:
-                del intermediates
-                gc.collect()
-                torch.cuda.empty_cache()
-
            num_layers = len(self.compressible_layers_)
            for idx, layer_compressor in enumerate(self.layer_compressors_):
                logger.info(f"\n===== Compressing layer {idx+1}/{num_layers} " " =====")
 
-                if self.sequential_update:
-                    # in sequential mode we run the forward pass for each layer
-                    # one at a time, caching the intermediate outputs between layers
-                    logger.info(f"Calibrating {layer_compressor.name}...")
-                    layer_compressor.pre_compress()
-                    unquantized_outputs = layer_compressor.calibrate_layer(
-                        intermediates
-                    )
+                # run the forward pass for each transformer layer (block) one at a time
+                logger.info(f"Calibrating {layer_compressor.name}...")
+                layer_compressor.pre_compress()
+                unquantized_outputs = layer_compressor.calibrate_layer(intermediates)
 
                layer_compressor.compress()
                layer_compressor.post_compress()
                layer_compressor.revert_layer_wrappers()
 
-                if self.sequential_update:
-                    quantized_outputs = layer_compressor.calibrate_layer(intermediates)
-                    error = get_output_error(unquantized_outputs, quantized_outputs)
-                    logger.info(f"Mean output error from quantization: {error:.3f}")
-                    intermediates = quantized_outputs
-                    del unquantized_outputs
-
-                gc.collect()
-                torch.cuda.empty_cache()
+                # perform a second forward pass of the module to calculate
+                # weight-quantized outputs for use as inputs to the next layer
+                quantized_outputs = layer_compressor.calibrate_layer(intermediates)
+                error = get_output_error(unquantized_outputs, quantized_outputs)
+                logger.info(f"Mean output error from quantization: {error:.3f}")
+                intermediates = quantized_outputs
 
        # re-enable quantization
        self.model.apply(enable_quantization)

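The effect of the new validator can be illustrated with a small sketch (hypothetical usage, not part of the commit): passing `sequential_update=False` now emits a `DeprecationWarning` and the field is coerced back to `True`.

```python
import warnings

from llmcompressor.modifiers.quantization import GPTQModifier

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # sequential_update=False is accepted for backwards compatibility only
    modifier = GPTQModifier(targets="Linear", scheme="W8A8", sequential_update=False)

# the `before` validator ignores the supplied value and returns True
assert modifier.sequential_update is True
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```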