Commit 4b969af

[GPTQ] Add actorder option to modifier (#1424)
## Purpose ##

* Make actorder option more intuitive for users
* Enable easier adjustment of actorder default #1425
* This change is conceptually intuitive because activation ordering is a concept that only applies to the GPTQ algorithm (the only algorithm for which quantization group order matters)

## Changes ##

* Add `actorder` argument to `GPTQModifier`
* Override `resolve_quantization_config` method to resolve config groups with `actorder` argument
* (Misc) rearrange method order to match the typical order in which they are called in the modifier lifecycle

## Testing ##

* Ran llama w4a16 example to completion

Signed-off-by: Kyle Sayers <[email protected]>
1 parent 80a0449 commit 4b969af
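
For illustration only (not part of this commit), the new argument lets a recipe set activation ordering once on the modifier rather than inside each config group; the target, scheme, and ignore values below are assumptions:

```python
# Hypothetical usage sketch of the new top-level `actorder` option.
# All argument values here are illustrative, not taken from this commit.
from llmcompressor.modifiers.quantization import GPTQModifier

modifier = GPTQModifier(
    targets="Linear",    # assumed: quantize Linear layers
    scheme="W4A16",      # assumed: 4-bit weights, 16-bit activations
    ignore=["lm_head"],  # assumed: skip the output head
    actorder="static",   # new option; resolved onto each weight scheme
)
```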

File tree

1 file changed (+58, -27 lines)

  • src/llmcompressor/modifiers/quantization/gptq/base.py


src/llmcompressor/modifiers/quantization/gptq/base.py

Lines changed: 58 additions & 27 deletions
```diff
@@ -3,7 +3,12 @@
 from typing import Dict, List, Optional, Tuple, Union
 
 import torch
-from compressed_tensors.quantization import disable_quantization
+from compressed_tensors.quantization import (
+    QuantizationConfig,
+    QuantizationScheme,
+    disable_quantization,
+)
+from compressed_tensors.quantization.quant_args import ActivationOrdering
 from compressed_tensors.utils import (
     align_module_device,
     get_execution_device,
@@ -39,6 +44,7 @@ class GPTQModifier(Modifier, QuantizationMixin):
     |            block_size: 128
     |            dampening_frac: 0.001
     |            offload_hessians: False
+    |            actorder: static
     |            config_groups:
     |              group_0:
     |                targets:
@@ -51,7 +57,6 @@ class GPTQModifier(Modifier, QuantizationMixin):
     |                    symmetric: true
     |                    strategy: group
     |                    group_size: 128
-    |                    actorder: False
 
     Lifecycle:
         - on_initialize
@@ -70,6 +75,8 @@ class GPTQModifier(Modifier, QuantizationMixin):
     :param block_size: Used to determine number of columns to compress in one pass
     :param dampening_frac: Amount of dampening to apply to H, as a fraction of the
         diagonal norm
+    :param actorder: order in which weight columns are quantized. For more information,
+        on actorder options, see https://github.com/vllm-project/vllm/pull/8135
     :param offload_hessians: Set to True for decreased memory usage but increased
         runtime.
 
@@ -102,6 +109,7 @@ class GPTQModifier(Modifier, QuantizationMixin):
     sequential_targets: Union[str, List[str], None] = None
     block_size: int = 128
     dampening_frac: Optional[float] = 0.01
+    actorder: Optional[ActivationOrdering] = None
     offload_hessians: bool = False
 
     # private variables
@@ -120,6 +128,29 @@ def validate_sequential_update(cls, value: bool) -> bool:
 
         return True
 
+    def resolve_quantization_config(self) -> QuantizationConfig:
+        config = super().resolve_quantization_config()
+
+        # Resolve config with `self.actorder`
+        for scheme in config.config_groups.values():
+            assert isinstance(scheme, QuantizationScheme)  # (1)
+            if scheme.weights is not None:
+                existing = scheme.weights.actorder
+                assert isinstance(existing, (ActivationOrdering, type(None)))  # (2)
+                if existing is not None and existing != self.actorder:
+                    raise ValueError(
+                        "Cannot resolve activation ordering when both "
+                        "`GPTQModifier.actorder` and `QuantizationScheme.actorder` "
+                        "both are provided. Either set `GPTQModifier.actorder = None` "
+                        "or remove `actorder` from config groups"
+                    )
+                scheme.weights.actorder = self.actorder
+
+        # (1) QuantizationConfig.model_post_init
+        # (2) QuantizationScheme.validate_actorder
+
+        return config
+
     def on_initialize(self, state: State, **kwargs) -> bool:
         """
         Initialize and run the GPTQ algorithm on the current state
@@ -176,31 +207,6 @@ def on_event(self, state: State, event: Event, **kwargs):
         if not self.ended_:
             self.on_end(state, None)
 
-    def on_end(self, state: State, event: Event, **kwargs):
-        """
-        Finish calibrating by removing observers and calibration hooks
-        """
-        self.ended_ = True
-        QuantizationMixin.end_calibration(self, state.model)
-        self.remove_hooks()  # remove gptq hooks
-
-    def on_finalize(self, state: State, **kwargs) -> bool:
-        """
-        disable the quantization observers used by the OBCQ algorithm
-
-        :param state: session state storing input model and calibration data
-        """
-        if not self.ended_:
-            self.on_end(state, None)
-
-        if len(self._num_samples) > 0:
-            raise ValueError(f"Failed to compress {len(self._num_samples)} modules")
-
-        self._hessians = dict()
-        self._num_samples = dict()
-
-        return True
-
     def calibrate_module(
         self,
         module: torch.nn.Module,
@@ -268,6 +274,31 @@ def compress_modules(self):
         # self._hessians[module] already deleted by quantize_weight
         del self._num_samples[module]
 
+    def on_end(self, state: State, event: Event, **kwargs):
+        """
+        Finish calibrating by removing observers and calibration hooks
+        """
+        self.ended_ = True
+        QuantizationMixin.end_calibration(self, state.model)
+        self.remove_hooks()  # remove gptq hooks
+
+    def on_finalize(self, state: State, **kwargs) -> bool:
+        """
+        disable the quantization observers used by the OBCQ algorithm
+
+        :param state: session state storing input model and calibration data
+        """
+        if not self.ended_:
+            self.on_end(state, None)
+
+        if len(self._num_samples) > 0:
+            raise ValueError(f"Failed to compress {len(self._num_samples)} modules")
+
+        self._hessians = dict()
+        self._num_samples = dict()
+
+        return True
+
     @contextlib.contextmanager
     def _maybe_onload_hessian(self, module: torch.nn.Module):
         if self.offload_hessians:
```
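
For reference, a rough sketch of the resolution behavior added in `resolve_quantization_config` above: a modifier-level `actorder` is written into every weight scheme, and a conflicting per-group value raises `ValueError`. The scheme contents below are assumptions chosen only to illustrate the conflict:

```python
# Sketch of the conflict guarded against by resolve_quantization_config.
# Scheme/argument values are assumptions for illustration only.
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
from llmcompressor.modifiers.quantization import GPTQModifier

modifier = GPTQModifier(
    actorder="static",  # modifier-level activation ordering
    config_groups={
        "group_0": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(
                num_bits=4,
                strategy="group",
                group_size=128,
                actorder="group",  # conflicts with the modifier-level value
            ),
        )
    },
)
# modifier.resolve_quantization_config()  # would raise ValueError per the diff above
```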
