
Commit 91b15d2

Add warning for non-divisible group quantization (#1401)
## Purpose ##

* Test discrepancies between initialized parameters and values calculated by observers
* Reveal a potential issue with how qparams are initialized (neuralmagic/compressed-tensors#308)
* Add a warning for when users attempt to quantize groups that aren't perfectly divisible

## Prerequisites ##

* #1431

## Changes ##

* Added `test_observers_update` in `tests/llmcompressor/modifiers/calibration/test_observers.py`
* Added a warning for attempts to quantize indivisible groups:

```
Attempting to quantize a module weight whose columns (3420) are not divisible by group_size (128). This scheme is not supported by vLLM, please consider adjusting the group_size for modules with this number of columns
```

## Testing ##

* The test fails without the compressed-tensors (CT) changes, but succeeds with them

---------

Signed-off-by: Kyle Sayers <[email protected]>
1 parent 7c7af39 commit 91b15d2
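
For a concrete sense of when the new warning fires, here is a small arithmetic sketch using the 3420-column / group_size=128 example from the warning text (illustrative only, not code from this commit):

```python
from math import ceil

# Example values from the warning text above
columns, group_size = 3420, 128

num_groups = ceil(columns / group_size)     # ceil(26.71...) == 27 groups
print(num_groups * group_size)              # 3456, not 3420
print(num_groups * group_size != columns)   # True -> the warning is logged
```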

2 files changed: +70 -0 lines changed

src/llmcompressor/observers/base.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -104,6 +104,15 @@ def get_qparams(
             rows = observed.shape[0]
             columns = observed.shape[1]
             num_groups = int(ceil(columns / group_size))
+            if num_groups * group_size != columns:
+                logger.bind(log_once=True).warning(
+                    "Attempting to quantize a module weight whose columns "
+                    f"({columns}) are not divisible by group_size ({group_size}). "
+                    "This scheme is not supported by vLLM, please consider "
+                    "adjusting the group_size for modules with this number of "
+                    "columns",
+                )
+
             self._scale = torch.empty(
                 (rows, num_groups), dtype=observed.dtype, device=observed.device
             )
```
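
Because `num_groups` is rounded up with `ceil`, the scale tensor still gets one column per group even when the division is inexact; the final group simply covers fewer than `group_size` weight columns. A minimal sketch of the resulting shapes (values are illustrative, not taken from the diff):

```python
import torch
from math import ceil

rows, columns, group_size = 1792, 3420, 128  # illustrative non-divisible case
num_groups = ceil(columns / group_size)      # 27, rounded up

scale = torch.empty((rows, num_groups))      # mirrors self._scale: (1792, 27)
last_group_cols = columns - (num_groups - 1) * group_size
print(scale.shape, last_group_cols)          # torch.Size([1792, 27]) 92
```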
tests/llmcompressor/modifiers/calibration/test_observers.py

Lines changed: 61 additions & 0 deletions
```diff
@@ -0,0 +1,61 @@
+import pytest
+import torch
+from compressed_tensors.quantization import (
+    QuantizationArgs,
+    QuantizationScheme,
+    initialize_module_for_quantization,
+)
+
+from llmcompressor.modifiers.quantization.calibration import initialize_observer
+
+
+@pytest.mark.parametrize(
+    "shape,group_size,actorder",
+    [
+        ((1, 1), None, False),
+        ((1, 1), 128, False),
+        ((1, 1), 128, True),
+        ((64, 64), None, False),
+        ((64, 64), 128, False),
+        ((64, 64), 128, True),
+        ((1792, 4096), None, False),
+        ((1792, 4096), 128, False),
+        ((1792, 4096), 128, True),
+        ((3420, 64), None, False),
+        ((3420, 64), 128, False),
+        ((3420, 64), 128, True),
+    ],
+)
+def test_observers_update(shape, group_size, actorder):
+    module = torch.nn.Linear(*shape)
+    scheme = QuantizationScheme(
+        targets=["Linear"],
+        weights=QuantizationArgs(group_size=group_size, actorder=actorder),
+        input_activations=QuantizationArgs(),
+        output_activations=QuantizationArgs(),
+    )
+
+    input = torch.empty(module.in_features, dtype=module.weight.dtype)
+    output = torch.empty(module.out_features, dtype=module.weight.dtype)
+
+    initialize_module_for_quantization(module, scheme)
+    initialize_observer(module, "weight")
+    initialize_observer(module, "input")
+    initialize_observer(module, "output")
+
+    for location, value in (
+        ("weight", module.weight),
+        ("input", input),
+        ("output", output),
+    ):
+        observer = getattr(module, f"{location}_observer")
+        g_idx = getattr(module, "g_idx", None)
+        updated_scale, updated_zero_point = observer(value, g_idx=g_idx)
+
+        assert_alike(updated_scale, getattr(module, f"{location}_scale"))
+        assert_alike(updated_zero_point, getattr(module, f"{location}_zero_point"))
+
+
+def assert_alike(a, b):
+    assert a.dtype == b.dtype
+    assert a.shape == b.shape
```
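
Assuming a standard pytest setup, the new test can be run on its own with `pytest tests/llmcompressor/modifiers/calibration/test_observers.py -k test_observers_update`. For the grouped cases, `assert_alike` effectively checks that the observer output matches the `(rows, num_groups)` layout allocated in `base.py`; a hypothetical spot check for one parametrized case (names and values assumed for illustration):

```python
from math import ceil

# shape=(1792, 4096) means torch.nn.Linear(in_features=1792, out_features=4096),
# so the weight tensor is (4096, 1792): rows=4096, columns=1792
rows, columns, group_size = 4096, 1792, 128

num_groups = ceil(columns / group_size)    # 14; evenly divisible, so no warning
expected_scale_shape = (rows, num_groups)
print(expected_scale_shape)                # (4096, 14)
```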
