from torch.nn import Module


-__all__ = ["wrap_module_forward_quantized", "maybe_calibrate_or_quantize"]
+__all__ = [
+    "quantize",
+    "dequantize",
+    "fake_quantize",
+    "wrap_module_forward_quantized",
+    "maybe_calibrate_or_quantize",
+]


@torch.no_grad()
@@ -37,29 +43,66 @@ def quantize(
    args: QuantizationArgs,
    dtype: Optional[torch.dtype] = None,
) -> torch.Tensor:
-    bit_range = 2**args.num_bits
-    q_max = torch.tensor(bit_range / 2 - 1, device=x.device)
-    q_min = torch.tensor(-bit_range / 2, device=x.device)
-
-    quantized_value = torch.clamp(
-        torch.round(x / scale + zero_point),
-        q_min,
-        q_max,
+    """
+    Quantize the input tensor x using the QuantizationStrategy specified in args.
+    Quantization can be done per tensor, channel, token or group. For group
+    quantization, the group_size must be divisible by the column size. The input scale
+    and zero_points are reshaped to support vectorization (Assumes 1 is the
+    channel dimension)
+
+    :param x: Input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args dictating how to quantize x
+    :param dtype: optional dtype to cast the quantized output to
+    :return: fake quantized tensor
+    """
+    return _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        dtype=dtype,
+        do_quantize=True,
+        do_dequantize=False,
    )

-    if dtype is not None:
-        quantized_value = quantized_value.to(dtype)
-
-    return quantized_value
-


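For intuition, the round trip that quantize and dequantize implement is plain affine quantization; a minimal standalone sketch (the scale and zero_point values here are made up for illustration, not code from this PR):

import torch

x = torch.tensor([[0.12, -0.48, 1.03]])
scale = torch.tensor(0.05)      # hypothetical per-tensor scale
zero_point = torch.tensor(0.0)  # hypothetical zero point
q_min, q_max = -128, 127        # 8-bit signed range: 2**num_bits values split around zero

x_q = torch.clamp(torch.round(x / scale + zero_point), q_min, q_max)  # quantize
x_dq = (x_q - zero_point) * scale  # dequantize; recovers x up to scale/2 rounding error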
@torch.no_grad()
def dequantize(
    x_q: torch.Tensor,
    scale: torch.Tensor,
    zero_point: torch.Tensor,
+    args: QuantizationArgs = None,
) -> torch.Tensor:
-    return (x_q - zero_point) * scale
+    """
+    Dequantize a quantized input tensor x_q based on the strategy specified in args.
+    If args is not provided, the strategy will be inferred.
+
+    :param x_q: quantized input tensor
+    :param scale: scale tensor
+    :param zero_point: zero point tensor
+    :param args: quantization args used to quantize x_q
+    :return: dequantized float tensor
+    """
+    if args is None:
+        if scale.ndim == 0:
+            args = QuantizationArgs(strategy=QuantizationStrategy.TENSOR)
+        elif scale.ndim == 2:
+            args = QuantizationArgs(strategy=QuantizationStrategy.CHANNEL)
+        elif scale.ndim == 3:
+            group_size = int(x_q.shape[1] / scale.shape[1])
+            args = QuantizationArgs(
+                strategy=QuantizationStrategy.GROUP, group_size=group_size
+            )
+    return _process_quantization(
+        x=x_q,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        do_quantize=False,
+        do_dequantize=True,
+    )
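The inference above keys off the dimensionality of scale; a small sketch of the mapping, with assumed (illustrative) tensor shapes:

import torch

x_q = torch.zeros(4, 64)              # made-up quantized tensor: 4 rows, 64 columns
per_tensor_scale = torch.tensor(0.1)  # ndim == 0 -> TENSOR strategy
channel_scale = torch.ones(1, 64)     # ndim == 2 -> CHANNEL strategy
group_scale = torch.ones(4, 8, 1)     # ndim == 3 -> GROUP strategy (shape assumed)

group_size = int(x_q.shape[1] / group_scale.shape[1])  # 64 columns / 8 groups == 8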


@torch.no_grad()
@@ -70,26 +113,51 @@ def fake_quantize(
    args: QuantizationArgs,
) -> torch.Tensor:
    """
-    Fake quantize the input tensor x depending on the group_size.
-    if group_size is greater than 0, then q/dq by groups. The groups
-    must be divisible by the column size
-    if group_size is -1, then channel wise q/dq. The input scale and
-    zero_points are reshaped to support vectorization (Assumes 1 is
-    the channel dimension)
+    Fake quantize the input tensor x by quantizing then dequantizing with
+    the QuantizationStrategy specified in args. Quantization can be done per tensor,
+    channel, token or group. For group quantization, the group_size must be divisible
+    by the column size. The input scale and zero_points are reshaped to support
+    vectorization (Assumes 1 is the channel dimension)

    :param x: Input tensor
    :param scale: scale tensor
    :param zero_point: zero point tensor
-    :param args: quantization args that contain group_size info
+    :param args: quantization args dictating how to quantize x
    :return: fake quantized tensor
-
    """
+    return _process_quantization(
+        x=x,
+        scale=scale,
+        zero_point=zero_point,
+        args=args,
+        do_quantize=True,
+        do_dequantize=True,
+    )
+
+
+@torch.no_grad()
+def _process_quantization(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    args: QuantizationArgs,
+    dtype: Optional[torch.dtype] = None,
+    do_quantize: bool = True,
+    do_dequantize: bool = True,
+) -> torch.Tensor:
+    bit_range = 2**args.num_bits
+    q_max = torch.tensor(bit_range / 2 - 1, device=x.device)
+    q_min = torch.tensor(-bit_range / 2, device=x.device)
    group_size = args.group_size

    # group
    if args.strategy == QuantizationStrategy.GROUP:

-        DQ = torch.zeros_like(x)
+        if do_dequantize:  # if dequantizing, the output should be a float type
+            output = torch.zeros_like(x, dtype=scale.dtype)
+        else:
+            output_dtype = dtype if dtype is not None else x.dtype
+            output = torch.zeros_like(x, dtype=output_dtype)

        # TODO: vectorize the for loop
        # TODO: fix generic assumption about the tensor size for computing group
@@ -115,18 +183,24 @@ def fake_quantize(
            zp = zero_point[:, i].view(-1, 1)

            idx = i * group_size
-            Q = quantize(x[:, idx : (idx + group_size)], sc, zp, args)
-            DQ[:, idx : (idx + group_size)] = dequantize(Q, sc, zp)
+            if do_quantize:
+                output[:, idx : (idx + group_size)] = _quantize(
+                    x[:, idx : (idx + group_size)], sc, zp, q_min, q_max, dtype=dtype
+                )
+            if do_dequantize:
+                input = (
+                    output[:, idx : (idx + group_size)]
+                    if do_quantize
+                    else x[:, idx : (idx + group_size)]
+                )
+                output[:, idx : (idx + group_size)] = _dequantize(input, sc, zp)

    # channel-wise
    elif args.strategy == QuantizationStrategy.CHANNEL:  # group_size == -1
-        # before: scale shape = [channel_size]
-        # after: scale shape = [1, channel_size]
-        scale = scale.unsqueeze(0)
-        zero_point = zero_point.unsqueeze(0)
-
-        Q = quantize(x, scale, zero_point, args)
-        DQ = dequantize(Q, scale, zero_point)
+        if do_quantize:
+            output = _quantize(x, scale, zero_point, q_min, q_max, dtype=dtype)
+        if do_dequantize:
+            output = _dequantize(output if do_quantize else x, scale, zero_point)

    # per-token
    elif args.strategy == QuantizationStrategy.TOKEN:
@@ -138,14 +212,18 @@ def fake_quantize(
        scale = scale.unsqueeze(1)
        zero_point = zero_point.unsqueeze(1)

-        Q = quantize(x, scale, zero_point, args)
-        DQ = dequantize(Q, scale, zero_point)
+        if do_quantize:
+            output = _quantize(x, scale, zero_point, q_min, q_max, dtype=dtype)
+        if do_dequantize:
+            output = _dequantize(output if do_quantize else x, scale, zero_point)

    else:
-        Q = quantize(x, scale, zero_point, args)
-        DQ = dequantize(Q, scale, zero_point)
+        if do_quantize:
+            output = _quantize(x, scale, zero_point, q_min, q_max, dtype=dtype)
+        if do_dequantize:
+            output = _dequantize(output if do_quantize else x, scale, zero_point)

-    return DQ
+    return output


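The group branch above visits the columns in group_size-wide slices; a toy version of just that indexing, with assumed shapes:

import torch

x = torch.randn(2, 8)
group_size = 4  # must evenly divide the number of columns

for i in range(x.shape[1] // group_size):
    idx = i * group_size
    group = x[:, idx : (idx + group_size)]  # columns [idx, idx + group_size), shape [2, 4]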
def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
@@ -223,3 +301,33 @@ def maybe_calibrate_or_quantize(
    scale.data = updated_scale.to(device)
    zero_point.data = updated_zero_point.to(device)
    return fake_quantize(value, scale, zero_point, args)
+
+
+@torch.no_grad()
+def _quantize(
+    x: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+    q_min: torch.Tensor,
+    q_max: torch.Tensor,
+    dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+    quantized_value = torch.clamp(
+        torch.round(x / scale + zero_point),
+        q_min,
+        q_max,
+    )
+
+    if dtype is not None:
+        quantized_value = quantized_value.to(dtype)
+
+    return quantized_value
+
+
+@torch.no_grad()
+def _dequantize(
+    x_q: torch.Tensor,
+    scale: torch.Tensor,
+    zero_point: torch.Tensor,
+) -> torch.Tensor:
+    return (x_q - zero_point) * scale
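As a sanity check on the refactor, the new private helpers compose back into fake quantization exactly as fake_quantize wires them up; a hedged, self-contained sketch (symmetric 8-bit, made-up values):

import torch

def _quantize(x, scale, zero_point, q_min, q_max):
    return torch.clamp(torch.round(x / scale + zero_point), q_min, q_max)

def _dequantize(x_q, scale, zero_point):
    return (x_q - zero_point) * scale

x = torch.randn(3, 3)
scale, zero_point = torch.tensor(0.1), torch.tensor(0.0)
q_min, q_max = torch.tensor(-128.0), torch.tensor(127.0)

# fake quantize == quantize then dequantize
x_fq = _dequantize(_quantize(x, scale, zero_point, q_min, q_max), scale, zero_point)
assert torch.allclose(x_fq, x, atol=0.051)  # error bounded by scale / 2 for in-range values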