
Commit db6164a

Add fallback support for HQQ (#1848)
Signed-off-by: yiliu30 <[email protected]>
1 parent 12b8f41 commit db6164a

File tree: 3 files changed, +24 −0 lines changed


neural_compressor/torch/algorithms/weight_only/hqq/quantizer.py

Lines changed: 3 additions & 0 deletions
@@ -149,5 +149,8 @@ def _parse_hqq_configs_mapping(self, configs_mapping):
             if quant_config.skip_lm_head and "lm_head" in op_name:
                 logger.warning("Skip quantizing %s due to `skip_lm_head` is True.", op_name)
                 continue
+            if quant_config is not None and quant_config.dtype == "fp32":
+                logger.warning("Fallback %s.", op_name)
+                continue
             qconfig_mapping[op_name] = self._convert_hqq_module_config(quant_config)
         return qconfig_mapping
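In effect, any op whose resolved config carries dtype == "fp32" is dropped from the HQQ qconfig mapping, so its module keeps the original float weights. A minimal standalone sketch of that filtering behavior (FakeConfig and build_qconfig_mapping are illustrative stand-ins, not the library's actual classes):

    from dataclasses import dataclass

    @dataclass
    class FakeConfig:
        """Illustrative stand-in for HQQConfig (only the fields used here)."""
        dtype: str = "int"
        bits: int = 4

    def build_qconfig_mapping(configs_mapping):
        """Keep only ops to be HQQ-quantized; dtype='fp32' marks a fallback op."""
        qconfig_mapping = {}
        for op_name, cfg in configs_mapping.items():
            if cfg is not None and cfg.dtype == "fp32":
                print(f"Fallback {op_name}.")  # the real code logs a warning instead
                continue
            qconfig_mapping[op_name] = cfg
        return qconfig_mapping

    # "fc1" is excluded from quantization, "fc2" is kept.
    mapping = {"fc1": FakeConfig(dtype="fp32"), "fc2": FakeConfig()}
    assert list(build_qconfig_mapping(mapping)) == ["fc2"]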

neural_compressor/torch/quantization/config.py

Lines changed: 2 additions & 0 deletions
@@ -1179,6 +1179,7 @@ class HQQConfig(BaseConfig):
 
     def __init__(
         self,
+        dtype: str = "int",
         bits: int = 4,
         group_size: int = 64,
         quant_zero: bool = True,
@@ -1188,6 +1189,7 @@ def __init__(
         white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST,
     ):
         super().__init__(white_list=white_list)
+        self.dtype = dtype
         self.bits = bits
         self.group_size = group_size
         self.quant_zero = quant_zero
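A minimal sketch of the new constructor argument, assuming only that HQQConfig is importable from neural_compressor.torch.quantization as in the test below:

    from neural_compressor.torch.quantization import HQQConfig

    default_cfg = HQQConfig()               # dtype defaults to "int", i.e. quantize with HQQ
    fallback_cfg = HQQConfig(dtype="fp32")  # dtype="fp32" requests falling back to FP32
    print(default_cfg.dtype, fallback_cfg.dtype)  # -> int fp32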

test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py

Lines changed: 19 additions & 0 deletions
@@ -87,6 +87,25 @@ def test_hqq_quant(self, force_use_cpu, force_not_half):
             q_label_1.eq(q_label_2)
         ), "The results of calling `convert` + `prepare` and calling `quantize` should be equal."
 
+    def test_hqq_fallback(self, force_use_cpu, force_not_half):
+        from neural_compressor.torch.quantization import HQQConfig, convert, prepare
+
+        class ToyModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fc1 = torch.nn.Linear(128, 1024)
+                self.fc2 = torch.nn.Linear(1024, 512)
+
+            def forward(self, x):
+                x = self.fc1(x)
+                x = self.fc2(x)
+                return x
+
+        quant_config = HQQConfig().set_local("fc1", HQQConfig(dtype="fp32"))
+        qmodel = convert(prepare(model=ToyModel(), quant_config=quant_config))
+        assert type(qmodel.fc1).__name__ == torch.nn.Linear.__name__, f"Expect fallback fc1, but get {type(qmodel.fc1)}"
+        assert type(qmodel.fc2).__name__ != torch.nn.Linear.__name__, f"Expect quantize fc2, but get {type(qmodel.fc2)}"
+
     @pytest.mark.parametrize(
         "nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
         [
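The test's assertions rely on a fallen-back layer keeping its original torch.nn.Linear class, while a quantized layer is replaced by HQQ's module. A small helper in the same spirit (the name list_fallback_linears is illustrative, not part of the library) could inspect any converted model:

    import torch

    def list_fallback_linears(model: torch.nn.Module):
        """Return names of Linear layers left unquantized (i.e., fallen back)."""
        return [
            name
            for name, module in model.named_modules()
            if type(module) is torch.nn.Linear
        ]

    # e.g. for the ToyModel above: list_fallback_linears(qmodel) -> ["fc1"]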
