@@ -682,7 +682,7 @@ def autoround_quantize(
682682 enable_full_range : bool = False , ##for symmetric, TODO support later
683683 bs : int = 8 ,
684684 amp : bool = True ,
685- device = "cuda:0" ,
685+ device = None ,
686686 lr_scheduler = None ,
687687 dataloader = None , ## to support later
688688 dataset_name : str = "NeelNanda/pile-10k" ,
@@ -703,7 +703,6 @@ def autoround_quantize(
703703 dynamic_max_gap : int = -1 ,
704704 data_type : str = "int" , ##only support data_type
705705 scale_dtype = "fp16" ,
706- export_args : dict = {"format" : None , "inplace" : True },
707706 ** kwargs ,
708707):
709708 """Run autoround weight-only quantization.
@@ -726,8 +725,8 @@ def autoround_quantize(
726725 }
727726 enable_full_range (bool): Whether to enable full range quantization (default is False).
728727 bs (int): Batch size for training (default is 8).
729- amp (bool): Whether to use automatic mixed precision (default is True).
730- device: The device to be used for tuning (default is "cuda:0") .
728+ amp (bool): Whether to use automatic mixed precision (default is True). Automatically detect and set.
729+ device: The device to be used for tuning (default is None). Automatically detect and set.
731730 lr_scheduler: The learning rate scheduler to be used.
732731 dataloader: The dataloader for input data (to be supported in future).
733732 dataset_name (str): The default dataset name (default is "NeelNanda/pile-10k").
@@ -747,8 +746,6 @@ def autoround_quantize(
747746 not_use_best_mse (bool): Whether to use mean squared error (default is False).
748747 dynamic_max_gap (int): The dynamic maximum gap (default is -1).
749748 data_type (str): The data type to be used (default is "int").
750- export_args (dict): The arguments for exporting compressed model, default is {"format": None, "inplace": True}.
751- Supported format: "itrex", "auto_gptq".
752749 **kwargs: Additional keyword arguments.
753750
754751 Returns:
@@ -790,11 +787,4 @@ def autoround_quantize(
790787 ** kwargs ,
791788 )
792789 qdq_model , weight_config = rounder .quantize ()
793- if export_args ["format" ] is not None :
794- output_dir = export_args .get ("output_dir" , None )
795- format = export_args ["format" ]
796- inplace = export_args .get ("inplace" , True )
797- use_triton = export_args .get ("use_triton" , False )
798- model = rounder .save_quantized (output_dir = output_dir , format = format , inplace = inplace , use_triton = use_triton )
799- return model , weight_config
800790 return qdq_model , weight_config