Commit 02958dd

[1/N] Integrate PT2E static PTQ (#1739)
* add x86InductorQuantizer (static)

Signed-off-by: yiliu30 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 7be355d commit 02958dd

File tree: 4 files changed (+224 −0 lines changed)
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Note - The `W8A8StaticQuantizer` is aligned with the pytorch-labs/ao unified quantization API.
# https://github.com/pytorch-labs/ao/blob/5401df093564825c06691f4c2c10cdcf1a32a40c/torchao/quantization/unified.py#L15-L26
# Some code snippets are taken from the X86InductorQuantizer tutorial.
# https://pytorch.org/tutorials/prototype/pt2e_quant_x86_inductor.html


from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
from torch._export import capture_pre_autograd_graph
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
from torch.fx.graph_module import GraphModule

from neural_compressor.common.utils import logger
from neural_compressor.torch.utils import TORCH_VERSION_2_2_2, get_torch_version


class W8A8StaticQuantizer:

    @staticmethod
    def update_quantizer_based_on_quant_config(quantizer: X86InductorQuantizer, quant_config) -> X86InductorQuantizer:
        # TODO: add the logic to update the quantizer based on the quant_config
        quantizer.set_global(xiq.get_default_x86_inductor_quantization_config())
        return quantizer

    @staticmethod
    def export_model(
        model,
        example_inputs: Tuple[Any],
        dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
    ) -> Optional[GraphModule]:
        exported_model = None
        try:
            with torch.no_grad():
                # Note 1: `capture_pre_autograd_graph` is a short-term API; it will be
                # updated to use the official `torch.export` API when that is ready.
                cur_version = get_torch_version()
                if cur_version <= TORCH_VERSION_2_2_2:  # pragma: no cover
                    logger.warning(
                        (
                            "`dynamic_shapes` is not supported in the current version (%s) of PyTorch. "
                            "If you want to use `dynamic_shapes` to export the model, "
                            "please upgrade to 2.3.0 or later."
                        ),
                        cur_version,
                    )
                    exported_model = capture_pre_autograd_graph(model, args=example_inputs)
                else:  # pragma: no cover
                    exported_model = capture_pre_autograd_graph(  # pylint: disable=E1123
                        model, args=example_inputs, dynamic_shapes=dynamic_shapes
                    )
        except Exception as e:
            logger.error(f"Failed to export the model: {e}")
        return exported_model

    def prepare(
        self, model: torch.nn.Module, quant_config, example_inputs: Tuple[Any], *args: Any, **kwargs: Any
    ) -> GraphModule:
        """Prepare the model for calibration.

        There are two steps in this process:
            1) export the eager model into a model with ATen IR.
            2) create the `quantizer` according to the `quant_config`, and insert the observers accordingly.
        """
        assert isinstance(example_inputs, tuple), f"Expected `example_inputs` to be a tuple, got {type(example_inputs)}"
        # Set the model to eval mode
        model = model.eval()

        # 1) Capture the FX Graph to be quantized
        dynamic_shapes = kwargs.get("dynamic_shapes", None)
        exported_model = self.export_model(model, example_inputs, dynamic_shapes=dynamic_shapes)
        logger.info("Exported the model to ATen IR successfully.")
        if exported_model is None:
            return

        # 2) Create the `quantizer` according to the `quant_config`, and insert the observers accordingly.
        quantizer = X86InductorQuantizer()
        quantizer = self.update_quantizer_based_on_quant_config(quantizer, quant_config)
        prepared_model = prepare_pt2e(exported_model, quantizer)
        return prepared_model

    def convert(self, model: GraphModule, *args: Any, **kwargs: Any) -> GraphModule:
        """Convert the calibrated model into QDQ mode."""
        fold_quantize = kwargs.get("fold_quantize", False)
        converted_model = convert_pt2e(model, fold_quantize=fold_quantize)
        logger.warning("Converted the model to QDQ mode; please compile it to accelerate inference.")
        return converted_model
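
For orientation, here is a minimal end-to-end usage sketch of the new quantizer. It is not part of this commit: the toy model, batch size, and calibration loop are illustrative placeholders, and the import path follows the new test file added below.

import torch
from torch._inductor import config as inductor_config

from neural_compressor.torch.algorithms.pt2e_quant.core import W8A8StaticQuantizer

# Illustrative toy model and example inputs (placeholders, not from the commit).
model = torch.nn.Sequential(torch.nn.Linear(10, 20), torch.nn.ReLU(), torch.nn.Linear(20, 10))
example_inputs = (torch.randn(4, 10),)

quantizer = W8A8StaticQuantizer()
# 1) Export the eager model to ATen IR and insert observers.
prepared_model = quantizer.prepare(model, quant_config=None, example_inputs=example_inputs)
# 2) Calibrate by running representative inputs through the prepared model.
for _ in range(4):
    prepared_model(*example_inputs)
# 3) Convert the calibrated model into QDQ form, then compile it with Inductor.
converted_model = quantizer.convert(prepared_model)
inductor_config.freezing = True
optimized_model = torch.compile(converted_model)
out = optimized_model(*example_inputs)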

neural_compressor/torch/utils/environ.py

Lines changed: 3 additions & 0 deletions
@@ -53,6 +53,9 @@ def get_ipex_version():
         return None


+TORCH_VERSION_2_2_2 = Version("2.2.2")
+
+
 def get_torch_version():
     try:
         torch_version = torch.__version__.split("+")[0]
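
For context, a short sketch (not from the commit) of how the new constant is consumed elsewhere in this change: both the quantizer and the tests compare the running PyTorch version against it to decide whether `dynamic_shapes` can be forwarded to the export call. It assumes, as the comparison in core.py implies, that `get_torch_version()` returns a `packaging.version.Version`.

from neural_compressor.torch.utils import TORCH_VERSION_2_2_2, get_torch_version

# get_torch_version() is compared directly against the Version constant.
if get_torch_version() <= TORCH_VERSION_2_2_2:
    # torch <= 2.2.2: export without `dynamic_shapes`.
    supports_dynamic_shapes = False
else:
    # torch >= 2.3.0: `dynamic_shapes` can be passed to capture_pre_autograd_graph.
    supports_dynamic_shapes = True
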
Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
import unittest
from unittest.mock import patch

import pytest
import torch

from neural_compressor.common.utils import logger
from neural_compressor.torch.algorithms.pt2e_quant.core import W8A8StaticQuantizer
from neural_compressor.torch.utils import TORCH_VERSION_2_2_2, get_torch_version


class TestW8A8StaticQuantizer:

    @staticmethod
    def get_toy_model():
        class Bar(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
                x = a / (torch.abs(a) + 1)
                if b.sum() < 0:
                    b = b * -1
                return x * b

        inp1 = torch.randn(10)
        inp2 = torch.randn(10)
        example_inputs = (inp1, inp2)
        bar = Bar()
        return bar, example_inputs

    @staticmethod
    def build_simple_torch_model_and_example_inputs():
        class SimpleModel(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.fc1 = torch.nn.Linear(10, 20)
                self.fc2 = torch.nn.Linear(20, 10)

            def forward(self, x: torch.Tensor) -> torch.Tensor:
                x = self.fc1(x)
                x = torch.nn.functional.relu(x)
                x = self.fc2(x)
                return x

        model = SimpleModel()
        example_inputs = (torch.randn(10, 10),)
        return model, example_inputs

    @pytest.mark.skipif(get_torch_version() <= TORCH_VERSION_2_2_2, reason="Requires torch>=2.3.0")
    def test_quantizer_on_simple_model(self):
        model, example_inputs = self.build_simple_torch_model_and_example_inputs()
        quant_config = None
        w8a8_static_quantizer = W8A8StaticQuantizer()
        # prepare
        prepare_model = w8a8_static_quantizer.prepare(model, quant_config, example_inputs=example_inputs)
        # calibrate
        for i in range(2):
            prepare_model(*example_inputs)
        # convert
        converted_model = w8a8_static_quantizer.convert(prepare_model)
        # inference
        from torch._inductor import config

        config.freezing = True
        opt_model = torch.compile(converted_model)
        out = opt_model(*example_inputs)
        logger.warning("out shape is %s", out.shape)
        assert out is not None

    @pytest.mark.skipif(get_torch_version() <= TORCH_VERSION_2_2_2, reason="Requires torch>=2.3.0")
    def test_quantizer_on_llm(self):
        from transformers import AutoModelForCausalLM, AutoTokenizer

        model_name = "facebook/opt-125m"
        model = AutoModelForCausalLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
        example_inputs = (input_ids,)
        quant_config = None
        w8a8_static_quantizer = W8A8StaticQuantizer()
        # prepare
        prepare_model = w8a8_static_quantizer.prepare(model, quant_config, example_inputs=example_inputs)
        # calibrate
        for i in range(2):
            prepare_model(*example_inputs)
        # convert
        converted_model = w8a8_static_quantizer.convert(prepare_model)
        # inference
        from torch._inductor import config

        config.freezing = True
        opt_model = torch.compile(converted_model)
        out = opt_model(*example_inputs)
        assert out.logits is not None

    @patch("neural_compressor.torch.algorithms.pt2e_quant.core.logger.error")
    def test_export_model_failed(self, mock_error):
        model, example_inputs = self.get_toy_model()
        w8a8_static_quantizer = W8A8StaticQuantizer()
        # export model
        exported_model = w8a8_static_quantizer.export_model(model, example_inputs=example_inputs)
        assert exported_model is None
        call_args_list = mock_error.call_args_list
        assert any(["Failed to export the model" in msg for msg in [info[0][0] for info in call_args_list]])
