
Commit fa961e1

Support mixed INT8 + FP16 in one model (#1798)
Signed-off-by: yiliu30 <[email protected]>
1 parent 4a24a6a

File tree: 6 files changed, +347 −12 lines changed
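In effect, ops whose local config requests fp16 are left out of INT8 quantization and handed to the new half-precision rewriter, so a single pt2e model can mix INT8 and FP16 kernels. A minimal sketch of that intent from the user side (`StaticQuantConfig` and `set_local` are assumed INC 3.x API names, inferred from the filter logic in this diff rather than shown in it):

# Hedged sketch: the config API names here are assumptions, not part of this commit.
from neural_compressor.torch.quantization import StaticQuantConfig

quant_config = StaticQuantConfig()  # global default: INT8 static quantization
# Keep `fc2` out of INT8; the half-precision rewriter converts it to FP16 instead.
quant_config.set_local("fc2", StaticQuantConfig(w_dtype="fp16", act_dtype="fp16"))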

neural_compressor/torch/algorithms/pt2e_quant/core.py

Lines changed: 8 additions & 0 deletions
@@ -25,6 +25,7 @@
 from neural_compressor.common.utils import logger
 from neural_compressor.torch.algorithms.base_algorithm import Quantizer
+from neural_compressor.torch.algorithms.pt2e_quant import half_precision_rewriter as hp_rewriter
 from neural_compressor.torch.utils import create_xiq_quantizer_from_pt2e_config
@@ -61,4 +62,11 @@ def convert(self, model: GraphModule, *args: Any, **kwargs: Any) -> GraphModule:
         fold_quantize = kwargs.get("fold_quantize", False)
         converted_model = convert_pt2e(model, fold_quantize=fold_quantize)
         logger.warning("Converted the model in qdq mode, please compile it to accelerate inference.")
+        if self.quant_config:
+            self.half_precision_transformation(converted_model, self.quant_config)
         return converted_model
+
+    def half_precision_transformation(self, model, config):
+        half_precision_node_set = hp_rewriter.get_half_precision_node_set(model, config)
+        logger.info("Try to convert %d nodes to half precision.", len(half_precision_node_set))
+        hp_rewriter.transformation(model, half_precision_node_set)
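The new hook rewrites each eligible node so that its inputs are cast to float16, the op runs in half precision, and the result is cast back to float32. A minimal sketch of that replacement pattern, traced the same way `pattern_factory` in `half_precision_rewriter.py` (next file) traces it; the function name and the zero-sized example tensors are illustrative:

import torch
from torch.fx.experimental.proxy_tensor import make_fx


def fp16_linear(x, w, b):
    # Mirrors `replace_fn_wrapper` below: cast in, compute in fp16, cast back out.
    return torch.nn.functional.linear(x.to(torch.float16), w.to(torch.float16), b.to(torch.float16)).float()


# Zero-sized tensors, the same trick FN_ARGS_MAPPING uses to avoid real compute.
gm = make_fx(fp16_linear, pre_dispatch=True)(torch.randn(0, 0), torch.randn(0, 0), torch.randn(0))
gm.print_readable()  # shows the aten.to casts around aten.linear.default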
neural_compressor/torch/algorithms/pt2e_quant/half_precision_rewriter.py

Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from functools import partial
from typing import Any, Callable, Dict, List, Tuple

import torch
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
import torch.ao.quantization.quantizer.xnnpack_quantizer as xpq
from torch.fx import subgraph_rewriter
from torch.fx.experimental.proxy_tensor import make_fx
from torch.fx.subgraph_rewriter import Match
from typing_extensions import TypeAlias

from neural_compressor.common import utils

# =============================================================================
# Search and replace patterns
# =============================================================================
TorchFuncType: TypeAlias = Callable[..., Any]


@dataclass
class PatternPair:
    fn: TorchFuncType
    search_pattern: torch.fx.GraphModule
    replace_pattern: torch.fx.GraphModule


# key: torch func
# value: the tuple of args
FuncArgsMappingType: TypeAlias = Dict[TorchFuncType, Tuple[torch.Tensor, ...]]


# Align with https://pytorch.org/docs/stable/amp.html#cpu-ops-that-can-autocast-to-bfloat16
# TODO: complete the mapping
FN_ARGS_MAPPING: FuncArgsMappingType = {
    torch.nn.functional.linear: (torch.randn(0, 0), torch.randn(0, 0)),  # linear w/o bias
    torch.nn.functional.linear: (torch.randn(0, 0), torch.randn(0, 0), torch.randn(0)),  # linear w/ bias
}
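# NOTE: dict keys must be unique, so the second `linear` entry above overwrites
# the first; as written, only the with-bias pattern is actually registered.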
# TODO: complete the mapping
FN_ATEN_OPS_MAPPING = {
    torch.nn.functional.linear: torch.ops.aten.linear.default,
}

SUPPORTED_OPERATORS = FN_ATEN_OPS_MAPPING.values()


PatternRegistryType: TypeAlias = Dict[TorchFuncType, PatternPair]
HALF_PRECISION_PATTERN_REGISTRY: Dict[torch.dtype, PatternRegistryType] = {torch.float16: {}, torch.bfloat16: {}}

# FP16_PATTERN_REGISTRY: PatternRegistryType = HALF_PRECISION_PATTERN_REGISTRY[torch.float16]
# BF16_PATTERN_REGISTRY: PatternRegistryType = HALF_PRECISION_PATTERN_REGISTRY[torch.bfloat16]


def pattern_factory(fn: TorchFuncType, fn_arg: Tuple[torch.Tensor, ...], target_dtype: torch.dtype = torch.float16):
    """Create a pair of search and replace patterns for a given torch function and its arguments."""
    assert target_dtype in [
        torch.float16,
        torch.bfloat16,
    ], f"target_dtype should either be `torch.float16` or `torch.bfloat16`, but got {target_dtype}"

    def replace_fn_wrapper(fn_args, fn):
        converted_args = [arg.to(target_dtype) for arg in fn_args]
        target_dtype_out = fn(*converted_args)
        return target_dtype_out.float()

    replace_fn = partial(replace_fn_wrapper, fn=fn)

    search_pattern_gm = make_fx(fn, pre_dispatch=True)(*fn_arg)
    # TODO: double-check `*fn_args` or `fn_args`
    replace_pattern_gm = make_fx(replace_fn, pre_dispatch=True)(fn_arg)

    pattern_pair = PatternPair(fn, search_pattern_gm, replace_pattern_gm)

    return pattern_pair
def _register_pattern_pair(dtype: torch.dtype) -> None:
    for fn, fn_args in FN_ARGS_MAPPING.items():
        pattern_pair = pattern_factory(fn, fn_args)
        HALF_PRECISION_PATTERN_REGISTRY[dtype][fn] = pattern_pair
    utils.logger.info(
        f"Registered {len(HALF_PRECISION_PATTERN_REGISTRY[dtype])} search and replace patterns for {dtype}."
    )


_register_pattern_pair(torch.float16)


def get_filter_fn(node_list, fn):
    target_op = FN_ATEN_OPS_MAPPING[fn]

    def is_target_node_in_candidate_list(match, original_graph, pattern_graph):
        """Filter the node with target operator in match and check if it is in `node_list`."""
        target_node = None
        for node in pattern_graph.nodes:
            if node.target == target_op:
                target_node = node
                break
        if target_node is None:
            return False
        matched_node = match.nodes_map[target_node]
        return matched_node in node_list

    return is_target_node_in_candidate_list


def apply_single_pattern_pair(gm: torch.fx.GraphModule, pattern_pair: PatternPair, node_list):
    filter_fn = get_filter_fn(node_list, pattern_pair.fn)
    match_and_replacements = subgraph_rewriter.replace_pattern_with_filters(
        gm=gm,
        pattern=pattern_pair.search_pattern,
        replacement=pattern_pair.replace_pattern,
        match_filters=[filter_fn],
    )
    utils.logger.info(f"Found {len(match_and_replacements)} matches.")

    match_list = [Match(anchor=m.anchor, nodes_map=m.nodes_map) for m in match_and_replacements]
    return match_list


def get_unquantized_node_set(gm: torch.fx.GraphModule):
    unquantized_node_set = set()
    for node in gm.graph.nodes:
        if meta := getattr(node, "meta"):
            if quantization_annotation := meta.get(xiq.QUANT_ANNOTATION_KEY):
                if quantization_annotation._annotated:
                    continue
        unquantized_node_set.add(node)
    return unquantized_node_set


def transformation(gm: torch.fx.GraphModule, node_candidate_list: List[str], target_dtype: torch.dtype = torch.float16):
    """Convert the nodes in `node_candidate_list` to `target_dtype` if possible."""
    for pattern_pair in HALF_PRECISION_PATTERN_REGISTRY[target_dtype].values():
        apply_single_pattern_pair(gm, pattern_pair, node_candidate_list)
    utils.logger.info("Half precision conversion is done:")
    gm.print_readable(True)


# =============================================================================
# Utils to parse the node candidate set for half precision conversion
# =============================================================================


def _parse_node_candidate_set_from_user_config(config, gm):
    """Parse the node candidate set from user config."""
    op_type_configs, op_name_configs = config._get_op_name_op_type_config()
    op_type_filters = []
    op_name_filters = []
    for op_type_name, config in op_type_configs.items():
        op_type = getattr(torch.nn, op_type_name)
        if config.act_dtype == "fp16":
            filter = xpq._get_module_type_filter(op_type)
            op_type_filters.append(filter)
    for op_name, config in op_name_configs.items():
        if config.act_dtype == "fp16":
            filter = xpq._get_module_name_filter(op_name)
            op_name_filters.append(filter)
    node_set_from_user_config = set()
    all_filters = op_type_filters + op_name_filters
    for node in gm.graph.nodes:
        if any([filter(node) for filter in all_filters]):
            node_set_from_user_config.add(node)
    return node_set_from_user_config


def get_half_precision_node_set(gm, config):
    """Intersection between `unquantized_node_set` and `node_set_from_user_config`."""
    # TODO: implement it, current return all unquantized_node_set

    node_set_from_user_config = _parse_node_candidate_set_from_user_config(config, gm)
    unquantized_node_set = get_unquantized_node_set(gm)
    possible_node_set = unquantized_node_set.intersection(node_set_from_user_config)
    half_precision_node_set = set()
    for node in possible_node_set:
        if node.target in SUPPORTED_OPERATORS:
            half_precision_node_set.add(node)
    utils.logger.info(f"Found {len(half_precision_node_set)} nodes to convert to half precision.")
    return half_precision_node_set
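Driving the rewriter with an explicit candidate set shows how the INT8/FP16 split is enforced: `get_filter_fn` rejects any match whose linear node is not in the set. A small self-contained sketch (the `Tiny` model and the first-linear selection are illustrative; the real flow derives the set from the user config via `get_half_precision_node_set`):

import torch

from neural_compressor.torch import export
from neural_compressor.torch.algorithms.pt2e_quant import half_precision_rewriter as hp_rewriter


class Tiny(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(10, 10)
        self.fc2 = torch.nn.Linear(10, 10)

    def forward(self, x):
        return self.fc2(self.fc1(x))


gm = export.export_model_for_pt2e_quant(model=Tiny(), example_inputs=(torch.randn(2, 10),))

# Hand-pick only the first linear; matches outside this set are filtered out.
linear_nodes = [n for n in gm.graph.nodes if n.target == torch.ops.aten.linear.default]
hp_rewriter.transformation(gm, {linear_nodes[0]})  # prints the rewritten module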

neural_compressor/torch/utils/utility.py

Lines changed: 1 addition & 7 deletions
@@ -226,11 +226,5 @@ def create_xiq_quantizer_from_pt2e_config(config, is_dynamic=False) -> X86InductorQuantizer:
     # set global
     global_config = _map_inc_config_to_torch_quant_config(config, is_dynamic)
     quantizer.set_global(global_config)
-    # set local
-    for module_or_func_name, local_config in config.local_config.items():
-        local_quant_config = _map_inc_config_to_torch_quant_config(local_config, is_dynamic)
-        if isinstance(module_or_func_name, torch.nn.Module):
-            quantizer.set_module_type_qconfig(module_or_func_name, local_quant_config)
-        else:
-            quantizer.set_function_type_qconfig(module_or_func_name, local_quant_config)
+    # Skip the local config for now (need torch 2.4)
     return quantizer
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
import pytest
import torch
import torch.testing._internal.common_quantization as torch_test_quant_common

from neural_compressor.torch import export
from neural_compressor.torch import utils as torch_utils
from neural_compressor.torch.algorithms.pt2e_quant import half_precision_rewriter


class TestHalfPrecisionConverter(torch_test_quant_common.QuantizationTestCase):

    @staticmethod
    def build_simple_torch_model_and_example_inputs():
        class SimpleModel(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.fc1 = torch.nn.Linear(10, 20)
                self.fc2 = torch.nn.Linear(20, 10)

            def forward(self, x: torch.Tensor) -> torch.Tensor:
                x = self.fc1(x)
                x = torch.nn.functional.relu(x)
                x = self.fc2(x)
                return x

        model = SimpleModel()
        example_inputs = (torch.randn(10, 10),)
        return model, example_inputs

    @pytest.mark.skipif(
        torch_utils.get_torch_version() <= torch_utils.TORCH_VERSION_2_2_2, reason="Requires torch>=2.3.0"
    )
    def test_quantizer_on_simple_model(self):
        model, example_inputs = self.build_simple_torch_model_and_example_inputs()
        exported_model = export.export_model_for_pt2e_quant(model=model, example_inputs=example_inputs)
        print("Exported model:")
        exported_model.print_readable()
        unquantized_node_set = half_precision_rewriter.get_unquantized_node_set(exported_model)
        print("Before apply half precision rewriter:")
        exported_model.print_readable(True)
        half_precision_rewriter.transformation(exported_model, unquantized_node_set)
        print("After apply half precision rewriter:")
        exported_model.print_readable(True)
        expected_node_occurrence = {
            # 4 `aten.to` for each `aten.linear`
            torch.ops.aten.to.dtype: 8,
            torch.ops.aten.linear.default: 2,
        }
        expected_node_occurrence = {
            torch_test_quant_common.NodeSpec.call_function(k): v for k, v in expected_node_occurrence.items()
        }
        self.checkGraphModuleNodes(exported_model, expected_node_occurrence=expected_node_occurrence)
