
Commit 9b947a7

support autoTP with weight quantization in DS inference path
Signed-off-by: Feng Tian <[email protected]>
1 parent b3edd2f commit 9b947a7

File tree

4 files changed: +200 -2 lines changed

deepspeed/inference/quantization/layers.py
deepspeed/inference/quantization/utils.py
deepspeed/module_inject/__init__.py
tests/unit/inference/test_inference.py
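
Note: the flow this commit enables is AutoTP sharding first (replace_with_kernel_inject=False), followed by group-wise weight quantization of the already-sharded model. The sketch below is distilled from the TestAutoTPwithWeightQuant case added in this commit; the model name, dtype, mp_size, prompt, and generation kwargs are only illustrative values, and the script is assumed to run under the deepspeed launcher so each rank sees LOCAL_RANK.

import os
import torch
import deepspeed
from transformers import pipeline
from deepspeed.accelerator import get_accelerator
from deepspeed.inference.quantization.quantization import _init_group_wise_weight_quantization

local_rank = int(os.getenv("LOCAL_RANK", "0"))

# Load on CPU first; AutoTP shards the linear layers across ranks at init_inference time.
pipe = pipeline("text-generation",
                model="tiiuae/falcon-7b",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True,
                device=torch.device("cpu"),
                framework="pt")

# AutoTP path: tensor parallelism without kernel injection.
pipe.model = deepspeed.init_inference(pipe.model, mp_size=2, replace_with_kernel_inject=False)

# Quantize the weights of the already-sharded model group-wise (INT4, asymmetric),
# using the same config the new unit test uses.
ds_config = {
    "weight_quantization": {
        "post_init_quant": {
            '*': {
                'num_bits': 4,
                'group_size': 32,
                'group_dim': 1,
                'symmetric': False
            },
        }
    }
}
pipe.model = _init_group_wise_weight_quantization(pipe.model, ds_config)

# Move the pipeline's input tensors to the accelerator before generating.
pipe.device = torch.device(get_accelerator().device_name(local_rank))
print(local_rank, pipe("DeepSpeed is", do_sample=False, max_new_tokens=20))

Run with two ranks, this should exercise the new QuantizedLinearAllreduce / QuantizedLmHeadLinearAllreduce paths, since AutoTP will have injected those layer types before quantization is applied.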

deepspeed/inference/quantization/layers.py

Lines changed: 105 additions & 0 deletions

@@ -53,7 +53,67 @@ def __init__(self, config: Dict, pre_quant_layer: nn.Linear) -> None:
                                               device=pre_quant_layer.weight.device,
                                               dtype=pre_quant_layer.weight.dtype)
         self.config = config
+        self.quantizer = Quantizer(config=config)
+        self.bias = pre_quant_layer.bias
+        self.weight = get_quantized_weight_wrapper(self, pre_quant_layer.weight,
+                                                   get_quantize_weight_fn(self.quantizer, pre_quant_layer.weight))
+
+        self.weight.dequantizer = DeQuantizer(config, pre_quant_layer.weight.dtype)
+
+    def forward(self, input: Tensor) -> Tensor:
+        quantized_weight, quant_scale, quant_min = self.weight.deconcat(self.weight)
+        temp_dequantized_weight = self.weight.dequantizer.dequantize(quantized_weight.view(torch.uint8), quant_scale,
+                                                                     quant_min)
+
+        # !!! Do not use torch.functional.linear(input, temp_dequantized_weight, self.bias) here: under ZeRO-3,
+        # torch.functional.linear is replaced by LinearFunctionForZeroStage3, which assumes the weight is
+        # non-temporary. If the weight is a temporary buffer there will be a memory leak.
+        return torch._C._nn.linear(input, temp_dequantized_weight, self.bias)
+
+
+class QuantizedLinearAllreduce(nn.Linear):
+
+    def __init__(self, config: Dict, pre_quant_layer: nn.Linear) -> None:
+        super(QuantizedLinearAllreduce, self).__init__(in_features=pre_quant_layer.weight.shape[1],
+                                                       out_features=pre_quant_layer.weight.shape[0],
+                                                       bias=pre_quant_layer.bias is not None,
+                                                       device=pre_quant_layer.weight.device,
+                                                       dtype=pre_quant_layer.weight.dtype)
+        self.config = config
+        self.mp_group = pre_quant_layer.mp_group if hasattr(pre_quant_layer, 'mp_group') else None
+        self.quantizer = Quantizer(config=config, mp_group=self.mp_group)
+        self.bias = pre_quant_layer.bias
+        self.weight = get_quantized_weight_wrapper(self, pre_quant_layer.weight,
+                                                   get_quantize_weight_fn(self.quantizer, pre_quant_layer.weight))
+
+        self.weight.dequantizer = DeQuantizer(config, pre_quant_layer.weight.dtype)
+
+    def forward(self, input: Tensor) -> Tensor:
+        quantized_weight, quant_scale, quant_min = self.weight.deconcat(self.weight)
+        temp_dequantized_weight = self.weight.dequantizer.dequantize(quantized_weight.view(torch.uint8), quant_scale,
+                                                                     quant_min)
 
+        # !!! Do not use torch.functional.linear(input, temp_dequantized_weight, self.bias) here: under ZeRO-3,
+        # torch.functional.linear is replaced by LinearFunctionForZeroStage3, which assumes the weight is
+        # non-temporary. If the weight is a temporary buffer there will be a memory leak.
+        output = torch._C._nn.linear(input, temp_dequantized_weight)
+        if self.mp_group is not None:
+            from deepspeed import comm as dist
+            dist.inference_all_reduce(output, group=self.mp_group)
+        if self.bias is not None:
+            output += self.bias
+        return output
+
+
+class QuantizedLinearLayer(nn.Linear):
+
+    def __init__(self, config: Dict, pre_quant_layer: nn.Linear) -> None:
+        super(QuantizedLinearLayer, self).__init__(in_features=pre_quant_layer.weight.shape[1],
+                                                   out_features=pre_quant_layer.weight.shape[0],
+                                                   bias=pre_quant_layer.bias is not None,
+                                                   device=pre_quant_layer.weight.device,
+                                                   dtype=pre_quant_layer.weight.dtype)
+        self.config = config
         self.quantizer = Quantizer(config=config)
         self.bias = pre_quant_layer.bias
         self.weight = get_quantized_weight_wrapper(self, pre_quant_layer.weight,

@@ -72,6 +132,46 @@ def forward(self, input: Tensor) -> Tensor:
         return torch._C._nn.linear(input, temp_dequantized_weight, self.bias)
 
 
+class QuantizedLmHeadLinearAllreduce(nn.Linear):
+
+    def __init__(self, config: Dict, pre_quant_layer: nn.Linear) -> None:
+        super(QuantizedLmHeadLinearAllreduce, self).__init__(in_features=pre_quant_layer.weight.shape[1],
+                                                             out_features=pre_quant_layer.weight.shape[0],
+                                                             bias=pre_quant_layer.bias is not None,
+                                                             device=pre_quant_layer.weight.device,
+                                                             dtype=pre_quant_layer.weight.dtype)
+        self.config = config
+        self.quantizer = Quantizer(config=config)
+        self.bias = pre_quant_layer.bias
+        self.rank = pre_quant_layer.rank
+        self.world_size = pre_quant_layer.world_size
+        self.weight = get_quantized_weight_wrapper(self, pre_quant_layer.weight,
+                                                   get_quantize_weight_fn(self.quantizer, pre_quant_layer.weight))
+
+        self.weight.dequantizer = DeQuantizer(config, pre_quant_layer.weight.dtype)
+
+    def forward(self, input: Tensor) -> Tensor:
+        quantized_weight, quant_scale, quant_min = self.weight.deconcat(self.weight)
+        temp_dequantized_weight = self.weight.dequantizer.dequantize(quantized_weight.view(torch.uint8), quant_scale,
+                                                                     quant_min)
+        from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list
+        input_shard_size = get_shard_size(input.shape[-1], self.world_size)
+        input_shard_offset = sum(get_shard_size_list(input.shape[-1], self.world_size)[0:self.rank])
+
+        # !!! Do not use torch.functional.linear(input, temp_dequantized_weight, self.bias) here: under ZeRO-3,
+        # torch.functional.linear is replaced by LinearFunctionForZeroStage3, which assumes the weight is
+        # non-temporary. If the weight is a temporary buffer there will be a memory leak.
+        output = torch._C._nn.linear(input[:, :, input_shard_offset:input_shard_offset + input_shard_size],
+                                     temp_dequantized_weight.transpose(-1, -2))
+
+        if self.mp_group is not None:
+            from deepspeed import comm as dist
+            dist.inference_all_reduce(output, group=self.mp_group)
+        if self.bias is not None:
+            output += self.bias
+        return output
+
+
 class QuantizedEmbedding(nn.Embedding):
 
     def __init__(self, config: Dict, pre_quant_layer: nn.Embedding) -> None:

@@ -108,7 +208,12 @@ def forward(self, input: Tensor) -> Tensor:
                            self.scale_grad_by_freq, self.sparse)
 
 
+from ...module_inject import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce
+
 QUANTIZATION_LAYER_MAPPINGS = {
     nn.Linear: QuantizedLinear,
     nn.Embedding: QuantizedEmbedding,
+    LinearAllreduce: QuantizedLinearAllreduce,
+    LinearLayer: QuantizedLinearLayer,
+    LmHeadLinearAllreduce: QuantizedLmHeadLinearAllreduce
 }
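
The three new entries in QUANTIZATION_LAYER_MAPPINGS are what let the post-init quantization pass recognize the layers AutoTP injects (LinearAllreduce, LinearLayer, LmHeadLinearAllreduce) in addition to plain nn.Linear and nn.Embedding. As a rough illustration of how such a type-to-wrapper mapping can drive module replacement (a simplified sketch, not DeepSpeed's actual _init_group_wise_weight_quantization implementation; replace_with_quantized is a hypothetical helper):

from typing import Dict, Type
import torch.nn as nn

def replace_with_quantized(model: nn.Module, config: Dict,
                           mappings: Dict[Type[nn.Module], Type[nn.Module]]) -> nn.Module:
    # Recursively walk the module tree; whenever a child's type appears in the
    # mapping, construct the quantized wrapper from the original ("pre-quant")
    # layer so it can take over the weight, bias and, for the AutoTP layers,
    # the tensor-parallel group / rank information.
    for name, child in model.named_children():
        if type(child) in mappings:
            setattr(model, name, mappings[type(child)](config, child))
        else:
            replace_with_quantized(child, config, mappings)
    return model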

deepspeed/inference/quantization/utils.py

Lines changed: 2 additions & 1 deletion

@@ -42,8 +42,9 @@ def tensor_round(tensor: Tensor) -> Tensor:
 
 class Quantizer:
 
-    def __init__(self, config: Dict) -> None:
+    def __init__(self, config: Dict, mp_group=None) -> None:
         self.config = config
+        self.mp_group = mp_group
         assert self.config['num_bits'] == 4 or self.config[
             'num_bits'] == 8, 'Only INT4 and INT8 quantization is supported.'
         assert self.config['symmetric'] == False, 'Only asymmetric quantization is supported at this moment.'
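
The Quantizer now optionally carries the tensor-parallel group, and its asserts still restrict it to asymmetric INT4/INT8. A small sketch of a per-layer config that satisfies those asserts, using the same group-wise settings as the new unit test (that these four keys are all a given code path needs at construction time is an assumption here):

from deepspeed.inference.quantization.utils import Quantizer

quant_config = {
    'num_bits': 4,        # must be 4 or 8
    'group_size': 32,     # group-wise scaling, as in the test config below
    'group_dim': 1,
    'symmetric': False,   # only asymmetric quantization is supported
}

# mp_group defaults to None; the AutoTP wrappers above pass their own group.
quantizer = Quantizer(config=quant_config, mp_group=None)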

deepspeed/module_inject/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -6,5 +6,5 @@
 from .replace_module import replace_transformer_layer, revert_transformer_layer, ReplaceWithTensorSlicing, GroupQuantizer, generic_injection
 from .module_quantize import quantize_transformer_layer
 from .replace_policy import HFBertLayerPolicy
-from .layers import LinearAllreduce, LinearLayer, EmbeddingLayer, Normalize
+from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce, EmbeddingLayer, Normalize
 from .policy import DSPolicy
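
With LmHeadLinearAllreduce exported here, the relative import added in layers.py above (and any external code) can pull all three AutoTP layer types from deepspeed.module_inject:

from deepspeed.module_inject import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce

# These are exactly the key types added to QUANTIZATION_LAYER_MAPPINGS above, so a
# sharded model's AutoTP layers can be matched by type during post-init quantization.
autotp_layer_types = (LinearAllreduce, LinearLayer, LmHeadLinearAllreduce)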

tests/unit/inference/test_inference.py

Lines changed: 92 additions & 0 deletions

@@ -512,6 +512,98 @@ def test(
         assert assert_fn(bs_output, ds_output)
 
 
+@pytest.mark.seq_inference
+@pytest.mark.parametrize("model_w_task", [("tiiuae/falcon-7b", "text-generation")], ids=["falcon"])
+class TestAutoTP(DistributedTest):
+    world_size = 1
+
+    def test(
+        self,
+        model_w_task,
+        query,
+        inf_kwargs,
+        assert_fn,
+    ):
+        # TODO: enable this test for H100 tests
+        pytest.skip("Not enough GPU memory for this on V100 runners")
+        model, task = model_w_task
+        dtype = torch.bfloat16
+        local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+        # We have to load these large models on CPU with pipeline because not
+        # enough GPU memory
+        tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
+        pipe = pipeline(task,
+                        model=model,
+                        tokenizer=tokenizer,
+                        torch_dtype=dtype,
+                        trust_remote_code=True,
+                        device=torch.device("cpu"),
+                        framework="pt")
+        #bs_output = pipe(query, **inf_kwargs)
+
+        pipe.model = deepspeed.init_inference(pipe.model, mp_size=self.world_size, replace_with_kernel_inject=False)
+        # Switch device to GPU so that input tensors are not on CPU
+        pipe.device = torch.device(get_accelerator().device_name(local_rank))
+        ds_output = pipe(query, **inf_kwargs)
+
+        #print(local_rank, "baseline", bs_output)
+        print(local_rank, "deepspeed", ds_output)
+        #assert assert_fn(bs_output, ds_output)
+
+
+@pytest.mark.seq_inference
+@pytest.mark.parametrize("model_w_task", [("tiiuae/falcon-7b", "text-generation")], ids=["falcon"])
+class TestAutoTPwithWeightQuant(DistributedTest):
+    world_size = 2
+
+    def test(
+        self,
+        model_w_task,
+        query,
+        inf_kwargs,
+        assert_fn,
+    ):
+        # TODO: enable this test for H100 tests
+        pytest.skip("Not enough GPU memory for this on V100 runners")
+        model, task = model_w_task
+        dtype = torch.bfloat16
+        local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+        # We have to load these large models on CPU with pipeline because not
+        # enough GPU memory
+        tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
+        pipe = pipeline(task,
+                        model=model,
+                        tokenizer=tokenizer,
+                        torch_dtype=dtype,
+                        trust_remote_code=True,
+                        device=torch.device("cpu"),
+                        framework="pt")
+
+        pipe.model = deepspeed.init_inference(pipe.model, mp_size=self.world_size, replace_with_kernel_inject=False)
+        ds_config = {
+            "weight_quantization": {
+                "post_init_quant": {
+                    '*': {
+                        'num_bits': 4,
+                        'group_size': 32,
+                        'group_dim': 1,
+                        'symmetric': False
+                    },
+                }
+            }
+        }
+        from deepspeed.inference.quantization.quantization import _init_group_wise_weight_quantization
+        pipe.model = _init_group_wise_weight_quantization(pipe.model, ds_config)
+        pipe.device = torch.device(get_accelerator().device_name(local_rank))
+        ds_output = pipe(query, **inf_kwargs)
+
+        #print(local_rank, "baseline", bs_output)
+        print(local_rank, "deepspeed", ds_output)
+        #assert assert_fn(bs_output, ds_output)
+
+
 @pytest.mark.seq_inference
 @pytest.mark.parametrize(
     "model_w_task, injection_policy",
