Skip to content

Commit 07c1fd7

Browse files
Merge pull request #32 from vllm-project/sa/revert_naive_compressor
Revert naive compression format
2 parents e4cf25d + e1a47a3 commit 07c1fd7

File tree

2 files changed

+11
-2
lines changed

2 files changed

+11
-2
lines changed

src/llmcompressor/transformers/compression/quantization_format.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,15 @@ def infer_quantization_format(
             return CompressionFormat.marlin_24
         return CompressionFormat.pack_quantized
     else:  # w8a8 float and int
+        if len(weight_args) == 1:
+            if (
+                weight_args[0].type == QuantizationType.FLOAT.value
+                and weight_args[0].num_bits == 8
+            ):
+                return CompressionFormat.float_quantized
+            if weight_args[0].type == QuantizationType.INT.value:
+                return CompressionFormat.int_quantized
+
         return CompressionFormat.naive_quantized
 else:
     # format will be inferred from config

tests/llmcompressor/transformers/compression/test_infer_quant_format.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,12 @@
 @pytest.mark.parametrize(
     "preset,sparsity_structure,expected_format",
     [
-        ["W8A8", "unstructured", "naive-quantized"],
+        ["W8A8", "unstructured", "int-quantized"],
         ["W8A16", "unstructured", "pack-quantized"],
         ["W8A16", "2:4", "marlin-24"],
         ["W4A16", "unstructured", "pack-quantized"],
         ["W4A16", "2:4", "marlin-24"],
-        ["FP8", "unstructured", "naive-quantized"],
+        ["FP8", "unstructured", "float-quantized"],
     ],
 )
 def test_infer_quant_format(preset, sparsity_structure, expected_format):

0 commit comments

Comments
 (0)