Commit b72a03a

[Tests] Fix GPTQ Tests (vllm-project#1692)
SUMMARY:
- The current tests are failing because, when loading the tinystories model, the lm_head ends up on the "meta" device.
- This model is generally problematic, so we swap to TinyLlama.
- Since the new model is larger, we target just one layer for quantization to contain runtime, and we update the asserts to reflect that only one layer is quantized.

TESTING:
- All tests pass with these changes.
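For orientation, the shorthand recipe below mirrors what the diff switches to: a `re:`-prefixed target tells llm-compressor to match module names by regular expression, so only layer 2's q_proj is quantized instead of every Linear module. This is a minimal sketch based on the diff; the commented-out oneshot call is hypothetical usage for context, not part of this commit.

```python
# Sketch of the single-layer W4A16 recipe used by the updated tests.
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = GPTQModifier(
    ignore=["lm_head"],                               # keep the LM head unquantized
    targets="re:.*model.layers.2.self_attn.q_proj$",  # regex target: only layer 2's q_proj
    scheme="W4A16",
)

# Hypothetical application (the exact oneshot import path varies across
# llm-compressor versions, so it is only indicated here):
# from llmcompressor import oneshot
# oneshot(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", dataset="open_platypus", recipe=recipe)
```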
1 parent c7c9248 commit b72a03a

1 file changed: +10 −9 lines

tests/llmcompressor/transformers/gptq/test_oneshot.py (10 additions, 9 deletions)

@@ -20,14 +20,15 @@
             type: "int"
             symmetric: true
             strategy: "channel"
-        targets: ["Linear"]
+        targets: ["re:.*model.layers.2.self_attn.q_proj$"]
 """

 recipe_modifier_full = GPTQModifier(
     ignore=["lm_head"],
     config_groups={
         "group_0": QuantizationScheme(
-            targets=["Linear"], weights=QuantizationArgs(num_bits=4, strategy="channel")
+            targets=["re:.*model.layers.2.self_attn.q_proj$"],
+            weights=QuantizationArgs(num_bits=4, strategy="channel"),
         )
     },
 )
@@ -36,18 +37,18 @@
     ignore=["lm_head"],
     config_groups={
         "group_0": QuantizationScheme(
-            targets=["Linear"],
+            targets=["re:.*model.layers.2.self_attn.q_proj$"],
             weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128),
         )
     },
 )

 recipe_modifier_shorthand_a = GPTQModifier(
-    ignore=["lm_head"], targets="Linear", scheme="W4A16"
+    ignore=["lm_head"], targets="re:.*model.layers.2.self_attn.q_proj$", scheme="W4A16"
 )

 recipe_modifier_shorthand_b = GPTQModifier(
-    ignore=["lm_head"], scheme={"W4A16": ["Linear"]}
+    ignore=["lm_head"], scheme={"W4A16": ["re:.*model.layers.2.self_attn.q_proj$"]}
 )


@@ -65,7 +66,7 @@ def setUp(self):
         import torch

         self.output = "./oneshot_output"
-        self.model = "Xenova/llama2.c-stories110M"
+        self.model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
         self.dataset = "open_platypus"
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

@@ -95,17 +96,17 @@ def test_oneshot_application(self):
         assert quantization_config is not None

         # check config is set properly
-        assert quantization_config.ignore == ["lm_head"]
+        assert "lm_head" in quantization_config.ignore
         assert len(quantization_config.config_groups) == 1
         quant_scheme = quantization_config.config_groups["group_0"]
         assert isinstance(quant_scheme, QuantizationScheme)
-        assert quant_scheme.targets == ["Linear"]
+        assert quant_scheme.targets == ["re:.*model.layers.2.self_attn.q_proj$"]
         weight_args = quantization_config.config_groups["group_0"].weights
         assert isinstance(weight_args, QuantizationArgs)
         assert weight_args.num_bits == 4

         # Check a specific layer is quantized
-        targetted_linear_layer = model_loaded.model.layers[0].self_attn.k_proj
+        targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj
         assert hasattr(targetted_linear_layer, "quantization_scheme")

         # Check lm-head is not quantized
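As a quick sanity check (not part of the commit), the regex in a `re:` target can be matched against the model's module names directly to confirm it resolves to exactly one module. The sketch below uses only standard transformers APIs and reuses the pattern from the diff; llm-compressor's own matching internals may differ slightly.

```python
import re

from transformers import AutoModelForCausalLM

# Load the model that the tests now use.
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Strip the "re:" prefix and match the remaining pattern against
# the fully qualified module names.
pattern = re.compile(r".*model.layers.2.self_attn.q_proj$")
matches = [name for name, _ in model.named_modules() if pattern.match(name)]

print(matches)  # expected: ['model.layers.2.self_attn.q_proj']
```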
