Describe the bug
When I try to run AWQ on Qwen3 MoE models such as Qwen/Qwen3-30B-A3B, quantization hangs forever.
Expected behavior
I expect the script to quantize the model and save it to OUTPUT_DIR.
Environment
NVIDIA DGX A100
To Reproduce
I used examples/awq/awq_one_shot.py and modified it:
from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationScheme,
    QuantizationStrategy,
    QuantizationType,
)
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "Qwen/Qwen3-30B-A3B"
DATASET_ID = "mit-han-lab/pile-val-backup"
DATASET_SPLIT = "validation"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512
OUTPUT_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"


def get_calib_dataset(tokenizer):
    """Build NUM_CALIBRATION_SAMPLES full-length calibration sequences
    from the Pile validation split."""
    from datasets import load_dataset

    ds = load_dataset(
        DATASET_ID,
        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*100}]",
    )

    def preprocess(example):
        return {
            "input_ids": tokenizer.encode(example["text"].strip())[:MAX_SEQUENCE_LENGTH]
        }

    ds = (
        ds.shuffle(seed=42)
        .map(preprocess, remove_columns=ds.column_names)
        # Keep only samples long enough to fill the full sequence length
        .filter(lambda example: len(example["input_ids"]) >= MAX_SEQUENCE_LENGTH)
        .select(range(NUM_CALIBRATION_SAMPLES))
    )
    return ds


if __name__ == "__main__":
    # AWQ followed by W4A16 asymmetric, group-size-128 quantization of all Linear layers
    recipe = [
        AWQModifier(bits=4, symmetric=False),
        QuantizationModifier(
            # Ignore these layers during quantization
            ignore=[
                "lm_head",
                ".*norm.*",
                ".*gate.*",
            ],
            config_groups={
                "group_0": QuantizationScheme(
                    targets=["Linear"],
                    weights=QuantizationArgs(
                        num_bits=4,
                        type=QuantizationType.INT,
                        dynamic=False,
                        symmetric=False,
                        strategy=QuantizationStrategy.GROUP,
                        group_size=128,
                    ),
                )
            },
        ),
    ]

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, device_map="auto", torch_dtype="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    oneshot(
        model=model,
        dataset=get_calib_dataset(tokenizer=tokenizer),
        recipe=recipe,
        output_dir=OUTPUT_DIR,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )
    print("Done! model saved to", OUTPUT_DIR)
The output
(vllm) dgxuser@linux:~/workspace/llm-compressor$ python qwen_moe_awq.py
Loading checkpoint shards: 100%|████████████████████████████████████████████| 16/16 [00:22<00:00, 1.39s/it]
Repo card metadata block was not found. Setting CardData to empty.
2025-04-30T18:26:27.175014-0700 | reset | INFO - Compression lifecycle reset
2025-04-30T18:26:27.175475-0700 | from_modifiers | INFO - Creating recipe from modifiers
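Nothing is printed after the "Creating recipe from modifiers" line and the process never makes progress. To confirm where it is stuck, a stack dump can be taken while it hangs. A minimal sketch using only the Python standard library; these faulthandler lines are my debugging addition, not part of the reproduction script:

import faulthandler
import sys

# Periodically dump every thread's traceback to stderr; if the process is
# hung, the same blocking frame appears in each dump.
faulthandler.dump_traceback_later(60, repeat=True, file=sys.stderr)

Adding these lines at the top of qwen_moe_awq.py (or running py-spy dump --pid <pid> from another shell) should show which call is blocking.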