AWQ Qwen3-235B-A22B and Qwen3-30B-A3B #1406

@ehartford

Description

Describe the bug
When I try to AWQ these models, the process hangs indefinitely.

Expected behavior
I expect it to quantize the model.

Environment
NVIDIA DGX A100

To Reproduce

I used examples/awq/awq_one_shot.py and modified it as follows:

from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationScheme,
    QuantizationStrategy,
    QuantizationType,
)
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "Qwen/Qwen3-30B-A3B"
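# The same hang occurs with "Qwen/Qwen3-235B-A22B"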
DATASET_ID = "mit-han-lab/pile-val-backup"
DATASET_SPLIT = "validation"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512
OUTPUT_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"


def get_calib_dataset(tokenizer):
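    # Build a calibration set of NUM_CALIBRATION_SAMPLES examples, each truncated
    # to MAX_SEQUENCE_LENGTH tokens, from the pile-val-backup validation split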
    from datasets import load_dataset

    ds = load_dataset(
        DATASET_ID,
        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*100}]",
    )

    def preprocess(example):
        return {
            "input_ids": tokenizer.encode(example["text"].strip())[:MAX_SEQUENCE_LENGTH]
        }

    ds = (
        ds.shuffle(seed=42)
        .map(preprocess, remove_columns=ds.column_names)
        .filter(lambda example: len(example["input_ids"]) >= MAX_SEQUENCE_LENGTH)
        .select(range(NUM_CALIBRATION_SAMPLES))
    )

    return ds


if __name__ == "__main__":
    recipe = [
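        # AWQ computes activation-aware scales; the QuantizationModifier below
        # applies 4-bit asymmetric, group-wise weight quantization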
        AWQModifier(bits=4, symmetric=False),
        QuantizationModifier(
            # Ignore these layers during quantization
            ignore=[
                "lm_head",
                ".*norm.*",
                ".*gate.*",
            ],
            config_groups={
                "group_0": QuantizationScheme(
                    targets=["Linear"],
                    weights=QuantizationArgs(
                        num_bits=4,
                        type=QuantizationType.INT,
                        dynamic=False,
                        symmetric=False,
                        strategy=QuantizationStrategy.GROUP,
                        group_size=128,
                    ),
                )
            },
        ),
    ]

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, device_map="auto", torch_dtype="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Run one-shot calibration; the reported hang occurs after the
    # "Creating recipe from modifiers" log line shown in the output below
    oneshot(
        model=model,
        dataset=get_calib_dataset(tokenizer=tokenizer),
        recipe=recipe,
        output_dir=OUTPUT_DIR,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    print("Done! model saved to", OUTPUT_DIR)

The output, after which the process hangs with no further progress:

(vllm) dgxuser@linux:~/workspace/llm-compressor$ python qwen_moe_awq.py 
Loading checkpoint shards: 100%|████████████████████████████████████████████| 16/16 [00:22<00:00,  1.39s/it]
Repo card metadata block was not found. Setting CardData to empty.
2025-04-30T18:26:27.175014-0700 | reset | INFO - Compression lifecycle reset
2025-04-30T18:26:27.175475-0700 | from_modifiers | INFO - Creating recipe from modifiers

Labels

bug (Something isn't working)
