  5 |   5 |     QuantizationStrategy,
  6 |   6 |     QuantizationType,
  7 |   7 | )
    |   8 | +from datasets import load_dataset
  8 |   9 | from lm_eval.utils import make_table
  9 |  10 | from transformers import AutoModelForCausalLM, AutoTokenizer
 10 |  11 |
 11 |  12 | from llmcompressor import oneshot
 12 |  13 | from llmcompressor.modifiers.awq import AWQModifier
 13 |  14 | from llmcompressor.modifiers.quantization import QuantizationModifier
 14 |  15 |
 15 |     | -# This example demonstrates how to:
 16 |     | -# 1) Run the `llm-compressor` implementation of AWQ
 17 |     | -# 2) Evaluate the compressed model with the lm_eval framework
    |  16 | +# Select model and load it.
    |  17 | +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
    |  18 | +
    |  19 | +model = AutoModelForCausalLM.from_pretrained(
    |  20 | +    MODEL_ID, device_map="auto", torch_dtype="auto"
    |  21 | +)
    |  22 | +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 18 |  23 |
    |  24 | +# Select calibration dataset.
 19 |  25 | MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 20 |  26 | DATASET_ID = "mit-han-lab/pile-val-backup"
 21 |  27 | DATASET_SPLIT = "validation"
    |  28 | +
    |  29 | +# Select number of samples. 256 samples is a good place to start.
    |  30 | +# Increasing the number of samples can improve accuracy.
 22 |  31 | NUM_CALIBRATION_SAMPLES = 256
 23 |  32 | MAX_SEQUENCE_LENGTH = 512
 24 |     | -OUTPUT_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"
 25 |  33 |
 26 |     | -#
 27 |     | -# 1) Run LLM Compressor AWQ implementation
 28 |     | -#
    |  34 | +# Load dataset and preprocess.
    |  35 | +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
    |  36 | +ds = ds.shuffle(seed=42)
    |  37 | +
    |  38 | +
    |  39 | +def preprocess(example):
    |  40 | +    return {
    |  41 | +        "text": tokenizer.apply_chat_template(
    |  42 | +            [{"role": "user", "content": example["text"]}],
    |  43 | +            tokenize=False,
    |  44 | +        )
    |  45 | +    }
    |  46 | +
    |  47 | +
    |  48 | +ds = ds.map(preprocess)
    |  49 | +
    |  50 | +
    |  51 | +# Tokenize inputs.
    |  52 | +def tokenize(sample):
    |  53 | +    return tokenizer(
    |  54 | +        sample["text"],
    |  55 | +        padding=False,
    |  56 | +        max_length=MAX_SEQUENCE_LENGTH,
    |  57 | +        truncation=True,
    |  58 | +        add_special_tokens=False,
    |  59 | +    )
 29 |  60 |
    |  61 | +
    |  62 | +# Configure the quantization algorithm to run.
 30 |  63 | recipe = [
 31 |  64 |     AWQModifier(bits=4, symmetric=False),
 32 |  65 |     QuantizationModifier(

 47 |  80 |     ),
 48 |  81 | ]
 49 |  82 |
 50 |     | -model = AutoModelForCausalLM.from_pretrained(
 51 |     | -    MODEL_ID, device_map="auto", torch_dtype="auto"
 52 |     | -)
 53 |     | -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 54 |     | -
 55 |     | -
 56 |     | -def get_calib_dataset(tokenizer):
 57 |     | -    from datasets import load_dataset
 58 |     | -
 59 |     | -    ds = load_dataset(
 60 |     | -        DATASET_ID,
 61 |     | -        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*100}]",
 62 |     | -    )
 63 |     | -
 64 |     | -    def preprocess(example):
 65 |     | -        return {
 66 |     | -            "input_ids": tokenizer.encode(example["text"].strip())[:MAX_SEQUENCE_LENGTH]
 67 |     | -        }
 68 |     | -
 69 |     | -    ds = (
 70 |     | -        ds.shuffle(seed=42)
 71 |     | -        .map(preprocess, remove_columns=ds.column_names)
 72 |     | -        .filter(lambda example: len(example["input_ids"]) >= MAX_SEQUENCE_LENGTH)
 73 |     | -        .select(range(NUM_CALIBRATION_SAMPLES))
 74 |     | -    )
 75 |     | -
 76 |     | -    return ds
 77 |     | -
 78 |     | -
    |  83 | +# Apply algorithms.
 79 |  84 | oneshot(
 80 |  85 |     model=model,
 81 |     | -    dataset=get_calib_dataset(tokenizer=tokenizer),
    |  86 | +    dataset=ds,
 82 |  87 |     recipe=recipe,
 83 |     | -    output_dir=OUTPUT_DIR,
 84 |  88 |     max_seq_length=MAX_SEQUENCE_LENGTH,
 85 |  89 |     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 86 |  90 | )
 87 |  91 |
 88 |     | -print("Done! model saved to", OUTPUT_DIR)
    |  92 | +# Confirm generations of the quantized model look sane.
    |  93 | +print("\n\n")
    |  94 | +print("========== SAMPLE GENERATION ==============")
    |  95 | +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
    |  96 | +output = model.generate(input_ids, max_new_tokens=100)
    |  97 | +print(tokenizer.decode(output[0]))
    |  98 | +print("==========================================\n\n")
    |  99 | +
    | 100 | +# Save to disk compressed.
    | 101 | +SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"
    | 102 | +model.save_pretrained(SAVE_DIR, save_compressed=True)
    | 103 | +tokenizer.save_pretrained(SAVE_DIR)
 89 | 104 |
 90 | 105 | #
 91 | 106 | # 2) Evaluate model on wikitext perplexity
 92 | 107 | #
 93 | 108 |
 94 | 109 | results = lm_eval.simple_evaluate(
 95 |     | -    model="vllm",
    | 110 | +    model="hf",
 96 | 111 |     model_args={
 97 |     | -        "pretrained": OUTPUT_DIR,
    | 112 | +        "pretrained": SAVE_DIR,
 98 | 113 |         "add_bos_token": True,
 99 | 114 |         "dtype": "bfloat16",
100 | 115 |         "gpu_memory_utilization": 0.5,
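The checkpoint written above with `save_pretrained(SAVE_DIR, save_compressed=True)` is also loadable outside of lm_eval. A minimal usage sketch, assuming vLLM is installed and can load the compressed-tensors checkpoint this example saves (the local path, prompt, and sampling settings are illustrative only, not part of this diff):

# Illustrative follow-up: load the AWQ-compressed checkpoint saved to
# SAVE_DIR with vLLM and run a quick generation to sanity-check it.
from vllm import LLM, SamplingParams

llm = LLM(model="Meta-Llama-3-8B-Instruct-awq-asym")  # assumed local SAVE_DIR path
sampling = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["Hello my name is"], sampling)
print(outputs[0].outputs[0].text)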