Commit c6f28fb

[Bugfix] Disable generation of deepseek models with transformers>=4.48 (#1259)
## Purpose ##

* Generation is broken on deepseek models due to a deprecation by transformers. This PR skips generation in that case.

## Related Issues ##

* huggingface/transformers#36071
* https://huggingface.co/deepseek-ai/DeepSeek-V3/discussions/88

Signed-off-by: Kyle Sayers <[email protected]>
1 parent 82b4bd2 commit c6f28fb
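
All four example diffs below apply the same guard: parse the installed transformers version with packaging.version.Version and run sample generation only when it is older than 4.48. A minimal, self-contained sketch of the check follows; the SUPPORTS_DEEPSEEK_GENERATION flag name is illustrative only, as the commit inlines the comparison directly.

    from packaging.version import Version
    from transformers import __version__

    # Version() compares releases component-wise, so Version("4.9") < Version("4.48")
    # is True, whereas the plain string comparison "4.9" < "4.48" is False.
    SUPPORTS_DEEPSEEK_GENERATION = Version(__version__) < Version("4.48")

    if SUPPORTS_DEEPSEEK_GENERATION:
        print(f"transformers {__version__}: deepseek sample generation will run")
    else:
        print(
            "WARNING: cannot perform sample generation of "
            "deepseek models with transformers >= 4.48"
        )

Using packaging (already pulled in as a transformers dependency) rather than comparing raw version strings is what makes the 4.48 cutoff reliable across patch and dev releases.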

File tree

4 files changed (+63, -28 lines)

* examples/quantizing_moe/deepseek_moe_w4a16.py
* examples/quantizing_moe/deepseek_moe_w8a8_fp8.py
* examples/quantizing_moe/deepseek_moe_w8a8_int8.py
* examples/quantizing_moe/mixtral_moe_w8a8_fp8.py


examples/quantizing_moe/deepseek_moe_w4a16.py

Lines changed: 14 additions & 6 deletions
@@ -1,6 +1,7 @@
 import torch
 from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from packaging.version import Version
+from transformers import AutoModelForCausalLM, AutoTokenizer, __version__
 
 from llmcompressor import oneshot
 from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
@@ -84,11 +85,18 @@ def tokenize(sample):
 )
 
 # Confirm generations of the quantized model look sane.
-print("========== SAMPLE GENERATION ==============")
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=20)
-print(tokenizer.decode(output[0]))
-print("==========================================")
+# Generation is broken for deepseek models when using the latest transformers package
+if Version(__version__) < Version("4.48"):
+    print("========== SAMPLE GENERATION ==============")
+    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+    output = model.generate(input_ids, max_new_tokens=20)
+    print(tokenizer.decode(output[0]))
+    print("==========================================")
+else:
+    print(
+        "WARNING: cannot perform sample generation of "
+        "deepseek models with transformers >= 4.48"
+    )
 
 
 # Run the model on vLLM

examples/quantizing_moe/deepseek_moe_w8a8_fp8.py

Lines changed: 17 additions & 8 deletions
@@ -1,5 +1,6 @@
 from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from packaging.version import Version
+from transformers import AutoModelForCausalLM, AutoTokenizer, __version__
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
@@ -78,10 +79,18 @@ def tokenize(sample):
     output_dir=SAVE_DIR,
 )
 
-print("========== SAMPLE GENERATION ==============")
-SAMPLE_INPUT = ["I love quantization because"]
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
-output = model.generate(**inputs, max_length=50)
-text_output = tokenizer.batch_decode(output)
-print(text_output)
+# Confirm generations of the quantized model look sane.
+# Generation is broken for deepseek models when using the latest transformers package
+if Version(__version__) < Version("4.48"):
+    print("========== SAMPLE GENERATION ==============")
+    SAMPLE_INPUT = ["I love quantization because"]
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
+    output = model.generate(**inputs, max_length=50)
+    text_output = tokenizer.batch_decode(output)
+    print(text_output)
+else:
+    print(
+        "WARNING: cannot perform sample generation of "
+        "deepseek models with transformers >= 4.48"
+    )

examples/quantizing_moe/deepseek_moe_w8a8_int8.py

Lines changed: 18 additions & 8 deletions
@@ -1,6 +1,7 @@
 import torch
 from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from packaging.version import Version
+from transformers import AutoModelForCausalLM, AutoTokenizer, __version__
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
@@ -90,10 +91,19 @@ def tokenize(sample):
     output_dir=SAVE_DIR,
 )
 
-print("========== SAMPLE GENERATION ==============")
-SAMPLE_INPUT = ["I love quantization because"]
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
-output = model.generate(**inputs, max_length=50)
-text_output = tokenizer.batch_decode(output)
-print(text_output)
+# Confirm generations of the quantized model look sane.
+# Generation is broken for deepseek models when using the latest transformers package
+if Version(__version__) < Version("4.48"):
+    print("========== SAMPLE GENERATION ==============")
+    SAMPLE_INPUT = ["I love quantization because"]
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
+    output = model.generate(**inputs, max_length=50)
+    text_output = tokenizer.batch_decode(output)
+    print(text_output)
+    print("==========================================")
+else:
+    print(
+        "WARNING: cannot perform sample generation of "
+        "deepseek models with transformers >= 4.48"
+    )

examples/quantizing_moe/mixtral_moe_w8a8_fp8.py

Lines changed: 14 additions & 6 deletions
@@ -1,6 +1,7 @@
 from typing import List
 
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from packaging.version import Version
+from transformers import AutoModelForCausalLM, AutoTokenizer, __version__
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
@@ -49,8 +50,15 @@
 )
 
 # Confirm generations of the quantized model look sane.
-print("========== SAMPLE GENERATION ==============")
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=20)
-print(tokenizer.decode(output[0]))
-print("==========================================")
+# Generation is broken for deepseek models when using the latest transformers package
+if Version(__version__) < Version("4.48"):
+    print("========== SAMPLE GENERATION ==============")
+    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+    output = model.generate(input_ids, max_new_tokens=20)
+    print(tokenizer.decode(output[0]))
+    print("==========================================")
+else:
+    print(
+        "WARNING: cannot perform sample generation of "
+        "deepseek models with transformers >= 4.48"
+    )
