Your current environment
vllm@cf069aa
🐛 Describe the bug
Running models using the transformers fallback fails if vllm_config.model_config.hf_config does not contain head_dim. For example, using Qwen/Qwen2.5-0.5B-Instruct:
from vllm import LLM, SamplingParams
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", model_impl="transformers")
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Error:
[rank0]: Traceback (most recent call last):
[rank0]: File "/scratch/test_qwen.py", line 20, in <module>
[rank0]: llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", enable_prefix_caching=False, compilation_config=3, model_impl="transformers")
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/utils.py", line 1045, in inner
[rank0]: return fn(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/entrypoints/llm.py", line 243, in __init__
[rank0]: self.llm_engine = self.engine_class.from_engine_args(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/engine/llm_engine.py", line 494, in from_engine_args
[rank0]: engine = cls(
[rank0]: ^^^^
[rank0]: File "/scratch/vllm/vllm/engine/llm_engine.py", line 274, in __init__
[rank0]: self.model_executor = executor_class(vllm_config=vllm_config, )
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/executor/executor_base.py", line 52, in __init__
[rank0]: self._init_executor()
[rank0]: File "/scratch/vllm/vllm/executor/uniproc_executor.py", line 47, in _init_executor
[rank0]: self.collective_rpc("load_model")
[rank0]: File "/scratch/vllm/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
[rank0]: answer = run_method(self.driver_worker, method, args, kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/utils.py", line 2232, in run_method
[rank0]: return func(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/worker/worker.py", line 183, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/scratch/vllm/vllm/worker/model_runner.py", line 1113, in load_model
[rank0]: self.model = get_model(vllm_config=self.vllm_config)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/model_executor/model_loader/__init__.py", line 14, in get_model
[rank0]: return loader.load_model(vllm_config=vllm_config)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/model_executor/model_loader/loader.py", line 416, in load_model
[rank0]: model = _initialize_model(vllm_config=vllm_config)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/model_executor/model_loader/loader.py", line 126, in _initialize_model
[rank0]: return model_class(vllm_config=vllm_config, prefix=prefix)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/model_executor/models/transformers.py", line 152, in __init__
[rank0]: head_size=config.head_dim,
[rank0]: ^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/transformers/configuration_utils.py", line 214, in __getattribute__
[rank0]: return super().__getattribute__(key)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: AttributeError: 'Qwen2Config' object has no attribute 'head_dim'
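For reference, a quick hypothetical check (not part of the original report) confirms the attribute is simply absent from the Qwen2 config object on the transformers version in use:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
# Qwen2Config exposes hidden_size and num_attention_heads, but no head_dim
# attribute on the transformers version used here.
print(hasattr(config, "head_dim"))  # False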
I think it's fine to calculate the head size if head_dim does not exist. Using this diff the model works:
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -146,13 +146,14 @@ class TransformersModel(nn.Module, SupportsQuant, SupportsLoRA):
         # Attention modifications (assumes 1 attention op per hidden layer)
         tp_size = get_tensor_model_parallel_world_size()
+        head_size = vllm_config.model_config.get_head_size()
         self.attention_instances = [
             Attention(
                 num_heads=divide(config.num_attention_heads, tp_size),
-                head_size=config.head_dim,
+                head_size=head_size,
                 # NOTE: We use Llama scale as default, if it's set by
                 # Transformers, it's updated in vllm_flash_attention_forward
-                scale=config.head_dim**-0.5,
+                scale=head_size**-0.5,
                 num_kv_heads=divide(config.num_key_value_heads, tp_size),
                 cache_config=cache_config,
                 quant_config=self.quant_config,
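As a rough sketch of what "calculate the head size" means here (assuming the common convention that, when head_dim is absent, head size defaults to hidden_size // num_attention_heads, which is roughly what ModelConfig.get_head_size falls back to):

from transformers import AutoConfig

config = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
# Prefer an explicit head_dim; otherwise derive it from the hidden size and head count.
head_size = getattr(config, "head_dim", None)
if head_size is None:
    head_size = config.hidden_size // config.num_attention_heads
print(head_size)  # 896 // 14 == 64 for Qwen2.5-0.5B-Instruct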