Labels: bug (Something isn't working)
Your current environment
Encountered while running the test suite on the main branch.
🐛 Describe the bug
Two tests fail with the traceback below:
```
___________________________________________ test_model_loading_with_params ___________________________________________

vllm_runner = <class 'tests.conftest.VllmRunner'>

    @pytest.mark.skipif(current_platform.is_rocm(),
                        reason="Xformers backend is not supported on ROCm.")
    def test_model_loading_with_params(vllm_runner):
        """
        Test parameter weight loading with tp>1.
        """
        with vllm_runner(model_name=MODEL_NAME,
                         revision=REVISION,
                         dtype="float16",
                         max_model_len=MAX_MODEL_LEN) as vllm_model:
            output = vllm_model.encode("Write a short story about a robot that"
                                       " dreams for the first time.\n")
            model_config = vllm_model.model.llm_engine.model_config
            model_tokenizer = vllm_model.model.llm_engine.tokenizer

            # asserts on the bert model config file
            assert model_config.encoder_config["max_seq_length"] == 512
            assert model_config.encoder_config["do_lower_case"]

            # asserts on the pooling config files
            assert model_config.pooler_config.pooling_type == PoolingType.CLS.name
>           assert model_config.pooler_config.pooling_norm
E           AttributeError: 'PoolerConfig' object has no attribute 'pooling_norm'. Did you mean: 'pooling_type'?

tests/model_executor/test_model_load_with_params.py:43: AttributeError
------------------------------------------------- Captured stdout call -------------------------------------------------
INFO 05-27 12:31:33 [__init__.py:31] Available plugins for group vllm.general_plugins:
INFO 05-27 12:31:33 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
INFO 05-27 12:31:33 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
INFO 05-27 12:32:10 [config.py:577] Found sentence-transformers tokenize configuration.
INFO 05-27 12:32:10 [config.py:3131] Downcasting torch.float32 to torch.float16.
INFO 05-27 12:32:18 [config.py:473] Found sentence-transformers modules configuration.
INFO 05-27 12:32:18 [config.py:493] Found pooling configuration.
INFO 05-27 12:32:18 [config.py:793] This model supports multiple tasks: {'reward', 'embed', 'classify', 'score'}. Defaulting to 'embed'.
WARNING 05-27 12:32:18 [arg_utils.py:1583] --task embed is not supported by the V1 Engine. Falling back to V0.
INFO 05-27 12:32:18 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1.dev148+gfba064270.d20250526) with config: model='BAAI/bge-base-en-v1.5', speculative_config=None, tokenizer='BAAI/bge-base-en-v1.5', skip_tokenizer_init=False, tokenizer_mode=auto, revision=main, override_neuron_config={}, tokenizer_revision=main, trust_remote_code=True, dtype=torch.float16, max_seq_len=128, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=BAAI/bge-base-en-v1.5, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=False, pooler_config=PoolerConfig(pooling_type='CLS', normalize=True, softmax=None, step_tag_id=None, returned_token_ids=None), compilation_config={"compile_sizes": [], "inductor_compile_config": {"enable_auto_functionalized_v2": false}, "cudagraph_capture_sizes": [256, 248, 240, 232, 224, 216, 208, 200, 192, 184, 176, 168, 160, 152, 144, 136, 128, 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 4, 2, 1], "max_capture_size": 256}, use_cached_outputs=False,
INFO 05-27 12:32:18 [cuda.py:292] Using Flash Attention backend.
INFO 05-27 12:32:19 [parallel_state.py:1064] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 05-27 12:32:19 [model_runner.py:1170] Starting to load model BAAI/bge-base-en-v1.5...
INFO 05-27 12:32:19 [weight_utils.py:291] Using model weights format ['*.safetensors']
INFO 05-27 12:32:20 [weight_utils.py:344] No model.safetensors.index.json found in remote.
INFO 05-27 12:32:20 [default_loader.py:280] Loading weights took 0.08 seconds
INFO 05-27 12:32:20 [model_runner.py:1202] Model loading took 0.2091 GiB and 0.738950 seconds
------------------------------------------------- Captured stderr call -------------------------------------------------
Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 13.57it/s]
Adding requests: 100%|██████████| 66/66 [00:00<00:00, 3031.89it/s]
Processed prompts: 100%|██████████| 66/66 [00:00<00:00, 176.21it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
________________________________________ test_roberta_model_loading_with_params ________________________________________
vllm_runner = <class 'tests.conftest.VllmRunner'>

    @pytest.mark.skipif(current_platform.is_rocm(),
                        reason="Xformers backend is not supported on ROCm.")
    def test_roberta_model_loading_with_params(vllm_runner):
        """
        Test parameter weight loading with tp>1.
        """
        with vllm_runner(model_name=MODEL_NAME_ROBERTA,
                         revision=REVISION_ROBERTA,
                         dtype="float16",
                         max_model_len=MAX_MODEL_LEN) as vllm_model:
            output = vllm_model.encode("Write a short story about a robot that"
                                       " dreams for the first time.\n")
            model_config = vllm_model.model.llm_engine.model_config
            model_tokenizer = vllm_model.model.llm_engine.tokenizer

            # asserts on the bert model config file
            assert model_config.encoder_config["max_seq_length"] == 512
            assert not model_config.encoder_config["do_lower_case"]

            # asserts on the pooling config files
            assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
>           assert model_config.pooler_config.pooling_norm
E           AttributeError: 'PoolerConfig' object has no attribute 'pooling_norm'. Did you mean: 'pooling_type'?

tests/model_executor/test_model_load_with_params.py:83: AttributeError
------------------------------------------------- Captured stdout call -------------------------------------------------
INFO 05-27 12:32:21 [config.py:577] Found sentence-transformers tokenize configuration.
INFO 05-27 12:32:21 [config.py:3131] Downcasting torch.float32 to torch.float16.
INFO 05-27 12:32:21 [config.py:473] Found sentence-transformers modules configuration.
INFO 05-27 12:32:21 [config.py:493] Found pooling configuration.
INFO 05-27 12:32:21 [config.py:793] This model supports multiple tasks: {'reward', 'embed', 'classify', 'score'}. Defaulting to 'embed'.
WARNING 05-27 12:32:21 [arg_utils.py:1583] --task embed is not supported by the V1 Engine. Falling back to V0.
INFO 05-27 12:32:21 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1.dev148+gfba064270.d20250526) with config: model='intfloat/multilingual-e5-small', speculative_config=None, tokenizer='intfloat/multilingual-e5-small', skip_tokenizer_init=False, tokenizer_mode=auto, revision=main, override_neuron_config={}, tokenizer_revision=main, trust_remote_code=True, dtype=torch.float16, max_seq_len=128, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=intfloat/multilingual-e5-small, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=False, pooler_config=PoolerConfig(pooling_type='MEAN', normalize=True, softmax=None, step_tag_id=None, returned_token_ids=None), compilation_config={"compile_sizes": [], "inductor_compile_config": {"enable_auto_functionalized_v2": false}, "cudagraph_capture_sizes": [256, 248, 240, 232, 224, 216, 208, 200, 192, 184, 176, 168, 160, 152, 144, 136, 128, 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 4, 2, 1], "max_capture_size": 256}, use_cached_outputs=False,
INFO 05-27 12:32:23 [cuda.py:292] Using Flash Attention backend.
INFO 05-27 12:32:23 [parallel_state.py:1064] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 05-27 12:32:23 [model_runner.py:1170] Starting to load model intfloat/multilingual-e5-small...
INFO 05-27 12:32:24 [weight_utils.py:291] Using model weights format ['*.safetensors']
INFO 05-27 12:32:24 [weight_utils.py:344] No model.safetensors.index.json found in remote.
INFO 05-27 12:32:24 [default_loader.py:280] Loading weights took 0.06 seconds
INFO 05-27 12:32:24 [model_runner.py:1202] Model loading took 0.2204 GiB and 1.087685 seconds
------------------------------------------------- Captured stderr call -------------------------------------------------
Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 18.67it/s]
Adding requests: 100%|██████████| 66/66 [00:00<00:00, 11180.74it/s]
Processed prompts: 100%|██████████| 66/66 [00:00<00:00, 2941.53it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
=============================================== short test summary info ================================================
FAILED tests/model_executor/test_model_load_with_params.py::test_model_loading_with_params - AttributeError: 'PoolerConfig' object has no attribute 'pooling_norm'. Did you mean: 'pooling_type'?
FAILED tests/model_executor/test_model_load_with_params.py::test_roberta_model_loading_with_params - AttributeError: 'PoolerConfig' object has no attribute 'pooling_norm'. Did you mean: 'pooling_type'?
======================================== 2 failed, 1 passed in 64.00s (0:01:04) ========================================
```
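This looks like fallout from a `PoolerConfig` field rename: the engine log above prints `pooler_config=PoolerConfig(pooling_type='CLS', normalize=True, softmax=None, step_tag_id=None, returned_token_ids=None)`, i.e. the config object now carries `normalize` where the tests still reference the old `pooling_norm` attribute. A minimal sketch of what the updated assertions would presumably look like, assuming `normalize` is the replacement field (not verified against main):

```python
from vllm.config import PoolerConfig

# Mirror what the failing assertions check, but using the field name that
# actually appears in the engine log (normalize=True, softmax=None).
# Hypothetical fix sketch, not a confirmed patch.
cfg = PoolerConfig(pooling_type="CLS", normalize=True)

assert cfg.pooling_type == "CLS"
assert cfg.normalize                      # old tests used: cfg.pooling_norm
assert not hasattr(cfg, "pooling_norm")   # removed attribute -> AttributeError
```

If that rename is indeed the cause, the two `assert model_config.pooler_config.pooling_norm` lines in `tests/model_executor/test_model_load_with_params.py` (lines 43 and 83 per the traceback) would just need to switch to `pooler_config.normalize`.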
Before submitting a new issue...
- Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.