Skip to content

Commit be50bc8

Browse files
Isotr0py and LyrisZhong
authored and committed
[Bugfix] Fix nightly transformers CI failure (vllm-project#21427)
Signed-off-by: Isotr0py <[email protected]>
1 parent f936bf3 commit be50bc8

File tree

5 files changed

+67
-11
lines changed

5 files changed

+67
-11
lines changed

tests/models/registry.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -443,20 +443,20 @@ def check_available_online(
443443
hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501
444444
"Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501
445445
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501
446+
"VoxtralForConditionalGeneration": _HfExamplesInfo(
447+
"mistralai/Voxtral-Mini-3B-2507",
448+
min_transformers_version="4.54",
449+
# disable this temporarily until we support HF format
450+
is_available_online=False,
451+
),
446452
# [Encoder-decoder]
447453
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
448454
# Therefore, we borrow the BartTokenizer from the original Bart model
449455
"Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501
450456
tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501
451457
trust_remote_code=True), # noqa: E501
452458
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
453-
"VoxtralForConditionalGeneration": _HfExamplesInfo(
454-
"mistralai/Voxtral-Mini-3B-2507",
455-
tokenizer_mode="mistral",
456-
min_transformers_version="4.54"
457-
),
458459
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
459-
460460
# [Cross-encoder]
461461
"JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), # noqa: E501
462462
}

vllm/model_executor/models/tarsier.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
from transformers import PretrainedConfig, SiglipVisionConfig
1414
from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
1515
from transformers.models.llava import LlavaProcessor
16-
from transformers.processing_utils import (ProcessingKwargs, Unpack,
17-
_validate_images_text_input_order)
16+
from transformers.processing_utils import ProcessingKwargs, Unpack
1817
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
1918

2019
from vllm.config import VllmConfig
@@ -94,9 +93,6 @@ def __call__(
9493
raise ValueError(
9594
"You have to specify at least one of `images` or `text`.")
9695

97-
# check if images and text inputs are reversed for BC
98-
images, text = _validate_images_text_input_order(images, text)
99-
10096
output_kwargs = self._merge_kwargs(
10197
TarsierProcessorKwargs,
10298
tokenizer_init_kwargs=self.tokenizer.init_kwargs,

vllm/transformers_utils/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
MiniMaxText01Config,
3838
MiniMaxVL01Config, MllamaConfig,
3939
MLPSpeculatorConfig, MPTConfig,
40+
Nemotron_Nano_VL_Config,
4041
NemotronConfig, NVLM_D_Config,
4142
OvisConfig, RWConfig,
4243
SkyworkR1VChatConfig, SolarConfig,
@@ -80,6 +81,7 @@ def _get_hf_token() -> Optional[str]:
8081
"dbrx": DbrxConfig,
8182
"deepseek_vl_v2": DeepseekVLV2Config,
8283
"kimi_vl": KimiVLConfig,
84+
"Llama_Nemotron_Nano_VL": Nemotron_Nano_VL_Config,
8385
"mpt": MPTConfig,
8486
"RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct)
8587
"RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct)

vllm/transformers_utils/configs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from vllm.transformers_utils.configs.mpt import MPTConfig
2424
from vllm.transformers_utils.configs.nemotron import NemotronConfig
2525
from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
26+
from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
2627
from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
2728
from vllm.transformers_utils.configs.ovis import OvisConfig
2829
from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig
@@ -50,6 +51,7 @@
5051
"KimiVLConfig",
5152
"NemotronConfig",
5253
"NemotronHConfig",
54+
"Nemotron_Nano_VL_Config",
5355
"NVLM_D_Config",
5456
"OvisConfig",
5557
"SkyworkR1VChatConfig",
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# yapf: disable
# ruff: noqa: E501
# Adapted from
# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
# --------------------------------------------------------
# Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License
# LICENSE is in incl_licenses directory.
# --------------------------------------------------------

from transformers import LlamaConfig
from transformers.configuration_utils import PretrainedConfig
from transformers.dynamic_module_utils import get_class_from_dynamic_module


class Nemotron_Nano_VL_Config(PretrainedConfig):
    """Composite HF-style config for the Llama-Nemotron-Nano-VL model.

    Bundles a dynamically resolved vision-tower config and a
    ``LlamaConfig`` text config (exposed as ``self.vision_config`` and
    ``self.text_config``) together with the projector / preprocessing
    hyperparameters the vLLM model implementation reads.

    Args:
        vision_config: Dict for the vision tower. Must contain an
            ``auto_map['AutoConfig']`` entry (remote-code config) so the
            concrete config class can be resolved dynamically; ``None``
            yields a bare ``PretrainedConfig`` placeholder.
        llm_config: Dict of ``LlamaConfig`` kwargs for the language model;
            ``None`` yields a default ``LlamaConfig``.
        force_image_size: Fixed input image size, if any.
        downsample_ratio: Spatial downsampling applied to vision features.
        template: Chat/prompt template name.  # TODO move into the tokenizer
        ps_version: Pixel-shuffle version identifier.
        image_tag_type: Image tag convention (e.g. "internvl").
        projector_hidden_size: Hidden size of the multimodal projector.
        vit_hidden_size: Hidden size of the vision transformer.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``vision_config`` lacks the required
            ``auto_map['AutoConfig']`` entry.
    """
    model_type = 'Llama_Nemotron_Nano_VL'
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        ps_version='v1',
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
        **kwargs
    ):
        super().__init__(**kwargs)

        if vision_config is not None:
            # The vision tower ships as remote code; resolve its config
            # class from the checkpoint's auto_map instead of assuming a
            # locally importable class.  The original used `assert`, which
            # is stripped under `python -O`; raise explicitly so malformed
            # configs always fail loudly.
            if ("auto_map" not in vision_config
                    or "AutoConfig" not in vision_config["auto_map"]):
                raise ValueError(
                    "vision_config must contain an 'auto_map' with an "
                    "'AutoConfig' entry to resolve the remote config class")
            # auto_map values look like "repo--module.ClassName"; reverse the
            # split to pass (class_reference, repo) in the order expected by
            # get_class_from_dynamic_module.
            vision_auto_config = get_class_from_dynamic_module(
                *vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
            self.vision_config = vision_auto_config(**vision_config)
        else:
            self.vision_config = PretrainedConfig()

        if llm_config is None:
            self.text_config = LlamaConfig()
        else:
            self.text_config = LlamaConfig(**llm_config)

        # Assign configuration values
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template  # TODO move out of here and into the tokenizer
        self.ps_version = ps_version  # Pixel shuffle version
        self.image_tag_type = image_tag_type  # TODO: into the tokenizer too?
        self.projector_hidden_size = projector_hidden_size
        self.vit_hidden_size = vit_hidden_size

0 commit comments

Comments
 (0)