1 change: 1 addition & 0 deletions examples/offline_inference_vision_language_multi_image.py

@@ -115,6 +115,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
         trust_remote_code=True,
         max_model_len=4096,
         limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"max_dynamic_patch": 4},
     )
 
     placeholders = "\n".join(f"Image-{i}: <image>\n"
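
Capping max_dynamic_patch at 4 keeps each image to at most 5 tiles (4 plus a thumbnail), so several images fit within max_model_len=4096. A minimal sketch of the same override from user code, assuming an InternVL2 checkpoint (the model name here is illustrative):

from vllm import LLM

# mm_processor_kwargs is forwarded by the engine to the model's
# registered multi-modal input functions, which receive its entries
# as keyword-only arguments (see internvl.py below).
llm = LLM(
    model="OpenGVLab/InternVL2-2B",
    trust_remote_code=True,
    max_model_len=4096,
    limit_mm_per_prompt={"image": 2},
    mm_processor_kwargs={"max_dynamic_patch": 4},
)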
33 changes: 22 additions & 11 deletions vllm/model_executor/models/internvl.py
@@ -175,13 +175,16 @@ def get_internvl_num_patches(image_size: int, patch_size: int,
         (downsample_ratio**2))
 
 
-def get_max_internvl_image_tokens(ctx: InputContext):
+def get_max_internvl_image_tokens(ctx: InputContext,
+                                  *,
+                                  max_dynamic_patch: Optional[int] = None):
     hf_config = ctx.get_hf_config()
     vision_config = hf_config.vision_config
 
+    if max_dynamic_patch is None:
+        max_dynamic_patch = hf_config.max_dynamic_patch
     use_thumbnail = hf_config.use_thumbnail
-    max_dynamic_patch = hf_config.max_dynamic_patch
-    if use_thumbnail:
+    if use_thumbnail and max_dynamic_patch > 1:
         max_dynamic_patch += 1
     downsample_ratio = hf_config.downsample_ratio
 
@@ -192,7 +195,10 @@ def get_max_internvl_image_tokens(ctx: InputContext):
     return num_patches * max_dynamic_patch
 
 
-def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):
+def input_processor_for_internvl(ctx: InputContext,
+                                 llm_inputs: LLMInputs,
+                                 *,
+                                 max_dynamic_patch: Optional[int] = None):
     multi_modal_data = llm_inputs.get("multi_modal_data")
     if multi_modal_data is None or "image" not in multi_modal_data:
         return llm_inputs
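
For a sense of the budget involved, here is a standalone sketch of the computation above. The config values are the usual InternVL2 defaults (image_size=448, patch_size=14, downsample_ratio=0.5, use_thumbnail=True) but should be treated as assumptions:

# Sketch of the max-token arithmetic; the defaults below are assumed
# InternVL2 config values, not read from an actual checkpoint.
def max_image_tokens(max_dynamic_patch: int,
                     image_size: int = 448,
                     patch_size: int = 14,
                     downsample_ratio: float = 0.5,
                     use_thumbnail: bool = True) -> int:
    # Tokens per 448x448 tile: (448 // 14)**2 * 0.5**2 = 256.
    num_patches = int((image_size // patch_size)**2 * downsample_ratio**2)
    if use_thumbnail and max_dynamic_patch > 1:
        max_dynamic_patch += 1  # one extra tile for the thumbnail
    return num_patches * max_dynamic_patch

print(max_image_tokens(12))  # 3328 tokens at the default cap
print(max_image_tokens(4))   # 1280 tokens with max_dynamic_patch=4

At the default cap of 12, two images cost 2 * 3328 = 6656 tokens and overflow a 4096-token context; with the override they cost 2 * 1280 = 2560.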
@@ -209,7 +215,7 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):
 
     image_data = multi_modal_data["image"]
     min_num = hf_config.min_dynamic_patch
-    max_num = hf_config.max_dynamic_patch
+    max_num = max_dynamic_patch or hf_config.max_dynamic_patch
     use_thumbnail = hf_config.use_thumbnail
     if isinstance(image_data, Image.Image):
         width, height = image_data.size
@@ -253,12 +259,15 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):
                      multi_modal_data=multi_modal_data)
 
 
-def input_mapper_for_internvl(ctx: InputContext, data: object):
+def input_mapper_for_internvl(ctx: InputContext,
+                              data: object,
+                              *,
+                              max_dynamic_patch: Optional[int] = None):
     hf_config = ctx.get_hf_config()
 
     use_thumbnail = hf_config.use_thumbnail
     min_num = hf_config.min_dynamic_patch
-    max_num = hf_config.max_dynamic_patch
+    max_num = max_dynamic_patch or hf_config.max_dynamic_patch
     image_size = hf_config.vision_config.image_size
 
     if isinstance(data, Image.Image):
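
max_num bounds how many tiles the dynamic preprocessing produces from a single image. A simplified sketch of InternVL-style tile counting (this follows the upstream dynamic_preprocess logic in spirit, not the exact vLLM code; ratio tie-breaking is omitted):

# Hypothetical helper; mirrors the aspect-ratio grid search only roughly.
def num_tiles(width: int, height: int, min_num: int, max_num: int,
              use_thumbnail: bool) -> int:
    aspect = width / height
    # Candidate (cols, rows) grids whose tile count lies in [min_num, max_num].
    grids = {(i, j)
             for n in range(min_num, max_num + 1)
             for i in range(1, n + 1)
             for j in range(1, n + 1)
             if min_num <= i * j <= max_num}
    cols, rows = min(grids, key=lambda g: abs(aspect - g[0] / g[1]))
    blocks = cols * rows
    if use_thumbnail and blocks > 1:
        blocks += 1  # plus one downscaled copy of the whole image
    return blocks

print(num_tiles(1300, 800, 1, 4, True))   # 3: a 2x1 grid plus the thumbnail
print(num_tiles(1300, 800, 1, 12, True))  # 7: a 3x2 grid plus the thumbnail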
@@ -292,8 +301,11 @@ def input_mapper_for_internvl(ctx: InputContext, data: object):
     })
 
 
-def dummy_data_for_internvl(ctx: InputContext, seq_len: int,
-                            mm_counts: Mapping[str, int]):
+def dummy_data_for_internvl(ctx: InputContext,
+                            seq_len: int,
+                            mm_counts: Mapping[str, int],
+                            *,
+                            max_dynamic_patch: Optional[int] = None):
     num_images = mm_counts["image"]
 
     image_feature_size = get_max_internvl_image_tokens(ctx)
@@ -315,7 +327,7 @@ def dummy_data_for_internvl(ctx: InputContext, seq_len: int,
 
     image_size = vision_config.image_size
     min_num = hf_config.min_dynamic_patch
-    max_num = hf_config.max_dynamic_patch
+    max_num = max_dynamic_patch or hf_config.max_dynamic_patch
     max_image_width = max_num * image_size
     max_image_height = min_num * image_size
 
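The profiling image is deliberately the widest shape the tiler accepts, so dummy runs reserve the worst-case number of image tokens. Under the assumed defaults (image_size=448, min_dynamic_patch=1) with the cap of 4:

# 4 tiles in a single row: 1792 x 448.
image_size, min_num, max_num = 448, 1, 4  # assumed config values
print(max_num * image_size, min_num * image_size)  # 1792 448
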
@@ -470,7 +482,6 @@ def _process_image_input(
         self,
         image_input: InternVLImageInputs,
     ) -> torch.Tensor:
-
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 