@@ -1,26 +1,25 @@
-from typing import List, Optional, Tuple, Type, overload
+from typing import List, Optional, Tuple, Type
 
 import pytest
 from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
                           BatchEncoding)
 
 from vllm.multimodal.utils import (rescale_video_size, resize_video,
                                    sample_frames_from_video)
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
-from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, VllmRunner,
-                          _VideoAssets)
-from ....utils import large_gpu_test
+from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput,
+                          PromptVideoInput, VllmRunner)
 from ...utils import check_logprobs_close
 
 # Video test
 HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
     "sample_demo_1":
     "<|im_start|>user\n<video>\nwhy is this video funny?<|im_end|>\n<|im_start|>assistant\n"  # noqa: E501
 })
 
-models = ["llava-hf/llava-onevision-qwen2-7b-ov-hf"]
+models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
@@ -47,50 +46,16 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
     return hf_output_ids, hf_output_str, out_logprobs
 
 
-@overload
-def run_video_test(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
-    video_assets: _VideoAssets,
-    model: str,
-    *,
-    size_factors: List[float],
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    num_frames: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    ...
-
-
-@overload
-def run_video_test(
-    hf_runner: Type[HfRunner],
-    vllm_runner: Type[VllmRunner],
-    video_assets: _VideoAssets,
-    model: str,
-    *,
-    sizes: List[Tuple[int, int]],
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    num_frames: int,
-    tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
-):
-    ...
+# Video test
+_LIMIT_VIDEO_PER_PROMPT = 4
 
 
 def run_video_test(
     hf_runner: Type[HfRunner],
     vllm_runner: Type[VllmRunner],
-    video_assets: _VideoAssets,
+    inputs: List[Tuple[List[str], PromptVideoInput]],
     model: str,
     *,
-    size_factors: Optional[List[float]] = None,
-    sizes: Optional[List[Tuple[int, int]]] = None,
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
@@ -99,38 +64,20 @@ def run_video_test(
     distributed_executor_backend: Optional[str] = None,
 ):
     torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-
-    videos = [
-        sample_frames_from_video(asset.np_ndarrays, num_frames)
-        for asset in video_assets
-    ]
-
-    if size_factors is not None:
-        inputs_per_video = [(
-            [prompt for _ in size_factors],
-            [rescale_video_size(video, factor) for factor in size_factors],
-        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
-    elif sizes is not None:
-        inputs_per_video = [(
-            [prompt for _ in sizes],
-            [resize_video(video, size) for size in sizes],
-        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
-    else:
-        raise ValueError("You must provide either `size_factors` or `sizes`")
-
-    # max_model_len should be greater than image_feature_size
     with vllm_runner(model,
                      dtype=dtype,
-                     max_model_len=4096,
+                     max_model_len=16384,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
-        vllm_outputs_per_video = [
+                     enforce_eager=True,
+                     limit_mm_per_prompt={"video": _LIMIT_VIDEO_PER_PROMPT
+                                          }) as vllm_model:
+        vllm_outputs_per_input = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
                                                 num_logprobs=num_logprobs,
                                                 videos=videos)
-            for prompts, videos in inputs_per_video
+            for prompts, videos in inputs
         ]
 
     def process(hf_inputs: BatchEncoding):
@@ -142,16 +89,16 @@ def process(hf_inputs: BatchEncoding):
                    dtype=dtype,
                    postprocess_inputs=process,
                    auto_cls=AutoModelForVision2Seq) as hf_model:
-        hf_outputs_per_video = [
+        hf_outputs_per_input = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
                                                     videos=videos)
-            for prompts, videos in inputs_per_video
+            for prompts, videos in inputs
         ]
 
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_video,
-                                        vllm_outputs_per_video):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_input,
+                                        vllm_outputs_per_input):
         # TODO: Check whether using original CLIPVisionModel can improve
         # consistency against HF
         check_logprobs_close(
@@ -165,74 +112,51 @@ def process(hf_inputs: BatchEncoding):
         )
 
 
-@large_gpu_test(min_gb=48)
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # No video
-        [],
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("num_frames", [16])
-def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
-                dtype, max_tokens, num_logprobs, num_frames) -> None:
-    """Inference result should be the same between hf and vllm.
-
-    All the image fixtures for the test is under tests/videos.
-    For huggingface runner, we provide the np.ndarray as input.
-    For vllm runner, we provide MultiModalDataDict objects
-    and corresponding MultiModalConfig as input.
-    Note, the text input is also adjusted to abide by vllm contract.
-    The text output is sanitized to be able to compare with hf.
-    """
+def test_models_multiple_video_inputs(hf_runner, vllm_runner, video_assets,
+                                      model, dtype, max_tokens, num_logprobs,
+                                      num_frames) -> None:
+    video = sample_frames_from_video(video_assets[0].np_ndarrays, num_frames)
+    inputs = [(
+        [
+            "<|im_start|>user <video><video>\nDescribe 2 videos. \
+                <|im_end|><|im_start|>assistant\n",
+            "<|im_start|>user <video><video>\nDescribe 2 videos. \
+                <|im_end|><|im_start|>assistant\n",
+            "<|im_start|>user <video><video><video><video>\nDescribe 4 videos. \
+                <|im_end|><|im_start|>assistant\n",
+            "<|im_start|>user <video>\nwhy is this video funny? \
+                <|im_end|><|im_start|>assistant\n",
+        ],
+        [
+            [video, video],
+            # Videos with different sizes and aspect-ratios
+            [
+                rescale_video_size(video, 0.1),
+                video,
+            ],
+            [
+                video,
+                rescale_video_size(video, 0.25),
+                resize_video(video, (183, 488)),
+                resize_video(video, (488, 183))
+            ],
+            video,
+        ])]
     run_video_test(
         hf_runner,
         vllm_runner,
-        video_assets,
+        inputs,
         model,
-        size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
-        num_frames=num_frames,
         tensor_parallel_size=1,
-    )
-
-
-@large_gpu_test(min_gb=48)
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "sizes",
-    [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
-)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("num_frames", [16])
-def test_models_fixed_sizes(hf_runner, vllm_runner, video_assets, model, sizes,
-                            dtype, max_tokens, num_logprobs,
-                            num_frames) -> None:
-    run_video_test(
-        hf_runner,
-        vllm_runner,
-        video_assets,
-        model,
-        sizes=sizes,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
         num_frames=num_frames,
-        tensor_parallel_size=1,
     )
 
 
@@ -303,7 +227,6 @@ def process(hf_inputs: BatchEncoding):
         )
 
 
-@large_gpu_test(min_gb=48)
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
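A note on the `limit_mm_per_prompt` setting this diff introduces: it raises the per-prompt cap on multimodal items (vLLM defaults to 1 per modality), which the multi-video prompts above would otherwise exceed. Below is a minimal sketch of the same knob on vLLM's public `LLM` entry point, reusing the model name and limits from this diff; it is illustrative only, not part of the patch:

```python
from vllm import LLM

# Allow up to 4 videos in a single prompt, mirroring
# _LIMIT_VIDEO_PER_PROMPT = 4 in the test above; prompts carrying
# more videos than the limit are rejected at input processing time.
llm = LLM(
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
    max_model_len=16384,  # multi-video feature tokens need the extra context
    limit_mm_per_prompt={"video": 4},
)
```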