        batch_size=64,
    )

.. _vision_language_model:

Batch inference with a vision language model (VLM)
--------------------------------------------------------

Ray Data LLM also supports running batch inference with vision language
models. This example shows how to prepare a dataset that contains images and
run batch inference with a vision language model.

This example makes two adjustments on top of the previous example:

- Set `has_image=True` in `vLLMEngineProcessorConfig`.
- Prepare the image input inside the preprocessor.

.. testcode::

    from io import BytesIO

    from PIL import Image

    # `datasets`, `ray`, `vLLMEngineProcessorConfig`, `build_llm_processor`, and
    # `HF_TOKEN` are reused from the previous examples on this page.

    # Load the "LMMs-Eval-Lite" dataset from Hugging Face.
    vision_dataset_llms_lite = datasets.load_dataset("lmms-lab/LMMs-Eval-Lite", "coco2017_cap_val")
    vision_dataset = ray.data.from_huggingface(vision_dataset_llms_lite["lite"])

    vision_processor_config = vLLMEngineProcessorConfig(
        model_source="Qwen/Qwen2.5-VL-3B-Instruct",
        engine_kwargs=dict(
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
            max_model_len=4096,
            enable_chunked_prefill=True,
            max_num_batched_tokens=2048,
        ),
        # Override Ray's runtime env to include the Hugging Face token.
        # Ray Data uses Ray under the hood to orchestrate the inference pipeline.
        runtime_env=dict(
            env_vars=dict(
                HF_TOKEN=HF_TOKEN,
                # Use the vLLM V1 engine.
                VLLM_USE_V1="1",
            ),
        ),
        batch_size=16,
        accelerator_type="L4",
        concurrency=1,
        # Tell the processor that rows contain image inputs.
        has_image=True,
    )

    def vision_preprocess(row: dict) -> dict:
        choice_indices = ["A", "B", "C", "D", "E", "F", "G", "H"]
        return dict(
            messages=[
                {
                    "role": "system",
                    "content": """Analyze the image and question carefully, using step-by-step reasoning.
    First, describe any image provided in detail. Then, present your reasoning. And finally your final answer in this format:
    Final Answer: <answer>
    where <answer> is:
    - The single correct letter choice A, B, C, D, E, F, etc. when options are provided. Only include the letter.
    - Your direct answer if no options are given, as a single phrase or number.
    - If your answer is a number, only include the number without any unit.
    - If your answer is a word or phrase, do not paraphrase or reformat the text you see in the image.
    - You cannot answer that the question is unanswerable. You must either pick an option or provide a direct answer.
    IMPORTANT: Remember to end your answer with Final Answer: <answer>.""",
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": row["question"] + "\n\n",
                        },
                        {
                            "type": "image",
                            # Ray Data accepts a PIL Image or an image URL here.
                            "image": Image.open(BytesIO(row["image"]["bytes"])),
                        },
                        {
                            "type": "text",
                            "text": "\n\nChoices:\n" + "\n".join(
                                [f"{choice_indices[i]}. {choice}" for i, choice in enumerate(row["answer"])]
                            ),
                        },
                    ],
                },
            ],
            sampling_params=dict(
                temperature=0.3,
                max_tokens=150,
                detokenize=False,
            ),
        )

    def vision_postprocess(row: dict) -> dict:
        return {
            "resp": row["generated_text"],
        }

    vision_processor = build_llm_processor(
        vision_processor_config,
        preprocess=vision_preprocess,
        postprocess=vision_postprocess,
    )

    vision_processed_ds = vision_processor(vision_dataset).materialize()
    vision_processed_ds.show(3)

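As noted in the preprocessor above, the `image` field of a message also accepts
an image URL instead of an in-memory PIL image. The following sketch shows only
that alternative message entry; the URL is a hypothetical placeholder, not part
of the dataset used above.

.. code-block:: python

    {
        "type": "image",
        # Hypothetical placeholder URL for illustration only; replace it with a
        # real, reachable image URL.
        "image": "https://example.com/sample.jpg",
    },
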
.. _openai_compatible_api_endpoint:

Batch inference with an OpenAI-compatible endpoint