
Commit 112759d

[Doc][LLM] add VLM example to doc (#53018)
Signed-off-by: Linkun <[email protected]>
1 parent: 7fe1088 · commit: 112759d

File tree: 3 files changed (+111, -2 lines changed)


.vale/styles/config/vocabularies/Data/accept.txt

Lines changed: 1 addition & 0 deletions
@@ -29,4 +29,5 @@ runai
 Spotify('s)?
 TFRecord(s)?
 UDF(s)?
+VLM(s)?
 XGBoost
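
Vocabulary entries in this accept.txt are regular expressions (compare the existing `Spotify('s)?` and `TFRecord(s)?` entries), so the added `VLM(s)?` line covers both the singular and plural forms. A minimal sketch of that matching behavior, approximated with Python's re module rather than Vale's own matcher:

    import re

    # Illustrative only: treats the vocabulary entry "VLM(s)?" as a plain
    # regular expression; Vale itself is not invoked here.
    for word in ["VLM", "VLMs", "VLMS"]:
        accepted = re.fullmatch(r"VLM(s)?", word) is not None
        print(word, "matches" if accepted else "does not match")
    # VLM matches
    # VLMs matches
    # VLMS does not match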

ci/lint/check-documentation-style.sh

Lines changed: 10 additions & 2 deletions
@@ -3,7 +3,15 @@
 set -euxo pipefail
 
 VALE_BIN=$(mktemp -d)
-wget https://github.com/errata-ai/vale/releases/download/v3.4.1/vale_3.4.1_Linux_64-bit.tar.gz -P "$VALE_BIN"
-tar -xvzf "$VALE_BIN"/vale_3.4.1_Linux_64-bit.tar.gz -C "$VALE_BIN" vale
+if [[ "$OSTYPE" == "linux-gnu"* ]]; then
+    wget https://github.com/errata-ai/vale/releases/download/v3.4.1/vale_3.4.1_Linux_64-bit.tar.gz -P "$VALE_BIN"
+    tar -xvzf "$VALE_BIN"/vale_3.4.1_Linux_64-bit.tar.gz -C "$VALE_BIN" vale
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+    wget https://github.com/errata-ai/vale/releases/download/v3.4.1/vale_3.4.1_macOS_arm64.tar.gz -P "$VALE_BIN"
+    tar -xvzf "$VALE_BIN"/vale_3.4.1_macOS_arm64.tar.gz -C "$VALE_BIN" vale
+else
+    echo "Unsupported OS: $OSTYPE"
+    exit 1
+fi
 "$VALE_BIN"/vale doc/source/data doc/source/ray-overview/examples
 rm -rf "$VALE_BIN"

doc/source/data/working-with-llms.rst

Lines changed: 100 additions & 0 deletions
@@ -185,6 +185,106 @@ To do multi-LoRA batch inference, you need to set LoRA related parameters in `en
         batch_size=64,
     )
 
+.. _vision_language_model:
+
+Batch inference with vision-language-model (VLM)
+--------------------------------------------------------
+
+Ray Data LLM also supports running batch inference with vision language
+models. This example shows how to prepare a dataset with images and run
+batch inference with a vision language model.
+
+This example applies 2 adjustments on top of the previous example:
+
+- set `has_image=True` in `vLLMEngineProcessorConfig`
+- prepare image input inside preprocessor
+
+.. testcode::
+
+    # Load "LMMs-Eval-Lite" dataset from Hugging Face.
+    vision_dataset_llms_lite = datasets.load_dataset("lmms-lab/LMMs-Eval-Lite", "coco2017_cap_val")
+    vision_dataset = ray.data.from_huggingface(vision_dataset_llms_lite["lite"])
+
+    vision_processor_config = vLLMEngineProcessorConfig(
+        model_source="Qwen/Qwen2.5-VL-3B-Instruct",
+        engine_kwargs=dict(
+            tensor_parallel_size=1,
+            pipeline_parallel_size=1,
+            max_model_len=4096,
+            enable_chunked_prefill=True,
+            max_num_batched_tokens=2048,
+        ),
+        # Override Ray's runtime env to include the Hugging Face token. Ray Data uses Ray under the hood to orchestrate the inference pipeline.
+        runtime_env=dict(
+            env_vars=dict(
+                HF_TOKEN=HF_TOKEN,
+                VLLM_USE_V1="1",
+            ),
+        ),
+        batch_size=16,
+        accelerator_type="L4",
+        concurrency=1,
+        has_image=True,
+    )
+
+    def vision_preprocess(row: dict) -> dict:
+        choice_indices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
+        return dict(
+            messages=[
+                {
+                    "role": "system",
+                    "content": """Analyze the image and question carefully, using step-by-step reasoning.
+                    First, describe any image provided in detail. Then, present your reasoning. And finally your final answer in this format:
+                    Final Answer: <answer>
+                    where <answer> is:
+                    - The single correct letter choice A, B, C, D, E, F, etc. when options are provided. Only include the letter.
+                    - Your direct answer if no options are given, as a single phrase or number.
+                    - If your answer is a number, only include the number without any unit.
+                    - If your answer is a word or phrase, do not paraphrase or reformat the text you see in the image.
+                    - You cannot answer that the question is unanswerable. You must either pick an option or provide a direct answer.
+                    IMPORTANT: Remember, to end your answer with Final Answer: <answer>.""",
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": row["question"] + "\n\n"
+                        },
+                        {
+                            "type": "image",
+                            # Ray Data accepts PIL Image or image URL.
+                            "image": Image.open(BytesIO(row["image"]["bytes"]))
+                        },
+                        {
+                            "type": "text",
+                            "text": "\n\nChoices:\n" + "\n".join([f"{choice_indices[i]}. {choice}" for i, choice in enumerate(row["answer"])])
+                        }
+                    ]
+                },
+            ],
+            sampling_params=dict(
+                temperature=0.3,
+                max_tokens=150,
+                detokenize=False,
+            ),
+        )
+
+    def vision_postprocess(row: dict) -> dict:
+        return {
+            "resp": row["generated_text"],
+        }
+
+    vision_processor = build_llm_processor(
+        vision_processor_config,
+        preprocess=vision_preprocess,
+        postprocess=vision_postprocess,
+    )
+
+    vision_processed_ds = vision_processor(vision_dataset).materialize()
+    vision_processed_ds.show(3)
+
+
 .. _openai_compatible_api_endpoint:
 
 Batch inference with an OpenAI-compatible endpoint