        batch_size=64,
    )

.. _vision_language_model:

Batch inference with a vision language model (VLM)
--------------------------------------------------------

Ray Data LLM also supports running batch inference with vision language
models. This example shows how to prepare a dataset that contains images and
run batch inference with a vision language model.

This example makes two adjustments on top of the previous example:

- Set `has_image=True` in `vLLMEngineProcessorConfig`.
- Prepare the image input inside the preprocessor.

.. testcode::

    from io import BytesIO

    from PIL import Image

    # `datasets`, `ray`, `vLLMEngineProcessorConfig`, `build_llm_processor`, and
    # `HF_TOKEN` are reused from the previous examples on this page.

    # Load the "LMMs-Eval-Lite" dataset from Hugging Face.
    vision_dataset_llms_lite = datasets.load_dataset("lmms-lab/LMMs-Eval-Lite", "coco2017_cap_val")
    vision_dataset = ray.data.from_huggingface(vision_dataset_llms_lite["lite"])

    vision_processor_config = vLLMEngineProcessorConfig(
        model_source="Qwen/Qwen2.5-VL-3B-Instruct",
        engine_kwargs=dict(
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
            max_model_len=4096,
            enable_chunked_prefill=True,
            max_num_batched_tokens=2048,
        ),
        # Override Ray's runtime env to include the Hugging Face token.
        # Ray Data uses Ray under the hood to orchestrate the inference pipeline.
        runtime_env=dict(
            env_vars=dict(
                HF_TOKEN=HF_TOKEN,
                # Use the vLLM V1 engine.
                VLLM_USE_V1="1",
            ),
        ),
        batch_size=16,
        accelerator_type="L4",
        concurrency=1,
        # Tell the processor that rows contain image inputs.
        has_image=True,
    )

    def vision_preprocess(row: dict) -> dict:
        choice_indices = ["A", "B", "C", "D", "E", "F", "G", "H"]
        return dict(
            messages=[
                {
                    "role": "system",
                    "content": """Analyze the image and question carefully, using step-by-step reasoning.
    First, describe any image provided in detail. Then, present your reasoning. And finally your final answer in this format:
    Final Answer: <answer>
    where <answer> is:
    - The single correct letter choice A, B, C, D, E, F, etc. when options are provided. Only include the letter.
    - Your direct answer if no options are given, as a single phrase or number.
    - If your answer is a number, only include the number without any unit.
    - If your answer is a word or phrase, do not paraphrase or reformat the text you see in the image.
    - You cannot answer that the question is unanswerable. You must either pick an option or provide a direct answer.
    IMPORTANT: Remember to end your answer with Final Answer: <answer>.""",
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": row["question"] + "\n\n",
                        },
                        {
                            "type": "image",
                            # Ray Data accepts a PIL Image or an image URL here.
                            "image": Image.open(BytesIO(row["image"]["bytes"])),
                        },
                        {
                            "type": "text",
                            "text": "\n\nChoices:\n" + "\n".join(
                                [f"{choice_indices[i]}. {choice}" for i, choice in enumerate(row["answer"])]
                            ),
                        },
                    ],
                },
            ],
            sampling_params=dict(
                temperature=0.3,
                max_tokens=150,
                detokenize=False,
            ),
        )

    def vision_postprocess(row: dict) -> dict:
        return {
            "resp": row["generated_text"],
        }

    vision_processor = build_llm_processor(
        vision_processor_config,
        preprocess=vision_preprocess,
        postprocess=vision_postprocess,
    )

    vision_processed_ds = vision_processor(vision_dataset).materialize()
    vision_processed_ds.show(3)

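As noted in the preprocessor above, the `image` field of a message also accepts
an image URL instead of an in-memory PIL image. The following sketch shows only
that alternative message entry; the URL is a hypothetical placeholder, not part
of the dataset used above.

.. code-block:: python

    {
        "type": "image",
        # Hypothetical placeholder URL for illustration only; replace it with a
        # real, reachable image URL.
        "image": "https://example.com/sample.jpg",
    },
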
.. _openai_compatible_api_endpoint:

Batch inference with an OpenAI-compatible endpoint