Commit 2f2fe97 (1 parent: 8e3dec3)

[cherry-pick][serve][docs] ray serve llm example (#56340)

Cherry-pick of #55819 and #56287.

52 files changed: +5,018 −6 lines

.vale/styles/config/vocabularies/General/accept.txt

Lines changed: 9 additions & 1 deletion

@@ -33,6 +33,7 @@ autoscales
 bool
 breakpoint
 BTS
+bursty
 chatbot
 CLI
 configs
@@ -45,8 +46,10 @@ deserialize
 deserializes
 dev
 dev to prod
-disable
+[d|D]isable[d]
+[d|D]isable
 DLinear
+Dockerfile
 DPO
 EKS
 ETDataset
@@ -69,13 +72,15 @@ LMs
 LSH
 MCP
 Megatron
+Mixtral
 MLflow
 MLOps
 namespace
 NER
 Nsight
 NumPy
 NVIDIA
+NVLink
 OOM
 open-source
 PACK
@@ -86,6 +91,8 @@ pretraining
 productionize
 Pythonic
 QPS
+Qwen
+Quantizing
 retrigger
 RISECamp
 RLHF
@@ -104,6 +111,7 @@ teardown
 uncaptured
 URI(s)?
 UUID
+USD
 uv
 verl
 VM(s)?
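
Swapping the plain "disable" entry for "[d|D]isable[d]" and "[d|D]isable" works because Vale treats each accept.txt line as a case-sensitive regular expression, as the existing "URI(s)?" and "VM(s)?" entries already show. A quick sanity check of what the two new patterns accept, sketched with Python's re module (illustrative only; note that inside a character class "|" is a literal pipe, so "[dD]" would be the tighter spelling):

import re

# Vale reads each accept.txt entry as a case-sensitive regex.
# Inside a character class, "|" matches a literal pipe, so [d|D]
# behaves like [dD] here (plus a never-occurring "|isable").
patterns = [r"[d|D]isable[d]", r"[d|D]isable"]
words = ["disable", "Disable", "disabled", "Disabled"]

for pattern in patterns:
    matched = [w for w in words if re.fullmatch(pattern, w)]
    print(f"{pattern!r} matches {matched}")
# '[d|D]isable[d]' matches ['disabled', 'Disabled']
# '[d|D]isable' matches ['disable', 'Disable']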

doc/BUILD.bazel

Lines changed: 6 additions & 0 deletions

@@ -606,3 +606,9 @@ filegroup(
     srcs = glob(["source/ray-overview/examples/**/*.yaml"]),
     visibility = ["//release:__pkg__"],
 )
+
+filegroup(
+    name = "deployment_serve_llm_example_configs",
+    srcs = glob(["source/serve/tutorials/deployment-serve-llm/**/*.yaml"]),
+    visibility = ["//release:__pkg__"],
+)

doc/source/conf.py

Lines changed: 2 additions & 0 deletions

@@ -228,6 +228,8 @@ def __init__(self, version: str):
     "data/api/ray.data.*.rst",
     "ray-overview/examples/**/README.md",  # Exclude .md files in examples subfolders
     "train/examples/**/README.md",
+    "serve/tutorials/deployment-serve-llm/README.*",
+    "serve/tutorials/deployment-serve-llm/*/notebook.ipynb",
 ] + autogen_files

 # If "DOC_LIB" is found, only build that top-level navigation item.

doc/source/serve/examples.yml

Lines changed: 48 additions & 0 deletions

@@ -74,6 +74,54 @@ examples:
       - natural language processing
     link: tutorials/serve-deepseek
     related_technology: llm applications
+  - title: Deploying a small-sized LLM
+    skill_level: beginner
+    use_cases:
+      - generative ai
+      - large language models
+      - natural language processing
+    link: tutorials/deployment-serve-llm/small-size-llm/README
+    related_technology: llm applications
+  - title: Deploying a medium-sized LLM
+    skill_level: beginner
+    use_cases:
+      - generative ai
+      - large language models
+      - natural language processing
+    link: tutorials/deployment-serve-llm/medium-size-llm/README
+    related_technology: llm applications
+  - title: Deploying a large-sized LLM
+    skill_level: beginner
+    use_cases:
+      - generative ai
+      - large language models
+      - natural language processing
+    link: tutorials/deployment-serve-llm/large-size-llm/README
+    related_technology: llm applications
+  - title: Deploying a vision LLM
+    skill_level: beginner
+    use_cases:
+      - generative ai
+      - large language models
+      - natural language processing
+    link: tutorials/deployment-serve-llm/vision-llm/README
+    related_technology: llm applications
+  - title: Deploying a reasoning LLM
+    skill_level: beginner
+    use_cases:
+      - generative ai
+      - large language models
+      - natural language processing
+    link: tutorials/deployment-serve-llm/reasoning-llm/README
+    related_technology: llm applications
+  - title: Deploying a hybrid reasoning LLM
+    skill_level: beginner
+    use_cases:
+      - generative ai
+      - large language models
+      - natural language processing
+    link: tutorials/deployment-serve-llm/hybrid-reasoning-llm/README
+    related_technology: llm applications
   - title: Serve a Chatbot with Request and Response Streaming
     skill_level: intermediate
     use_cases:
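
Every new gallery entry has the same shape: a title, a skill level, a list of use cases, a link to the tutorial README, and a related technology. As a quick illustration of that structure (not code from this commit; the helper name and filter criteria are invented for the example), PyYAML can load and query the gallery:

import yaml  # pip install pyyaml

def beginner_llm_examples(path="doc/source/serve/examples.yml"):
    """List (title, link) pairs for beginner-level LLM gallery entries.

    Assumes the structure shown in the diff above: a top-level
    "examples" list whose items carry title/skill_level/link fields.
    """
    with open(path) as f:
        examples = yaml.safe_load(f)["examples"]
    return [
        (entry["title"], entry["link"])
        for entry in examples
        if entry.get("skill_level") == "beginner"
        and entry.get("related_technology") == "llm applications"
    ]

for title, link in beginner_llm_examples():
    print(f"{title}: {link}")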

doc/source/serve/tutorials/BUILD.bazel

Lines changed: 0 additions & 5 deletions
This file was deleted.
Lines changed: 58 additions & 0 deletions (new file: notebook.ipynb)

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bc12c0d2",
   "metadata": {},
   "source": [
    "# Quickstarts for LLM serving\n",
    "\n",
    "These guides provide a fast path to serving LLMs using Ray Serve on Anyscale, with focused tutorials for different deployment scales, from single-GPU setups to multi-node clusters.\n",
    "\n",
    "Each tutorial includes development and production setups, tips for configuring your cluster, and guidance on monitoring and scaling with Ray Serve.\n",
    "\n",
    "## Tutorial categories\n",
    "\n",
    "**[Small-sized LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/small-size-llm/README.html)** \n",
    "Deploy small-sized models on a single GPU, such as Llama 3 8&nbsp;B, Mistral 7&nbsp;B, or Phi-2.\n",
    "\n",
    "---\n",
    "\n",
    "**[Medium-sized LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html)** \n",
    "Deploy medium-sized models using tensor parallelism across 4-8 GPUs on a single node, such as Llama 3 70&nbsp;B, Qwen 14&nbsp;B, or Mixtral 8x7&nbsp;B.\n",
    "\n",
    "---\n",
    "\n",
    "**[Large-sized LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/large-size-llm/README.html)** \n",
    "Deploy massive models using pipeline parallelism across a multi-node cluster, such as DeepSeek-R1 or Llama-Nemotron-253&nbsp;B.\n",
    "\n",
    "---\n",
    "\n",
    "**[Vision LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/vision-llm/README.html)** \n",
    "Deploy models with image and text input, such as Qwen 2.5-VL-7&nbsp;B-Instruct, MiniGPT-4, or Pixtral-12&nbsp;B.\n",
    "\n",
    "---\n",
    "\n",
    "**[Reasoning LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/reasoning-llm/README.html)** \n",
    "Deploy models with reasoning capabilities designed for long-context tasks, coding, or tool use, such as QwQ-32&nbsp;B.\n",
    "\n",
    "---\n",
    "\n",
    "**[Hybrid thinking LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/README.html)** \n",
    "Deploy models that can switch between reasoning and non-reasoning modes for flexible usage, such as Qwen-3."
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "myst": {
   "front_matter": {
    "orphan": true
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
Lines changed: 41 additions & 0 deletions (new file: README.md)

<!--
Do not modify this README. This file is a copy of the notebook and is not used to display the content.
Modify notebook.ipynb instead, then regenerate this file with:
jupyter nbconvert "$notebook.ipynb" --to markdown --output "README.md"
-->

# Quickstarts for LLM serving

These guides provide a fast path to serving LLMs using Ray Serve on Anyscale, with focused tutorials for different deployment scales, from single-GPU setups to multi-node clusters.

Each tutorial includes development and production setups, tips for configuring your cluster, and guidance on monitoring and scaling with Ray Serve.

## Tutorial categories

**[Small-sized LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/small-size-llm/README.html)**
Deploy small-sized models on a single GPU, such as Llama 3 8&nbsp;B, Mistral 7&nbsp;B, or Phi-2.

---

**[Medium-sized LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/medium-size-llm/README.html)**
Deploy medium-sized models using tensor parallelism across 4-8 GPUs on a single node, such as Llama 3 70&nbsp;B, Qwen 14&nbsp;B, or Mixtral 8x7&nbsp;B.

---

**[Large-sized LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/large-size-llm/README.html)**
Deploy massive models using pipeline parallelism across a multi-node cluster, such as DeepSeek-R1 or Llama-Nemotron-253&nbsp;B.

---

**[Vision LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/vision-llm/README.html)**
Deploy models with image and text input, such as Qwen 2.5-VL-7&nbsp;B-Instruct, MiniGPT-4, or Pixtral-12&nbsp;B.

---

**[Reasoning LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/reasoning-llm/README.html)**
Deploy models with reasoning capabilities designed for long-context tasks, coding, or tool use, such as QwQ-32&nbsp;B.

---

**[Hybrid thinking LLM deployment](https://docs.ray.io/en/latest/serve/tutorials/deployment-serve-llm/hybrid-reasoning-llm/README.html)**
Deploy models that can switch between reasoning and non-reasoning modes for flexible usage, such as Qwen-3.
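
For orientation, the app these tutorials build looks roughly like the following single-GPU deployment. This is a minimal sketch with the ray.serve.llm API, not code from this commit; the model ID, accelerator type, and autoscaling bounds are illustrative placeholders.

# serve_llm.py -- minimal sketch of a single-GPU LLM deployment.
# Placeholders: model names, accelerator type, autoscaling bounds.
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

llm_config = LLMConfig(
    model_loading_config={
        "model_id": "my-llama",  # name clients address the model by
        "model_source": "meta-llama/Llama-3.1-8B-Instruct",  # HF repo
    },
    accelerator_type="L4",  # a single small GPU
    deployment_config={
        "autoscaling_config": {"min_replicas": 1, "max_replicas": 2},
    },
)

# Expose an OpenAI-compatible app (/v1/chat/completions, /v1/models).
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app)

Once running, any OpenAI-compatible client pointed at the Serve HTTP endpoint (port 8000 by default) can query the model under the name "my-llama".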
Lines changed: 14 additions & 0 deletions (new file: AWS compute config)

cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

# Head node
head_node_type:
  name: head
  instance_type: m5.2xlarge
  resources:
    cpu: 8

# Worker nodes
auto_select_worker_config: true
flags:
  allow-cross-zone-autoscaling: true
Lines changed: 3 additions & 0 deletions (new file: setup script)

#!/bin/bash

set -exo pipefail
Lines changed: 14 additions & 0 deletions (new file: GCP compute config)

cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-central1

# Head node
head_node_type:
  name: head
  instance_type: n2-standard-8
  resources:
    cpu: 8

# Worker nodes
auto_select_worker_config: true
flags:
  allow-cross-zone-autoscaling: true
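
The {{env["ANYSCALE_CLOUD_ID"]}} value in both compute configs is a template placeholder that release tooling fills from the environment before submitting the config. A rough stand-in for that substitution step (illustrative only; the file name and the assumption that only env[...] placeholders occur are mine):

import os
import re

def render(text: str) -> str:
    """Fill {{env["VAR"]}} placeholders from environment variables."""
    return re.sub(
        r'\{\{env\["([A-Za-z_]+)"\]\}\}',
        lambda match: os.environ[match.group(1)],
        text,
    )

with open("compute_config.yaml") as f:  # hypothetical file name
    print(render(f.read()))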
