
Commit 77f7f61

litone01 authored and hongxiayang committed
[DOC] Add additional comments for LLMEngine and AsyncLLMEngine (vllm-project#1011)
1 parent 4078436 commit 77f7f61

9 files changed, +242 -15 lines changed

docs/source/conf.py

Lines changed: 36 additions & 7 deletions
@@ -9,19 +9,22 @@
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
 
+import os
+import sys
+from sphinx.ext import autodoc
+import logging
+
+sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
+
+logger = logging.getLogger(__name__)
 
 # -- Project information -----------------------------------------------------
 
 project = 'vLLM'
 copyright = '2023, vLLM Team'
 author = 'the vLLM Team'
 
-
 # -- General configuration ---------------------------------------------------
 
 # Add any Sphinx extension module names here, as strings. They can be
@@ -32,6 +35,8 @@
     "sphinx.ext.viewcode",
     "sphinx.ext.intersphinx",
     "sphinx_copybutton",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -55,7 +60,6 @@
 html_theme = 'sphinx_book_theme'
 html_logo = 'assets/logos/vllm-logo-text-light.png'
 html_theme_options = {
-    'logo_only': True,
     'path_to_docs': 'docs/source',
     'repository_url': 'https://github.com/vllm-project/vllm',
     'use_repository_button': True,
@@ -64,4 +68,29 @@
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+# html_static_path = ['_static']
+
+# Mock out external dependencies here.
+autodoc_mock_imports = [
+    "torch", "transformers", "psutil", "aioprometheus", "sentencepiece",
+    "vllm.cuda_utils", "vllm._C"
+]
+
+for mock_target in autodoc_mock_imports:
+    if mock_target in sys.modules:
+        logger.info(
+            f"Potentially problematic mock target ({mock_target}) found; "
+            "autodoc_mock_imports cannot mock modules that have already "
+            "been loaded into sys.modules when the sphinx build starts.")
+
+
+class MockedClassDocumenter(autodoc.ClassDocumenter):
+    """Remove note about base class when a class is derived from object."""
+
+    def add_line(self, line: str, source: str, *lineno: int) -> None:
+        if line == "   Bases: :py:class:`object`":
+            return
+        super().add_line(line, source, *lineno)
+
+
+autodoc.ClassDocumenter = MockedClassDocumenter
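
Note: autodoc_mock_imports makes Sphinx's autodoc replace the listed modules with mock objects, so the API pages can be built without a GPU or the compiled vllm._C extension; the logging loop above only warns about targets that are already imported when the build starts. Below is a small standalone pre-flight sketch of the same check (not part of the commit): the script name and the idea of running it before sphinx-build are assumptions, while the module list is copied from the diff.

    # check_mock_targets.py -- hypothetical helper; mirrors the conf.py loop above.
    import importlib.util
    import sys

    autodoc_mock_imports = [
        "torch", "transformers", "psutil", "aioprometheus", "sentencepiece",
        "vllm.cuda_utils", "vllm._C"
    ]

    for mock_target in autodoc_mock_imports:
        if mock_target in sys.modules:
            print(f"already imported; autodoc cannot mock it: {mock_target}")
        elif importlib.util.find_spec(mock_target.split(".")[0]) is None:
            print(f"not installed; autodoc will mock it: {mock_target}")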
docs/source/dev/engine/async_llm_engine.rst

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+
+AsyncLLMEngine
+=================================
+
+.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
+    :members: generate, abort
+    :show-inheritance:
docs/source/dev/engine/engine_index.rst

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+vLLM Engine
+=================================
+
+.. automodule:: vllm.engine
+.. currentmodule:: vllm.engine
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Engines
+
+    llm_engine
+    async_llm_engine
+

docs/source/dev/engine/llm_engine.rst

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+LLMEngine
+=================================
+
+.. autoclass:: vllm.engine.llm_engine.LLMEngine
+    :members: add_request, abort_request, step, _init_cache
+    :show-inheritance:

docs/source/index.rst

Lines changed: 13 additions & 1 deletion
@@ -85,4 +85,16 @@ Documentation
    :maxdepth: 1
    :caption: Quantization
 
-   quantization/auto_awq
+   quantization/auto_awq
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Developer Documentation
+
+   dev/engine/engine_index
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
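
Note: the three new .rst files and this toctree entry wire the engine API reference into the docs. A hedged sketch of rendering them locally from Python (equivalent to running sphinx-build -b html docs/source docs/build/html from the repository root; the output directory is an arbitrary choice and Sphinx plus the theme/extension packages are assumed to be installed):

    # build_docs.py -- illustrative only; paths assume the vllm repository root.
    import sys

    from sphinx.cmd.build import build_main

    # Same as: sphinx-build -b html docs/source docs/build/html
    sys.exit(build_main(["-b", "html", "docs/source", "docs/build/html"]))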

vllm/core/scheduler.py

Lines changed: 12 additions & 0 deletions
@@ -88,6 +88,18 @@ def add_seq_group(self, seq_group: SequenceGroup) -> None:
         self.waiting.append(seq_group)
 
     def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
+        """Aborts a sequence group with the given ID.
+
+        Check if the sequence group with the given ID
+        is present in any of the state queues.
+        If present, remove the sequence group from the state queue.
+        Also, if any of the sequences in the sequence group is not finished,
+        free the sequence with status `FINISHED_ABORTED`.
+        Otherwise, do nothing.
+
+        Args:
+            request_id: The ID(s) of the sequence group to abort.
+        """
         if isinstance(request_id, str):
             request_id = (request_id, )
         request_ids = set(request_id)
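
Note: to make the described flow concrete, here is a deliberately simplified, hypothetical sketch of aborting a request across a set of state queues; the toy types below are stand-ins and not vLLM's Scheduler or SequenceGroup classes.

    # Toy illustration of the abort flow described in the docstring above.
    from typing import Iterable, List, Union


    class ToySeqGroup:
        def __init__(self, request_id: str, finished: bool) -> None:
            self.request_id = request_id
            self.finished = finished


    def abort_seq_group(state_queues: List[List[ToySeqGroup]],
                        request_id: Union[str, Iterable[str]]) -> None:
        if isinstance(request_id, str):
            request_id = (request_id, )
        request_ids = set(request_id)
        for queue in state_queues:  # e.g. waiting / running / swapped
            for seq_group in list(queue):
                if seq_group.request_id in request_ids:
                    queue.remove(seq_group)
                    if not seq_group.finished:
                        # vLLM frees such sequences with FINISHED_ABORTED.
                        seq_group.finished = True
                    request_ids.discard(seq_group.request_id)


    waiting = [ToySeqGroup("req-0", finished=False)]
    abort_seq_group([waiting], "req-0")
    assert waiting == []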

vllm/engine/async_llm_engine.py

Lines changed: 45 additions & 1 deletion
@@ -253,7 +253,8 @@ class AsyncLLMEngine:
         log_requests: Whether to log the requests.
         start_engine_loop: If True, the background task to run the engine
             will be automatically started in the generate call.
-        *args, *kwargs: Arguments for LLMEngine.
+        *args: Arguments for LLMEngine.
+        *kwargs: Arguments for LLMEngine.
     """
 
     _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
@@ -428,6 +429,49 @@ async def generate(
         Yields:
             The output `RequestOutput` objects from the LLMEngine for the
             request.
+
+        Details:
+            - If the engine is not running, start the background loop,
+              which iteratively invokes
+              :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
+              to process the waiting requests.
+            - Add the request to the engine's `RequestTracker`.
+              On the next background loop, this request will be sent to
+              the underlying engine.
+              Also, a corresponding `AsyncStream` will be created.
+            - Wait for the request outputs from `AsyncStream` and yield them.
+
+        Example:
+            >>> # Please refer to entrypoints/api_server.py for
+            >>> # the complete example.
+            >>>
+            >>> # initialize the engine and the example input
+            >>> engine = AsyncLLMEngine.from_engine_args(engine_args)
+            >>> example_input = {
+            >>>     "prompt": "What is LLM?",
+            >>>     "stream": False, # assume the non-streaming case
+            >>>     "temperature": 0.0,
+            >>>     "request_id": 0,
+            >>> }
+            >>>
+            >>> # start the generation
+            >>> results_generator = engine.generate(
+            >>>    example_input["prompt"],
+            >>>    SamplingParams(temperature=example_input["temperature"]),
+            >>>    example_input["request_id"])
+            >>>
+            >>> # get the results
+            >>> final_output = None
+            >>> async for request_output in results_generator:
+            >>>     if await request.is_disconnected():
+            >>>         # Abort the request if the client disconnects.
+            >>>         await engine.abort(request_id)
+            >>>         # Return or raise an error
+            >>>         ...
+            >>>     final_output = request_output
+            >>>
+            >>> # Process and return the final output
+            >>> ...
         """
         # Preprocess the request.
         # This should not be used for logging, as it is monotonic time.
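
Note: the docstring example above is a fragment; `request` appears to come from the FastAPI request object in entrypoints/api_server.py, and the engine and input setup are elided. Below is a hedged, self-contained sketch of the same non-streaming flow; the model name is only an example and a machine able to load it is assumed.

    import asyncio

    from vllm import SamplingParams
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine


    async def main() -> None:
        # Example model; any model the machine can serve would do.
        engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model="facebook/opt-125m"))

        sampling_params = SamplingParams(temperature=0.0, max_tokens=32)
        final_output = None

        # generate() starts the background engine loop on first use and
        # yields RequestOutput objects as decoding progresses.
        async for request_output in engine.generate("What is LLM?",
                                                     sampling_params,
                                                     request_id="0"):
            final_output = request_output

        print(final_output.outputs[0].text)


    if __name__ == "__main__":
        asyncio.run(main())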

vllm/engine/llm_engine.py

Lines changed: 102 additions & 6 deletions
@@ -257,7 +257,26 @@ def _verify_args(self) -> None:
         self.cache_config.verify_with_parallel_config(self.parallel_config)
 
     def _init_cache(self) -> None:
-        """Profiles the memory usage and initializes the KV cache."""
+        """Profiles the memory usage and initializes the KV cache.
+
+        The engine will first conduct a profiling of the existing memory usage.
+        Then, it calculates the maximum possible number of GPU and CPU blocks
+        that can be allocated with the remaining free memory.
+        More details can be found in the
+        :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method
+        from class :class:`~vllm.worker.Worker`.
+
+        Afterwards, as there may be multiple workers,
+        we take the minimum number of blocks across all workers
+        to ensure this can be applied to all of them.
+
+        Finally, the engine will initialize the KV cache
+        with the calculated number of blocks.
+
+        .. tip::
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
+        """
         # Get the maximum number of blocks that can be allocated on GPU and CPU.
         num_blocks = self._run_workers(
             "profile_num_available_blocks",
@@ -334,6 +353,30 @@ def add_request(
                 use the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
                 the current monotonic time.
+
+        Details:
+            - Set arrival_time to the current time if it is None.
+            - Set prompt_token_ids to the encoded prompt if it is None.
+            - Create `best_of` number of :class:`~vllm.Sequence` objects.
+            - Create a :class:`~vllm.SequenceGroup` object
+              from the list of :class:`~vllm.Sequence`.
+            - Add the :class:`~vllm.SequenceGroup` object to the scheduler.
+
+        Example:
+            >>> # initialize engine
+            >>> engine = LLMEngine.from_engine_args(engine_args)
+            >>> # set request arguments
+            >>> example_prompt = "Who is the president of the United States?"
+            >>> sampling_params = SamplingParams(temperature=0.0)
+            >>> request_id = 0
+            >>>
+            >>> # add the request to the engine
+            >>> engine.add_request(
+            >>>    str(request_id),
+            >>>    example_prompt,
+            >>>    SamplingParams(temperature=0.0))
+            >>> # continue the request processing
+            >>> ...
         """
         if arrival_time is None:
             arrival_time = time.monotonic()
@@ -358,6 +401,17 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
 
         Args:
             request_id: The ID(s) of the request to abort.
+
+        Details:
+            - Refer to the
+              :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
+              from class :class:`~vllm.core.scheduler.Scheduler`.
+
+        Example:
+            >>> # initialize engine and add a request with request_id
+            >>> request_id = str(0)
+            >>> # abort the request
+            >>> engine.abort_request(request_id)
         """
         self.scheduler.abort_seq_group(request_id)

@@ -617,11 +671,53 @@ def _process_model_outputs(
     def step(self) -> List[RequestOutput]:
         """Performs one decoding iteration and returns newly generated results.
 
-        This function performs one decoding iteration of the engine. It first
-        schedules the sequences to be executed in the next iteration and the
-        token blocks to be swapped in/out/copy. Then, it executes the model
-        and updates the scheduler with the model outputs. Finally, it decodes
-        the sequences and returns the newly generated results.
+        .. figure:: https://i.imgur.com/sv2HssD.png
+            :alt: Overview of the step function
+            :align: center
+
+            Overview of the step function.
+
+        Details:
+            - Step 1: Schedules the sequences to be executed in the next
+              iteration and the token blocks to be swapped in/out/copy.
+
+                - Depending on the scheduling policy,
+                  sequences may be `preempted/reordered`.
+                - A Sequence Group (SG) refers to a group of sequences
+                  that are generated from the same prompt.
+
+            - Step 2: Calls the workers to execute the model.
+            - Step 3: Processes the model output. This mainly includes:
+
+                - Decodes the relevant outputs.
+                - Updates the scheduled sequence groups with model outputs
+                  based on its `sampling parameters` (`use_beam_search` or not).
+                - Frees the finished sequence groups.
+
+            - Finally, it creates and returns the newly generated results.
+
+        Example:
+            >>> # Please see the example/ folder for more detailed examples.
+            >>>
+            >>> # initialize engine and request arguments
+            >>> engine = LLMEngine.from_engine_args(engine_args)
+            >>> example_inputs = [(0, "What is LLM?",
+            >>>    SamplingParams(temperature=0.0))]
+            >>>
+            >>> # Start the engine with an event loop
+            >>> while True:
+            >>>     if example_inputs:
+            >>>         req_id, prompt, sampling_params = example_inputs.pop(0)
+            >>>         engine.add_request(str(req_id), prompt, sampling_params)
+            >>>
+            >>>     # continue the request processing
+            >>>     request_outputs = engine.step()
+            >>>     for request_output in request_outputs:
+            >>>         if request_output.finished:
+            >>>             # return or show the request output
+            >>>
+            >>>     if not (engine.has_unfinished_requests() or example_inputs):
+            >>>         break
         """
         seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
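
Note: the add_request() and step() docstring examples above elide engine construction; the sketch below stitches them into one runnable script using the synchronous EngineArgs/LLMEngine API. The model name is only an example, and printing the finished output is an arbitrary way to consume it.

    from vllm import SamplingParams
    from vllm.engine.arg_utils import EngineArgs
    from vllm.engine.llm_engine import LLMEngine

    # Example model; any model the machine can load would do.
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))

    example_inputs = [(0, "What is LLM?", SamplingParams(temperature=0.0))]

    while True:
        if example_inputs:
            req_id, prompt, sampling_params = example_inputs.pop(0)
            engine.add_request(str(req_id), prompt, sampling_params)

        # One scheduling + model-execution iteration.
        request_outputs = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                print(request_output.outputs[0].text)

        if not (engine.has_unfinished_requests() or example_inputs):
            break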

vllm/worker/worker.py

Lines changed: 8 additions & 0 deletions
@@ -87,6 +87,14 @@ def profile_num_available_blocks(
         gpu_memory_utilization: float,
         cpu_swap_space: int,
     ) -> Tuple[int, int]:
+        """Profiles the peak memory usage of the model and returns the maximum
+        number of GPU and CPU cache blocks that can be allocated.
+
+        Args:
+            block_size: The size of the cache block.
+            gpu_memory_utilization: The fraction of the total GPU memory to use.
+            cpu_swap_space: The size of the CPU swap space in bytes.
+        """
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.
        torch.cuda.empty_cache()
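
Note: the returned counts follow from the profiled peak memory; roughly, the GPU block count is whatever fits in (total GPU memory x gpu_memory_utilization) minus the profiled peak usage, and the CPU block count is cpu_swap_space divided by the per-block byte size. A simplified sketch of that arithmetic (illustrative only; the real worker measures peak usage with a profiling forward pass and derives the per-block byte size from the model and cache config):

    def estimate_num_blocks(total_gpu_memory: int, peak_memory: int,
                            cache_block_bytes: int,
                            gpu_memory_utilization: float,
                            cpu_swap_space: int) -> tuple:
        # Blocks that fit in the GPU memory left after the profiling run.
        num_gpu_blocks = int(
            (total_gpu_memory * gpu_memory_utilization - peak_memory)
            // cache_block_bytes)
        # Blocks that fit in the configured CPU swap space.
        num_cpu_blocks = int(cpu_swap_space // cache_block_bytes)
        return max(num_gpu_blocks, 0), max(num_cpu_blocks, 0)


    # e.g. 80 GiB GPU, 30 GiB peak usage, 2 MiB blocks, 90% utilization, 4 GiB swap:
    print(estimate_num_blocks(80 << 30, 30 << 30, 2 << 20, 0.90, 4 << 30))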
