
Commit 6041efb

joerunde authored and LeiWang1999 committed
[Frontend] Kill the server on engine death (vllm-project#6594)
Signed-off-by: Joe Runde <[email protected]>
Signed-off-by: LeiWang1999 <[email protected]>
1 parent 7771a7b commit 6041efb

File tree

8 files changed: +136 -14 lines changed
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+import json
+import os
+
+import openai
+import pytest
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+
+
+@pytest.mark.asyncio
+async def test_shutdown_on_engine_failure(tmp_path):
+    # Use a bad adapter to crash the engine
+    # (This test will fail when that bug is fixed)
+    adapter_path = tmp_path / "bad_adapter"
+    os.mkdir(adapter_path)
+    with open(adapter_path / "adapter_model_config.json", "w") as f:
+        json.dump({"not": "real"}, f)
+    with open(adapter_path / "adapter_model.safetensors", "wb") as f:
+        f.write(b"this is fake")
+
+    # dtype, max-len etc set so that this can run in CI
+    args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "128",
+        "--enable-lora",
+        "--lora-modules",
+        f"bad-adapter={tmp_path / 'bad_adapter'}",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        client = remote_server.get_async_client()
+
+        with pytest.raises(openai.APIConnectionError):
+            # This crashes the engine
+            await client.completions.create(model="bad-adapter",
+                                            prompt="Hello, my name is")
+
+        # Now the server should shut down
+        return_code = remote_server.proc.wait(timeout=1)
+        assert return_code is not None

vllm/engine/async_llm_engine.py

Lines changed: 4 additions & 2 deletions
@@ -58,7 +58,7 @@ def _log_task_completion(task: asyncio.Task,
         error_callback(exception)
         raise AsyncEngineDeadError(
             "Task finished unexpectedly. This should never happen! "
-            "Please open an issue on Github. See stack trace above for the"
+            "Please open an issue on Github. See stack trace above for the "
             "actual cause.") from e
 
 
@@ -132,7 +132,9 @@ def propagate_exception(self,
             self._request_streams[request_id].put(exc)
             self.abort_request(request_id)
         else:
-            for rid, stream in self._request_streams.items():
+            # NB: list() used here because self.abort_request pops the stream
+            # out of self._request_streams, so we can't iterate on it directly
+            for rid, stream in list(self._request_streams.items()):
                 stream.put(exc)
                 self.abort_request(rid)
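The list() wrapper above exists because abort_request removes entries from self._request_streams while the loop is still walking it. A standalone sketch of the failure mode and the fix (generic dict, not vLLM code):

streams = {"req-1": "stream-1", "req-2": "stream-2"}

# Popping entries inside a plain .items() loop raises
# "RuntimeError: dictionary changed size during iteration":
#     for rid, stream in streams.items():
#         streams.pop(rid)

# Copying the view first makes the removal safe, which is what wrapping
# self._request_streams.items() in list() achieves in the diff above.
for rid, stream in list(streams.items()):
    streams.pop(rid)  # stands in for abort_request() popping the stream

assert not streams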

vllm/entrypoints/api_server.py

Lines changed: 1 addition & 0 deletions
@@ -118,6 +118,7 @@ async def run_server(args: Namespace,
 
     shutdown_task = await serve_http(
         app,
+        engine=engine,
         host=args.host,
         port=args.port,
         log_level=args.log_level,

vllm/entrypoints/launcher.py

Lines changed: 42 additions & 2 deletions
@@ -1,16 +1,21 @@
 import asyncio
 import signal
+from http import HTTPStatus
 from typing import Any
 
 import uvicorn
-from fastapi import FastAPI
+from fastapi import FastAPI, Response
 
+from vllm import envs
+from vllm.engine.async_llm_engine import AsyncEngineDeadError
+from vllm.engine.protocol import AsyncEngineClient
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
 
 
-async def serve_http(app: FastAPI, **uvicorn_kwargs: Any):
+async def serve_http(app: FastAPI, engine: AsyncEngineClient,
+                     **uvicorn_kwargs: Any):
     logger.info("Available routes are:")
     for route in app.routes:
         methods = getattr(route, "methods", None)
@@ -23,6 +28,7 @@ async def serve_http(app: FastAPI, **uvicorn_kwargs: Any):
 
     config = uvicorn.Config(app, **uvicorn_kwargs)
     server = uvicorn.Server(config)
+    _add_shutdown_handlers(app, server, engine)
 
     loop = asyncio.get_running_loop()
 
@@ -44,3 +50,37 @@ async def dummy_shutdown() -> None:
     except asyncio.CancelledError:
         logger.info("Gracefully stopping http server")
         return server.shutdown()
+
+
+def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server,
+                           engine: AsyncEngineClient) -> None:
+    """Adds handlers for fatal errors that should crash the server"""
+
+    @app.exception_handler(RuntimeError)
+    async def runtime_error_handler(_, __):
+        """On generic runtime error, check to see if the engine has died.
+        It probably has, in which case the server will no longer be able to
+        handle requests. Trigger a graceful shutdown with a SIGTERM."""
+        if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine.errored
+                and not engine.is_running):
+            logger.fatal("AsyncLLMEngine has failed, terminating server "
+                         "process")
+            # See discussions here on shutting down a uvicorn server
+            # https://github.com/encode/uvicorn/discussions/1103
+            # In this case we cannot await the server shutdown here because
+            # this handler must first return to close the connection for
+            # this request.
+            server.should_exit = True
+
+        return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
+
+    @app.exception_handler(AsyncEngineDeadError)
+    async def engine_dead_handler(_, __):
+        """Kill the server if the async engine is already dead. It will
+        not handle any further requests."""
+        if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH:
+            logger.fatal("AsyncLLMEngine is already dead, terminating server "
+                         "process")
+            server.should_exit = True
+
+        return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
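For readers unfamiliar with the uvicorn shutdown mechanism used here, the following is a minimal standalone sketch of the same pattern (not vLLM code; the route and handler names are made up): the exception handler returns the 500 response first so the in-flight connection closes, and flipping server.should_exit tells uvicorn to wind down afterwards.

from http import HTTPStatus

import uvicorn
from fastapi import FastAPI, Response

app = FastAPI()


@app.get("/boom")
async def boom():
    # Stands in for a request that hits a dead engine.
    raise RuntimeError("simulated engine death")


def add_shutdown_handler(app: FastAPI, server: uvicorn.Server) -> None:
    @app.exception_handler(RuntimeError)
    async def runtime_error_handler(_, __):
        # Flag uvicorn to exit; the handler must return first so the
        # failing request's connection can be closed cleanly.
        server.should_exit = True
        return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)


if __name__ == "__main__":
    config = uvicorn.Config(app, host="127.0.0.1", port=8001)
    server = uvicorn.Server(config)
    add_shutdown_handler(app, server)
    server.run()  # exits shortly after GET /boom is served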

vllm/entrypoints/openai/api_server.py

Lines changed: 1 addition & 0 deletions
@@ -357,6 +357,7 @@ async def run_server(args, **uvicorn_kwargs) -> None:
 
     shutdown_task = await serve_http(
         app,
+        engine=async_engine_client,
         host=args.host,
         port=args.port,
         log_level=args.uvicorn_log_level,

vllm/entrypoints/openai/rpc/client.py

Lines changed: 24 additions & 2 deletions
@@ -33,6 +33,7 @@ async def setup(self):
 
         # Wait until server is ready.
         await self.wait_for_server()
+        self._errored = False
 
         # Get the configs.
         self.model_config = await self._get_model_config_rpc()
@@ -169,15 +170,15 @@ async def _get_scheduler_config_rpc(self) -> SchedulerConfig:
             expected_type=SchedulerConfig,
             error_message="Could not get SchedulerConfig from RPC Server")
 
-    async def _get_lora_config_rpc(self):
+    async def _get_lora_config_rpc(self) -> LoRAConfig:
         """Get LoRAConfig from the RPCServer"""
 
         return await self._send_get_data_rpc_request(
             RPCUtilityRequest.GET_LORA_CONFIG,
             expected_type=LoRAConfig,
             error_message="Could not get LoRAConfig from RPC Server")
 
-    async def _is_tracing_enabled_rpc(self) -> ParallelConfig:
+    async def _is_tracing_enabled_rpc(self) -> bool:
         """Get is_tracing_enabled flag from the RPCServer"""
 
         return await self._send_get_data_rpc_request(
@@ -200,6 +201,18 @@ async def do_log_stats(self):
             request=RPCUtilityRequest.DO_LOG_STATS,
             error_message="RPCRequest DO_LOG_STATS failed.")
 
+    @property
+    def is_running(self) -> bool:
+        return not self._errored
+
+    @property
+    def is_stopped(self) -> bool:
+        return self._errored
+
+    @property
+    def errored(self) -> bool:
+        return self._errored
+
     async def generate(
         self,
         inputs: PromptInputs,
@@ -233,6 +246,15 @@ async def generate(
             request_output = cloudpickle.loads(message)
 
             if isinstance(request_output, Exception):
+                # On exception, check if the server is still healthy.
+                # Use this to set the sync `is_running` and `errored`
+                # properties.
+                try:
+                    await self.check_health()
+                except Exception:
+                    self._errored = True
+                # NB: do before raising here so that the flag is set
+                # by the time the caller receives this exception
                 raise request_output
 
             finished = request_output.finished
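This client-side change is what lets the launcher's RuntimeError handler consult engine.errored and engine.is_running after a failed request: the exception path probes the backend and latches a flag before re-raising. A rough standalone sketch of that shape (hypothetical class and method names, not the vLLM RPC client):

class BackendClient:
    """Tracks backend health with a latched flag, set on failed probes."""

    def __init__(self):
        self._errored = False

    async def check_health(self) -> None:
        # Assumed to raise if the backend is unreachable or dead.
        ...

    @property
    def errored(self) -> bool:
        return self._errored

    @property
    def is_running(self) -> bool:
        return not self._errored

    async def handle_result(self, result):
        if isinstance(result, Exception):
            try:
                await self.check_health()
            except Exception:
                # Latch before re-raising so callers that catch the
                # exception already observe errored == True.
                self._errored = True
            raise result
        return result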

vllm/entrypoints/openai/rpc/server.py

Lines changed: 11 additions & 8 deletions
@@ -96,14 +96,17 @@ async def is_server_ready(self, identity):
 
     async def abort(self, identity, request: RPCAbortRequest):
         """Abort request and notify the client of success."""
-        # Abort the request in the llm engine.
-        await self.engine.abort(request.request_id)
-
-        # Send confirmation to the client.
-        await self.socket.send_multipart([
-            identity,
-            cloudpickle.dumps(VLLM_RPC_SUCCESS_STR),
-        ])
+        try:
+            # Abort the request in the llm engine.
+            await self.engine.abort(request.request_id)
+        except Exception:
+            logger.warning("Failed to abort request %s", request.request_id)
+        finally:
+            # Send confirmation to the client.
+            await self.socket.send_multipart([
+                identity,
+                cloudpickle.dumps(VLLM_RPC_SUCCESS_STR),
+            ])
 
     async def generate(self, identity, generate_request: RPCGenerateRequest):
         try:
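The reshaped abort handler follows a general "always acknowledge" pattern: do the risky work in try, log on failure, and send the reply in finally so the client is never left waiting on a confirmation that will not come. A generic sketch under assumed names (engine, send_ack are placeholders, not the vLLM server's API):

import logging

logger = logging.getLogger(__name__)


async def abort_and_ack(engine, send_ack, request_id: str) -> None:
    try:
        # The engine may already be dead, in which case abort() raises.
        await engine.abort(request_id)
    except Exception:
        logger.warning("Failed to abort request %s", request_id)
    finally:
        # The client is unblocked either way.
        await send_ack()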

vllm/envs.py

Lines changed: 6 additions & 0 deletions
@@ -49,6 +49,7 @@
     NVCC_THREADS: Optional[str] = None
     VLLM_USE_PRECOMPILED: bool = False
     VLLM_NO_DEPRECATION_WARNING: bool = False
+    VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
     CMAKE_BUILD_TYPE: Optional[str] = None
     VERBOSE: bool = False
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
@@ -335,6 +336,11 @@ def get_default_config_root():
     "VLLM_NO_DEPRECATION_WARNING":
     lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),
 
+    # If set, the OpenAI API server will stay alive even after the underlying
+    # AsyncLLMEngine errors and stops serving requests
+    "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH":
+    lambda: bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),
+
     # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
     # the user to specify a max sequence length greater than
     # the max length derived from the model's config.json.
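One thing worth noting about the new entry: unlike its neighbor VLLM_NO_DEPRECATION_WARNING, the lambda skips the int() conversion, so as far as I can tell from the diff any non-empty value enables keep-alive, including "0". A small standalone check of how that parsing behaves (mirrors the lambda, not the actual envs module):

import os


def keep_alive_enabled() -> bool:
    # Mirrors the lambda added in the diff above.
    return bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0))


os.environ.pop("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", None)
assert keep_alive_enabled() is False    # unset: server exits on engine death

os.environ["VLLM_KEEP_ALIVE_ON_ENGINE_DEATH"] = "1"
assert keep_alive_enabled() is True     # set: server stays alive

os.environ["VLLM_KEEP_ALIVE_ON_ENGINE_DEATH"] = "0"
assert keep_alive_enabled() is True     # "0" is a non-empty string -> truthy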
