
[Frontend] Kill the server on engine death #6594

Merged · 15 commits · Aug 8, 2024 · changes shown from 6 commits

43 changes: 43 additions & 0 deletions tests/entrypoints/openai/test_shutdown.py
@@ -0,0 +1,43 @@
import json
import os

import openai
import pytest

from ...utils import RemoteOpenAIServer

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.mark.asyncio
async def test_shutdown_on_engine_failure(tmp_path):
# Use a bad adapter to crash the engine
# (This test will fail when that bug is fixed)
os.mkdir(tmp_path / "bad_adapter")
with open(tmp_path / "bad_adapter" / "adapter_model_config.json",
"w") as f:
json.dump({"not": "real"}, f)
with open(tmp_path / "bad_adapter" / "adapter_model.safetensors",
"wb") as f:
f.write(b"this is fake")

args = [
"--dtype",
"bfloat16",
"--enforce-eager",
"--enable-lora",
"--lora-modules",
f"bad-adapter={tmp_path / 'bad_adapter'}",
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
client = remote_server.get_async_client()

with pytest.raises(openai.APIConnectionError):
# This crashes the engine
await client.completions.create(model="bad-adapter",
prompt="Hello, my name is")

# Now the server should shut down
rc = remote_server.proc.wait(timeout=1)
assert rc is not None
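
For readers unfamiliar with the test harness, here is a rough sketch of what the `RemoteOpenAIServer` helper used above provides. This is an illustrative assumption, not the actual `tests/utils.py` implementation (the real helper also waits for the server to become healthy and supports the context-manager protocol used in the test).

```python
# Illustrative sketch only -- not the real tests/utils.py RemoteOpenAIServer.
# It launches the OpenAI-compatible API server as a child process, so the test
# can observe the process exit code via `.proc`, and returns a client pointed
# at that server via `get_async_client()`.
import subprocess
import sys

import openai


class RemoteOpenAIServerSketch:

    def __init__(self, model: str, cli_args: list, port: int = 8000):
        self.port = port
        self.proc = subprocess.Popen([
            sys.executable, "-m", "vllm.entrypoints.openai.api_server",
            "--model", model, "--port", str(port), *cli_args
        ])

    def get_async_client(self) -> openai.AsyncOpenAI:
        return openai.AsyncOpenAI(base_url=f"http://localhost:{self.port}/v1",
                                  api_key="EMPTY")
```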
2 changes: 1 addition & 1 deletion vllm/engine/async_llm_engine.py
@@ -57,7 +57,7 @@ def _log_task_completion(task: asyncio.Task,
error_callback(exception)
raise AsyncEngineDeadError(
"Task finished unexpectedly. This should never happen! "
"Please open an issue on Github. See stack trace above for the"
"Please open an issue on Github. See stack trace above for the "

Collaborator (author) commented: (This has just triggered me every time I see the log)

"actual cause.") from e


40 changes: 38 additions & 2 deletions vllm/entrypoints/openai/api_server.py
@@ -18,7 +18,7 @@

import vllm.envs as envs
from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.async_llm_engine import AsyncEngineDeadError, AsyncLLMEngine
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.cli_args import make_arg_parser
# yapf conflicts with isort for this block
@@ -44,6 +44,7 @@

TIMEOUT_KEEP_ALIVE = 5 # seconds

server: uvicorn.Server
engine: AsyncLLMEngine
engine_args: AsyncEngineArgs
openai_serving_chat: OpenAIServingChat
@@ -186,6 +187,37 @@ async def validation_exception_handler(_, exc):
return JSONResponse(err.model_dump(),
status_code=HTTPStatus.BAD_REQUEST)

@app.exception_handler(RuntimeError)
async def runtime_error_handler(_, __):
"""On generic runtime error, check to see if the engine has died.
It probably has, in which case the server will no longer be able to
handle requests. Trigger a graceful shutdown with a SIGTERM."""
if (not args.keep_alive_on_engine_death and engine.errored
and not engine.is_running):
logger.fatal("AsyncLLMEngine has failed, terminating server "
"process")
# See discussions here on shutting down a uvicorn server
# https://github.com/encode/uvicorn/discussions/1103
# In this case we cannot await the server shutdown here because
# this handler must first return to close the connection for
# this request.
global server
server.should_exit = True

return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)

@app.exception_handler(AsyncEngineDeadError)
async def engine_dead_handler(_, __):
"""Kill the server if the async engine is already dead. It will
not handle any further requests."""
if not args.keep_alive_on_engine_death:
logger.fatal("AsyncLLMEngine is already dead, terminating server "
"process")
global server
server.should_exit = True

return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)

if token := envs.VLLM_API_KEY or args.api_key:

@app.middleware("http")
@@ -288,6 +320,8 @@ async def build_server(
methods = ', '.join(route.methods)
logger.info("Route: %s, Methods: %s", route.path, methods)

# Configure and build the uvicorn server
# See `uvicorn.run()` for reference
config = uvicorn.Config(
app,
host=args.host,
@@ -301,7 +335,9 @@
**uvicorn_kwargs,
)

-    return uvicorn.Server(config)
+    global server
+    server = uvicorn.Server(config)
+    return server


async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None:
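
For context (not part of the diff): the new exception handlers return a 500 response for the in-flight request and flip `uvicorn.Server.should_exit`, which the server's main loop picks up on its next tick before shutting down gracefully. A minimal, self-contained sketch of that mechanism follows; the `/boom` route, host, and port are illustrative only.

```python
# Minimal sketch of the uvicorn graceful-shutdown mechanism this PR relies on.
# Hitting /boom raises RuntimeError; the handler answers 500 and asks the
# server to exit, so `server.serve()` returns after the response is sent.
import asyncio
from http import HTTPStatus

import uvicorn
from fastapi import FastAPI, Response

app = FastAPI()
server: uvicorn.Server  # assigned in main(), mirroring the module-level global in api_server.py


@app.exception_handler(RuntimeError)
async def runtime_error_handler(_, __):
    # We cannot await server.shutdown() here: the handler must return first so
    # the connection for this request can be closed. Setting the flag lets
    # uvicorn's main loop notice it on its next tick and exit gracefully.
    server.should_exit = True
    return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)


@app.get("/boom")
async def boom():
    raise RuntimeError("simulated engine death")


async def main():
    global server
    config = uvicorn.Config(app, host="127.0.0.1", port=8000)
    server = uvicorn.Server(config)
    await server.serve()  # returns once should_exit has been set
    print("server exited gracefully")


if __name__ == "__main__":
    asyncio.run(main())
```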
5 changes: 5 additions & 0 deletions vllm/entrypoints/openai/cli_args.py
@@ -134,6 +134,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help="When --max-logprobs is specified, represents single tokens as"
"strings of the form 'token_id:{token_id}' so that tokens that"
"are not JSON-encodable can be identified.")
parser.add_argument("--keep-alive-on-engine-death",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only minor but I wonder if this would be better as an env var since I think it would only be used in debugging scenarios...

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, yeah I can change that. I only recently learned that there's an explicit split between the cli args and vllm.envs

action="store_true",
help="The default behavior is to stop the server "
"process when the LLM engine dies. Set this flag to "
"keep the server up instead.")

parser = AsyncEngineArgs.add_cli_args(parser)
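
Following up on the review discussion above, a hypothetical sketch of the env-var alternative; the variable name `VLLM_KEEP_ALIVE_ON_ENGINE_DEATH` and the helper below are illustrative assumptions, not part of this PR or of `vllm/envs.py`.

```python
# Hypothetical env-var toggle (illustrative only, not part of this PR):
import os


def keep_alive_on_engine_death() -> bool:
    # Disabled by default, i.e. the server still terminates when the engine dies.
    return os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH",
                     "0").lower() in ("1", "true", "yes")
```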
