
[Frontend] Kill the server on engine death #6594

Merged · 15 commits · Aug 8, 2024 · changes shown from 6 commits

43 changes: 43 additions & 0 deletions tests/entrypoints/openai/test_shutdown.py
@@ -0,0 +1,43 @@
import json
import os

import openai
import pytest

from ...utils import RemoteOpenAIServer

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.mark.asyncio
async def test_shutdown_on_engine_failure(tmp_path):
# Use a bad adapter to crash the engine
# (This test will fail when that bug is fixed)
os.mkdir(tmp_path / "bad_adapter")
with open(tmp_path / "bad_adapter" / "adapter_model_config.json",
"w") as f:
json.dump({"not": "real"}, f)
with open(tmp_path / "bad_adapter" / "adapter_model.safetensors",
"wb") as f:
f.write(b"this is fake")

args = [
"--dtype",
"bfloat16",
"--enforce-eager",
"--enable-lora",
"--lora-modules",
f"bad-adapter={tmp_path / 'bad_adapter'}",
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
client = remote_server.get_async_client()

with pytest.raises(openai.APIConnectionError):
# This crashes the engine
await client.completions.create(model="bad-adapter",
prompt="Hello, my name is")

# Now the server should shut down
rc = remote_server.proc.wait(timeout=1)
assert rc is not None
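
For readers unfamiliar with the test harness, here is a rough sketch of what the `RemoteOpenAIServer` helper used above provides. This is an illustrative assumption, not the actual `tests/utils.py` implementation (the real helper also waits for the server to become healthy and supports the context-manager protocol used in the test).

```python
# Illustrative sketch only -- not the real tests/utils.py RemoteOpenAIServer.
# It launches the OpenAI-compatible API server as a child process, so the test
# can observe the process exit code via `.proc`, and returns a client pointed
# at that server via `get_async_client()`.
import subprocess
import sys

import openai


class RemoteOpenAIServerSketch:

    def __init__(self, model: str, cli_args: list, port: int = 8000):
        self.port = port
        self.proc = subprocess.Popen([
            sys.executable, "-m", "vllm.entrypoints.openai.api_server",
            "--model", model, "--port", str(port), *cli_args
        ])

    def get_async_client(self) -> openai.AsyncOpenAI:
        return openai.AsyncOpenAI(base_url=f"http://localhost:{self.port}/v1",
                                  api_key="EMPTY")
```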
2 changes: 1 addition & 1 deletion vllm/engine/async_llm_engine.py
@@ -57,7 +57,7 @@ def _log_task_completion(task: asyncio.Task,
error_callback(exception)
raise AsyncEngineDeadError(
"Task finished unexpectedly. This should never happen! "
"Please open an issue on Github. See stack trace above for the"
"Please open an issue on Github. See stack trace above for the "

Collaborator (author) commented: (This has just triggered me every time I see the log)

"actual cause.") from e


40 changes: 38 additions & 2 deletions vllm/entrypoints/openai/api_server.py
@@ -18,7 +18,7 @@

import vllm.envs as envs
from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.async_llm_engine import AsyncEngineDeadError, AsyncLLMEngine
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.cli_args import make_arg_parser
# yapf conflicts with isort for this block
@@ -44,6 +44,7 @@

TIMEOUT_KEEP_ALIVE = 5 # seconds

server: uvicorn.Server
engine: AsyncLLMEngine
engine_args: AsyncEngineArgs
openai_serving_chat: OpenAIServingChat
@@ -186,6 +187,37 @@ async def validation_exception_handler(_, exc):
return JSONResponse(err.model_dump(),
status_code=HTTPStatus.BAD_REQUEST)

@app.exception_handler(RuntimeError)
async def runtime_error_handler(_, __):
"""On generic runtime error, check to see if the engine has died.
It probably has, in which case the server will no longer be able to
handle requests. Trigger a graceful shutdown with a SIGTERM."""
if (not args.keep_alive_on_engine_death and engine.errored
and not engine.is_running):
logger.fatal("AsyncLLMEngine has failed, terminating server "
"process")
# See discussions here on shutting down a uvicorn server
# https://github.com/encode/uvicorn/discussions/1103
# In this case we cannot await the server shutdown here because
# this handler must first return to close the connection for
# this request.
global server
server.should_exit = True

return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)

@app.exception_handler(AsyncEngineDeadError)
async def engine_dead_handler(_, __):
"""Kill the server if the async engine is already dead. It will
not handle any further requests."""
if not args.keep_alive_on_engine_death:
logger.fatal("AsyncLLMEngine is already dead, terminating server "
"process")
global server
server.should_exit = True

return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)

if token := envs.VLLM_API_KEY or args.api_key:

@app.middleware("http")
@@ -288,6 +320,8 @@ async def build_server(
methods = ', '.join(route.methods)
logger.info("Route: %s, Methods: %s", route.path, methods)

# Configure and build the uvicorn server
# See `uvicorn.run()` for reference
config = uvicorn.Config(
app,
host=args.host,
@@ -301,7 +335,9 @@
**uvicorn_kwargs,
)

-    return uvicorn.Server(config)
+    global server
+    server = uvicorn.Server(config)
+    return server


async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None:
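
For context (not part of the diff): the new exception handlers return a 500 response for the in-flight request and flip `uvicorn.Server.should_exit`, which the server's main loop picks up on its next tick before shutting down gracefully. A minimal, self-contained sketch of that mechanism follows; the `/boom` route, host, and port are illustrative only.

```python
# Minimal sketch of the uvicorn graceful-shutdown mechanism this PR relies on.
# Hitting /boom raises RuntimeError; the handler answers 500 and asks the
# server to exit, so `server.serve()` returns after the response is sent.
import asyncio
from http import HTTPStatus

import uvicorn
from fastapi import FastAPI, Response

app = FastAPI()
server: uvicorn.Server  # assigned in main(), mirroring the module-level global in api_server.py


@app.exception_handler(RuntimeError)
async def runtime_error_handler(_, __):
    # We cannot await server.shutdown() here: the handler must return first so
    # the connection for this request can be closed. Setting the flag lets
    # uvicorn's main loop notice it on its next tick and exit gracefully.
    server.should_exit = True
    return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)


@app.get("/boom")
async def boom():
    raise RuntimeError("simulated engine death")


async def main():
    global server
    config = uvicorn.Config(app, host="127.0.0.1", port=8000)
    server = uvicorn.Server(config)
    await server.serve()  # returns once should_exit has been set
    print("server exited gracefully")


if __name__ == "__main__":
    asyncio.run(main())
```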
5 changes: 5 additions & 0 deletions vllm/entrypoints/openai/cli_args.py
@@ -134,6 +134,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help="When --max-logprobs is specified, represents single tokens as"
"strings of the form 'token_id:{token_id}' so that tokens that"
"are not JSON-encodable can be identified.")
parser.add_argument("--keep-alive-on-engine-death",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only minor but I wonder if this would be better as an env var since I think it would only be used in debugging scenarios...

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, yeah I can change that. I only recently learned that there's an explicit split between the cli args and vllm.envs

action="store_true",
help="The default behavior is to stop the server "
"process when the LLM engine dies. Set this flag to "
"keep the server up instead.")

parser = AsyncEngineArgs.add_cli_args(parser)
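
Following up on the review discussion above, a hypothetical sketch of the env-var alternative; the variable name `VLLM_KEEP_ALIVE_ON_ENGINE_DEATH` and the helper below are illustrative assumptions, not part of this PR or of `vllm/envs.py`.

```python
# Hypothetical env-var toggle (illustrative only, not part of this PR):
import os


def keep_alive_on_engine_death() -> bool:
    # Disabled by default, i.e. the server still terminates when the engine dies.
    return os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH",
                     "0").lower() in ("1", "true", "yes")
```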
