
Commit 7c7714d

Authored by alexm-redhat, njhill, rshaw@neuralmagic.com, robertgshaw2-redhat, and simon-mo
[Core][Bugfix][Perf] Introduce MQLLMEngine to avoid asyncio OH (#8157)
Co-authored-by: Nick Hill <[email protected]>
Co-authored-by: [email protected] <[email protected]>
Co-authored-by: Robert Shaw <[email protected]>
Co-authored-by: Simon Mo <[email protected]>
1 parent 9d104b5 commit 7c7714d

36 files changed, +1467 -1172 lines

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 1 deletion
@@ -43,13 +43,15 @@ steps:
   fast_check: true
   source_file_dependencies:
   - vllm/
+  - tests/mq_llm_engine
   - tests/async_engine
   - tests/test_inputs
   - tests/multimodal
   - tests/test_utils
   - tests/worker
   commands:
-  - pytest -v -s async_engine # Async Engine
+  - pytest -v -s mq_llm_engine # MQLLMEngine
+  - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal

docs/source/dev/profiling/profiling_index.rst

Lines changed: 2 additions & 2 deletions
@@ -21,8 +21,8 @@ Traces can be visualized using https://ui.perfetto.dev/.
 .. tip::

     To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
-    Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a big number before you start the server. Say something like 30 minutes.
-    ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+    Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
+    ``export VLLM_RPC_TIMEOUT=1800000``

 Example commands and usage:
 ===========================
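
As a quick illustration of the renamed variable, here is a minimal sketch of raising the RPC timeout before launching the OpenAI-compatible server from Python. The variable name and value come from the docs change above; the module entrypoint invocation and the model name are illustrative assumptions.

import os
import subprocess

# Minimal sketch: VLLM_RPC_TIMEOUT must be set in the environment before the
# server starts. 1800000 ms = 30 minutes, matching the docs change above.
env = dict(os.environ, VLLM_RPC_TIMEOUT="1800000")
subprocess.run(
    [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", "meta-llama/Meta-Llama-3-70B-Instruct",  # illustrative model
    ],
    env=env,
    check=True,
)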

tests/async_engine/test_openapi_server.py

Lines changed: 0 additions & 106 deletions
This file was deleted.

tests/entrypoints/openai/rpc/test_zmq_client.py

Lines changed: 0 additions & 120 deletions
This file was deleted.

tests/entrypoints/openai/test_accuracy.py

Lines changed: 25 additions & 31 deletions
@@ -18,38 +18,32 @@
 FILTER = "exact_match,strict-match"
 RTOL = 0.03
 EXPECTED_VALUE = 0.58
+DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
+MORE_ARGS_LIST = [["--enable-chunked-prefill"], ["--num-scheduler-steps", "8"]]


-@pytest.fixture(scope="module")
-def server():
-    args = [
-        "--max-model-len", "4096", "--enable-chunked-prefill",
-        "--disable-log-requests", "--enforce-eager"
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
-@pytest.fixture(scope="module")
-def server_data(server):
-    return {
-        "url": f"{server.url_for('v1')}/completions",
-    }
+@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
+def test_lm_eval_accuracy(more_args):
+    args = list(DEFAULT_ARGS)
+    args.extend(more_args)

+    print(f"Running with: {args}")

-def test_lm_eval_accuracy(server_data):
-    model_args = (f"model={MODEL_NAME},"
-                  f"base_url={server_data['url']},"
-                  f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
-
-    results = lm_eval.simple_evaluate(
-        model="local-completions",
-        model_args=model_args,
-        tasks=TASK,
-    )
-
-    measured_value = results["results"][TASK][FILTER]
-    assert (measured_value - RTOL < EXPECTED_VALUE
-            and measured_value + RTOL > EXPECTED_VALUE
-            ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        url = f"{remote_server.url_for('v1')}/completions"
+
+        model_args = (
+            f"model={MODEL_NAME},"
+            f"base_url={url},"
+            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
+
+        results = lm_eval.simple_evaluate(
+            model="local-completions",
+            model_args=model_args,
+            tasks=TASK,
+        )
+
+        measured_value = results["results"][TASK][FILTER]
+        assert (measured_value - RTOL < EXPECTED_VALUE
+                and measured_value + RTOL > EXPECTED_VALUE
+                ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"

tests/async_engine/test_chat_template.py renamed to tests/entrypoints/openai/test_chat_template.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer

-from ..utils import VLLM_PATH
+from ...utils import VLLM_PATH

 chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()

tests/entrypoints/openai/test_mp_api_server.py

Lines changed: 0 additions & 40 deletions
This file was deleted.

tests/entrypoints/openai/test_serving_chat.py

Lines changed: 3 additions & 2 deletions
@@ -4,7 +4,7 @@
 from unittest.mock import MagicMock

 from vllm.config import MultiModalConfig
-from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -52,8 +52,9 @@ def test_async_serving_chat_init():


 def test_serving_chat_should_set_correct_max_tokens():
-    mock_engine = MagicMock(spec=AsyncLLMEngine)
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False

     serving_chat = OpenAIServingChat(mock_engine,
                                      MockModelConfig(),
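
The hunk above captures the pattern downstream tests now follow: mock the multiprocessing client rather than AsyncLLMEngine, and give the mock an explicit errored flag so it reports a healthy engine. A self-contained sketch of that setup, with an assumed placeholder model id:

from unittest.mock import MagicMock

from vllm.engine.multiprocessing.client import MQLLMEngineClient
from vllm.transformers_utils.tokenizer import get_tokenizer

MODEL_NAME = "facebook/opt-125m"  # assumed placeholder; any HF model id works

# Mirror the updated test: the serving layer now talks to an MQLLMEngineClient.
mock_engine = MagicMock(spec=MQLLMEngineClient)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False  # the mocked client reports a healthy engine state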
