from typing import List, Optional, Tuple

import torch
+import uvloop
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)

-from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators


def sample_requests(
@@ -135,6 +138,93 @@ def run_vllm(
    return end - start


+async def run_vllm_async(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: str,
+    quantization: Optional[str],
+    tensor_parallel_size: int,
+    seed: int,
+    n: int,
+    use_beam_search: bool,
+    trust_remote_code: bool,
+    dtype: str,
+    max_model_len: Optional[int],
+    enforce_eager: bool,
+    kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
+    device: str,
+    enable_prefix_caching: bool,
+    enable_chunked_prefill: bool,
+    max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
+    gpu_memory_utilization: float = 0.9,
+    num_scheduler_steps: int = 1,
+    use_v2_block_manager: bool = False,
+    download_dir: Optional[str] = None,
+    load_format: str = EngineArgs.load_format,
+    disable_async_output_proc: bool = False,
+    disable_frontend_multiprocessing: bool = False,
+) -> float:
+    from vllm import SamplingParams
+    engine_args = AsyncEngineArgs(
+        model=model,
+        tokenizer=tokenizer,
+        quantization=quantization,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        enforce_eager=enforce_eager,
+        kv_cache_dtype=kv_cache_dtype,
+        quantization_param_path=quantization_param_path,
+        device=device,
+        enable_prefix_caching=enable_prefix_caching,
+        download_dir=download_dir,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
+        load_format=load_format,
+        num_scheduler_steps=num_scheduler_steps,
+        use_v2_block_manager=use_v2_block_manager,
+        disable_async_output_proc=disable_async_output_proc,
+        worker_use_ray=False,
+        engine_use_ray=False,
+        disable_log_requests=True,
+    )
+
+    async with build_async_engine_client_from_engine_args(
+            engine_args, disable_frontend_multiprocessing) as llm:
+
+        # Add the requests to the engine.
+        prompts: List[str] = []
+        sampling_params: List[SamplingParams] = []
+        for prompt, _, output_len in requests:
+            prompts.append(prompt)
+            sampling_params.append(
+                SamplingParams(
+                    n=n,
+                    temperature=0.0 if use_beam_search else 1.0,
+                    top_p=1.0,
+                    use_beam_search=use_beam_search,
+                    ignore_eos=True,
+                    max_tokens=output_len,
+                ))
+
+        generators = []
+        start = time.perf_counter()
+        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+            generator = llm.generate(prompt, sp, request_id=f"test{i}")
+            generators.append(generator)
+        all_gens = merge_async_iterators(*generators)
+        async for i, res in all_gens:
+            pass
+        end = time.perf_counter()
+        return end - start
+
+
def run_hf(
    requests: List[Tuple[str, int, int]],
    model: str,
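Illustrative sketch (not part of the diff): the core of run_vllm_async is a fan-out-and-drain timing pattern: one streaming generator per request, all folded into a single async iterator that is drained while the wall clock runs. The snippet below reproduces that pattern with plain asyncio so it runs without vLLM; fake_generate and merge_streams are stand-ins for llm.generate(...) and vllm.utils.merge_async_iterators, used here only to make the control flow visible.

import asyncio
import time
from typing import AsyncIterator, Tuple


async def fake_generate(request_id: str, num_tokens: int) -> AsyncIterator[str]:
    # Stand-in for an engine output stream: yields one "token" at a time.
    for i in range(num_tokens):
        await asyncio.sleep(0)  # pretend to wait on the engine
        yield f"{request_id}-token-{i}"


async def merge_streams(
        *gens: AsyncIterator[str]) -> AsyncIterator[Tuple[int, str]]:
    # Simplified merge: pump every generator into one queue, tag items with
    # the generator index, and stop once every producer has sent a sentinel.
    queue: asyncio.Queue = asyncio.Queue()
    sentinel = object()

    async def pump(idx: int, gen: AsyncIterator[str]) -> None:
        async for item in gen:
            await queue.put((idx, item))
        await queue.put(sentinel)

    tasks = [asyncio.create_task(pump(i, g)) for i, g in enumerate(gens)]
    remaining = len(gens)
    while remaining:
        item = await queue.get()
        if item is sentinel:
            remaining -= 1
        else:
            yield item
    await asyncio.gather(*tasks)


async def benchmark(num_requests: int = 8, tokens_per_request: int = 32) -> float:
    # Mirrors run_vllm_async: build all generators first, then time how long
    # it takes to drain the merged stream (outputs themselves are discarded).
    generators = [
        fake_generate(f"test{i}", tokens_per_request) for i in range(num_requests)
    ]
    start = time.perf_counter()
    async for _i, _res in merge_streams(*generators):
        pass
    return time.perf_counter() - start


if __name__ == "__main__":
    print(f"elapsed: {asyncio.run(benchmark()):.6f} s")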
@@ -230,7 +320,7 @@ def main(args: argparse.Namespace):
                                   args.output_len)

    if args.backend == "vllm":
-        elapsed_time = run_vllm(
+        run_args = [
            requests, args.model, args.tokenizer, args.quantization,
            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
            args.trust_remote_code, args.dtype, args.max_model_len,
@@ -240,7 +330,14 @@ def main(args: argparse.Namespace):
            args.max_num_batched_tokens, args.distributed_executor_backend,
            args.gpu_memory_utilization, args.num_scheduler_steps,
            args.use_v2_block_manager, args.download_dir, args.load_format,
-            args.disable_async_output_proc)
+            args.disable_async_output_proc
+        ]
+
+        if args.async_engine:
+            run_args.append(args.disable_frontend_multiprocessing)
+            elapsed_time = uvloop.run(run_vllm_async(*run_args))
+        else:
+            elapsed_time = run_vllm(*run_args)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -426,6 +523,14 @@ def main(args: argparse.Namespace):
                        action='store_true',
                        default=False,
                        help="Disable async output processor for vLLM backend.")
+    parser.add_argument("--async-engine",
+                        action='store_true',
+                        default=False,
+                        help="Use vLLM async engine rather than LLM class.")
+    parser.add_argument("--disable-frontend-multiprocessing",
+                        action='store_true',
+                        default=False,
+                        help="Disable decoupled async engine frontend.")
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
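Usage note: with this change the vLLM backend keeps the synchronous run_vllm path by default; passing --async-engine routes it through run_vllm_async under uvloop.run(), and --disable-frontend-multiprocessing additionally turns off the decoupled frontend (per its help text) when the async engine client is built.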