6 changes: 0 additions & 6 deletions benchmarks/backend_request_func.py
@@ -23,7 +23,6 @@ class RequestFuncInput:
output_len: int
model: str
best_of: int = 1
use_beam_search: bool = False
logprobs: Optional[int] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
@@ -49,7 +48,6 @@ async def async_request_tgi(
assert api_url.endswith("generate_stream")

async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
params = {
"best_of": request_func_input.best_of,
"max_new_tokens": request_func_input.output_len,
@@ -121,7 +119,6 @@ async def async_request_trt_llm(
assert api_url.endswith("generate_stream")

async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
assert request_func_input.best_of == 1
payload = {
"accumulate_tokens": True,
@@ -187,7 +184,6 @@ async def async_request_deepspeed_mii(
) -> RequestFuncOutput:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert request_func_input.best_of == 1
assert not request_func_input.use_beam_search

payload = {
"prompt": request_func_input.prompt,
@@ -235,7 +231,6 @@ async def async_request_openai_completions(
), "OpenAI Completions API URL must end with 'completions' or 'profile'."

async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
payload = {
"model": request_func_input.model,
"prompt": request_func_input.prompt,
@@ -317,7 +312,6 @@ async def async_request_openai_chat_completions(
), "OpenAI Chat Completions API URL must end with 'chat/completions'."

async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
3 changes: 1 addition & 2 deletions benchmarks/benchmark_latency.py
@@ -51,9 +51,8 @@ def main(args: argparse.Namespace):

sampling_params = SamplingParams(
n=args.n,
temperature=0.0 if args.use_beam_search else 1.0,
temperature=1.0,
top_p=1.0,
use_beam_search=args.use_beam_search,
ignore_eos=True,
max_tokens=args.output_len,
)
24 changes: 11 additions & 13 deletions benchmarks/benchmark_prioritization.py
@@ -68,7 +68,6 @@ def run_vllm(
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
@@ -114,9 +113,8 @@
sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
temperature=1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))
@@ -144,15 +142,16 @@ def main(args: argparse.Namespace):
args.output_len)

if args.backend == "vllm":
elapsed_time = run_vllm(
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.gpu_memory_utilization,
args.download_dir)
elapsed_time = run_vllm(requests, args.model, args.tokenizer,
args.quantization, args.tensor_parallel_size,
args.seed, args.n, args.trust_remote_code,
args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching,
args.enable_chunked_prefill,
args.max_num_batched_tokens,
args.gpu_memory_utilization, args.download_dir)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
@@ -203,7 +202,6 @@ def main(args: argparse.Namespace):
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--num-prompts",
type=int,
default=200,
29 changes: 9 additions & 20 deletions benchmarks/benchmark_throughput.py
@@ -73,7 +73,6 @@ def run_vllm(
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
@@ -91,7 +90,6 @@
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
use_new_beam_search_impl: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
@@ -127,19 +125,19 @@
sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
temperature=1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))

if not use_new_beam_search_impl:
use_beam_search = False
Author comment: I keep this dead flag so that it is still possible to benchmark beam search throughput (by changing the code directly), in case someone needs it.


if not use_beam_search:
start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
else:
assert use_beam_search
prompts = [prompt for prompt, _, _ in requests]
# output_len should be the same for all requests.
output_len = requests[0][2]
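
For context on the author comment above: while the dead flag stays False, the timed path in run_vllm() reduces to the pattern sketched below. This is a standalone sketch, not code from the PR; the model name, prompts, and sampling values are placeholder assumptions.

import time

from vllm import LLM, SamplingParams

# Placeholder model and prompts -- substitute whatever you actually benchmark.
llm = LLM(model="facebook/opt-125m")
prompts = ["Hello, my name is", "The capital of France is"]

# Mirror the benchmark's fixed-length sampling setup: ignore EOS and force a
# fixed number of output tokens per request.
sampling_params = SamplingParams(n=1,
                                 temperature=1.0,
                                 top_p=1.0,
                                 ignore_eos=True,
                                 max_tokens=64)

start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
print(f"Elapsed: {end - start:.2f} s, "
      f"{len(prompts) / (end - start):.2f} requests/s")
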
@@ -165,7 +163,6 @@ async def run_vllm_async(
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
@@ -224,9 +221,8 @@ async def run_vllm_async(
sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
temperature=1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))
@@ -248,11 +244,9 @@ def run_hf(
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
use_beam_search: bool,
max_batch_size: int,
trust_remote_code: bool,
) -> float:
assert not use_beam_search
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
if llm.config.model_type == "llama":
@@ -284,7 +278,7 @@ def run_hf(
padding=True).input_ids
llm_outputs = llm.generate(
input_ids=input_ids.cuda(),
do_sample=not use_beam_search,
do_sample=True,
num_return_sequences=n,
temperature=1.0,
top_p=1.0,
@@ -340,7 +334,7 @@ def main(args: argparse.Namespace):
if args.backend == "vllm":
run_args = [
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.tensor_parallel_size, args.seed, args.n,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
@@ -355,12 +349,11 @@ def main(args: argparse.Namespace):
run_args.append(args.disable_frontend_multiprocessing)
elapsed_time = uvloop.run(run_vllm_async(*run_args))
else:
elapsed_time = run_vllm(*run_args, args.use_new_beam_search_impl)
elapsed_time = run_vllm(*run_args)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
args.use_beam_search, args.hf_max_batch_size,
args.trust_remote_code)
args.hf_max_batch_size, args.trust_remote_code)
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len)
@@ -414,8 +407,6 @@ def main(args: argparse.Namespace):
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--use-new-beam-search-impl", action="store_true")
parser.add_argument("--num-prompts",
type=int,
default=1000,
@@ -570,8 +561,6 @@ def main(args: argparse.Namespace):
raise ValueError("dtype must be auto for MII backend.")
if args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.use_beam_search:
raise ValueError("Beam search is not supported for MII backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.hf_max_batch_size is not None:
3 changes: 0 additions & 3 deletions examples/llm_engine_example.py
@@ -18,9 +18,6 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
temperature=0.8,
top_p=0.95,
frequency_penalty=0.1)),
("It is only with the heart that one can see rightly",
SamplingParams(n=3, best_of=3, use_beam_search=True,
temperature=0.0)),
]


18 changes: 0 additions & 18 deletions examples/multilora_inference.py
@@ -43,15 +43,6 @@ def create_test_prompts(
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora", 1, lora_path)),
(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
SamplingParams(n=3,
best_of=3,
use_beam_search=True,
temperature=0,
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora", 1, lora_path)),
(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
SamplingParams(temperature=0.0,
@@ -60,15 +51,6 @@
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora2", 2, lora_path)),
(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
SamplingParams(n=3,
best_of=3,
use_beam_search=True,
temperature=0,
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora", 1, lora_path)),
]


48 changes: 0 additions & 48 deletions tests/basic_correctness/test_preemption.py
@@ -197,54 +197,6 @@ def test_swap(
assert total_preemption == total_recorded_preemption


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("beam_width", [4])
@pytest.mark.parametrize("use_v2_block_manager", [True, False])
def test_swap_infeasible(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
beam_width: int,
worker_use_ray: bool,
use_v2_block_manager: bool,
) -> None:
"""Verify infeasible swap request will be ignored."""
BLOCK_SIZE = 16
prefill_blocks = 2
decode_blocks = max_tokens // BLOCK_SIZE
example_prompts = example_prompts[:1]
with vllm_runner(
model,
dtype=dtype,
swap_space=10,
block_size=BLOCK_SIZE,
# Since beam search have more than 1 sequence, prefill +
# decode blocks are not enough to finish.
num_gpu_blocks_override=prefill_blocks + decode_blocks,
max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
worker_use_ray=worker_use_ray,
use_v2_block_manager=use_v2_block_manager,
) as vllm_model:
sampling_params = SamplingParams(n=beam_width,
use_beam_search=True,
temperature=0.0,
max_tokens=max_tokens,
ignore_eos=True)
req_outputs = vllm_model.model.generate(
example_prompts,
sampling_params=sampling_params,
)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)

# Verify the request is ignored and not hang.
assert req_outputs[0].outputs[0].finish_reason == "length"


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
14 changes: 0 additions & 14 deletions tests/conftest.py
@@ -782,7 +782,6 @@ def generate_encoder_decoder_greedy_logprobs(
List[TokensTextLogprobsPromptLogprobs]]:
greedy_logprobs_params = SamplingParams(
temperature=0.0,
use_beam_search=False,
max_tokens=max_tokens,
logprobs=num_logprobs,
prompt_logprobs=(num_prompt_logprobs),
@@ -794,19 +793,6 @@
return self.generate_encoder_decoder_w_logprobs(
encoder_decoder_prompts, greedy_logprobs_params)

def generate_beam_search(
self,
prompts: List[str],
beam_width: int,
max_tokens: int,
) -> List[Tuple[List[List[int]], List[str]]]:
beam_search_params = SamplingParams(n=beam_width,
use_beam_search=True,
temperature=0.0,
max_tokens=max_tokens)
outputs = self.generate(prompts, beam_search_params)
return outputs

def generate_beam_search_new(
self,
prompts: Union[List[str], List[List[int]]],