
[Frontend] Batch inference for llm.chat() API #8648


Merged (18 commits) on Sep 24, 2024
Changes from 6 commits
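
For orientation before the diff: this PR lets `llm.chat()` accept either a single conversation or a list of conversations and run them as a batch. Below is a minimal sketch of the batched call, modeled on the `test_multi_chat` test added in this PR; the model name and prompts are illustrative, not prescribed by the change.

```python
from vllm import LLM

# Placeholder model; any chat-capable model is used the same way.
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")

conversation1 = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Explain the concept of entropy."},
]
conversation2 = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Explain what Among Us is."},
]

# One result comes back per conversation, in the same order as the input.
outputs = llm.chat([conversation1, conversation2])
assert len(outputs) == 2
```
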
tests/entrypoints/llm/test_generate.py — 35 additions & 0 deletions

@@ -162,6 +162,41 @@ def test_chat():
assert len(outputs) == 1


def test_multi_chat():
    """llm.chat() should accept a batch of conversations and return one output per conversation."""
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")

prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is."

conversation1 = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": prompt1
},
]

conversation2 = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": prompt2
},
]

messages = [conversation1, conversation2]

outputs = llm.chat(messages)
assert len(outputs) == 2


@pytest.mark.parametrize("image_urls",
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
vllm/entrypoints/llm.py — 86 additions & 39 deletions

@@ -349,6 +349,7 @@ def generate(
outputs = self._run_engine(use_tqdm=use_tqdm)
return LLMEngine.validate_outputs(outputs, RequestOutput)

@overload
def chat(
self,
messages: List[ChatCompletionMessageParam],
@@ -360,6 +361,34 @@ def chat(
add_generation_prompt: bool = True,
tools: Optional[List[Dict[str, Any]]] = None,
) -> List[RequestOutput]:
...

@overload
def chat(
self,
messages: List[List[ChatCompletionMessageParam]],
sampling_params: Optional[Union[SamplingParams,
List[SamplingParams]]] = None,
use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None,
chat_template: Optional[str] = None,
add_generation_prompt: bool = True,
tools: Optional[List[Dict[str, Any]]] = None,
) -> List[List[RequestOutput]]:
...

def chat(
self,
messages: Union[List[ChatCompletionMessageParam],
List[List[ChatCompletionMessageParam]]],
sampling_params: Optional[Union[SamplingParams,
List[SamplingParams]]] = None,
use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None,
chat_template: Optional[str] = None,
add_generation_prompt: bool = True,
tools: Optional[List[Dict[str, Any]]] = None,
) -> Union[List[RequestOutput], List[List[RequestOutput]]]:
"""
Generate responses for a chat conversation.

Expand All @@ -371,8 +400,9 @@ def chat(
to the OpenAI API.

Args:
messages: A single conversation represented as a list of messages.
Each message is a dictionary with 'role' and 'content' keys.
messages: A list of conversations or a single conversation.
- Each conversation is represented as a list of messages.
- Each message is a dictionary with 'role' and 'content' keys.
sampling_params: The sampling parameters for text generation.
If None, we use the default sampling parameters. When it
is a single value, it is applied to every prompt. When it
@@ -386,49 +416,66 @@ def chat(
to each message.

Returns:
A list of ``RequestOutput`` objects containing the generated
responses in the same order as the input messages.
A list of lists or single list of ``RequestOutput`` objects
containing the generated responses in the same order as the input
conversations and messages.
"""
list_of_messages: List[List[ChatCompletionMessageParam]]

tokenizer = self.get_tokenizer()
model_config = self.llm_engine.get_model_config()

conversation, mm_data = parse_chat_messages(messages, model_config,
tokenizer)

prompt: Union[str, List[int]]
if isinstance(tokenizer, MistralTokenizer):
prompt = apply_mistral_chat_template(
tokenizer,
messages=messages,
chat_template=chat_template,
add_generation_prompt=add_generation_prompt,
tools=tools,
)
# Handle multi and single conversations
if is_list_of(messages, list):
# messages is List[List[...]]
list_of_messages = messages
else:
prompt = apply_hf_chat_template(
tokenizer,
conversation=conversation,
chat_template=chat_template,
add_generation_prompt=add_generation_prompt,
tools=tools,
)
# messages is List[...]
list_of_messages = [messages]

outputs = []

for msgs in list_of_messages:
tokenizer = self.get_tokenizer()
model_config = self.llm_engine.get_model_config()

conversation, mm_data = parse_chat_messages(
msgs, model_config, tokenizer)

prompt: Union[str, List[int]]
if isinstance(tokenizer, MistralTokenizer):
prompt = apply_mistral_chat_template(
tokenizer,
messages=msgs,
chat_template=chat_template,
add_generation_prompt=add_generation_prompt,
tools=tools,
)
else:
prompt = apply_hf_chat_template(
tokenizer,
conversation=conversation,
chat_template=chat_template,
add_generation_prompt=add_generation_prompt,
tools=tools,
)

inputs: PromptInputs
if is_list_of(prompt, int):
inputs = TokensPrompt(prompt_token_ids=prompt)
else:
inputs = TextPrompt(prompt=prompt)

inputs: PromptInputs
if is_list_of(prompt, int):
inputs = TokensPrompt(prompt_token_ids=prompt)
else:
inputs = TextPrompt(prompt=prompt)
if mm_data is not None:
inputs["multi_modal_data"] = mm_data

if mm_data is not None:
inputs["multi_modal_data"] = mm_data
out = self.generate(
inputs,
sampling_params=sampling_params,
use_tqdm=use_tqdm,
lora_request=lora_request,
)
outputs.append(out)

return self.generate(
inputs,
sampling_params=sampling_params,
use_tqdm=use_tqdm,
lora_request=lora_request,
)
# When messages is List[...], return a single list
return outputs if len(outputs) > 1 else outputs[0]

@overload # LEGACY: single (prompt + optional token ids)
def encode(
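
A note on the return convention implemented at the end of `chat()` above: passing a single, un-nested conversation returns a flat `List[RequestOutput]`, while passing multiple conversations returns `List[List[RequestOutput]]` (one inner list per conversation). The sketch below shows how a caller might consume both shapes, assuming the behavior shown in this diff; the model name, prompts, and sampling settings are illustrative.

```python
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")  # placeholder model
params = SamplingParams(temperature=0.0, max_tokens=64)

single = [{"role": "user", "content": "Explain the concept of entropy."}]
batch = [
    [{"role": "user", "content": "Explain the concept of entropy."}],
    [{"role": "user", "content": "Explain what Among Us is."}],
]

# Single conversation -> one flat list of RequestOutput objects.
for out in llm.chat(single, sampling_params=params):
    print(out.outputs[0].text)

# Multiple conversations -> one list of RequestOutput objects per conversation.
for conv_outputs in llm.chat(batch, sampling_params=params):
    for out in conv_outputs:
        print(out.outputs[0].text)
```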