
Commit adf1313

gcalmettes authored and zhewenl committed
[gpt-oss] add input/output usage in responses api when harmony context is leveraged (vllm-project#22667)
Signed-off-by: Guillaume Calmettes <[email protected]>
1 parent 8b25ae8 commit adf1313

File tree

1 file changed: +26 −2 lines


vllm/entrypoints/context.py

Lines changed: 26 additions & 2 deletions
@@ -3,6 +3,7 @@
 import json
 import logging
 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from typing import TYPE_CHECKING, Union

 from openai_harmony import Author, Message, Role, StreamState, TextContent
@@ -67,15 +68,27 @@ def __init__(

         self.parser = get_streamable_parser_for_assistant()
         self.num_init_messages = len(messages)
-        # TODO(woosuk): Implement the following fields.
         self.num_prompt_tokens = 0
-        self.num_cached_tokens = 0
         self.num_output_tokens = 0
+        # TODO(woosuk): Implement the following fields.
+        self.num_cached_tokens = 0
         self.num_reasoning_tokens = 0

+    def _update_num_prompt_tokens(self, output: RequestOutput):
+        if output.prompt_token_ids and len(output.prompt_token_ids) > 0:
+            # NOTE: with built-in tools, there might be multiple rounds in
+            # the conversation, with the full conversation being resent
+            # as new prompt each time. Hence the sum.
+            self.num_prompt_tokens += len(output.prompt_token_ids)
+
+    def _update_num_output_tokens(self, token_ids: Sequence[int]):
+        self.num_output_tokens += len(token_ids)
+
     def append_output(self, output) -> None:
         if isinstance(output, RequestOutput):
+            self._update_num_prompt_tokens(output)
             output_token_ids = output.outputs[0].token_ids
+            self._update_num_output_tokens(output_token_ids)
             self.parser = get_streamable_parser_for_assistant()
             for token_id in output_token_ids:
                 self.parser.process(token_id)
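
The accumulation (`+=`) rather than assignment in `_update_num_prompt_tokens` is the key detail: with built-in tools the engine may be invoked several times for a single response, and each round re-sends the whole conversation as a fresh prompt. A minimal sketch of that accounting, with made-up round shapes and token counts (plain dicts, not vLLM's actual types):

```python
# Hypothetical rounds of a tool-using response: each round's prompt is the
# full conversation so far, re-rendered to tokens, so prompts overlap.
rounds = [
    {"prompt_token_ids": list(range(120)), "output_token_ids": list(range(30))},  # initial user turn
    {"prompt_token_ids": list(range(180)), "output_token_ids": list(range(25))},  # prompt grew after tool result
]

num_prompt_tokens = 0
num_output_tokens = 0
for r in rounds:
    # Mirrors _update_num_prompt_tokens / _update_num_output_tokens:
    # accumulate across rounds instead of overwriting.
    num_prompt_tokens += len(r["prompt_token_ids"])
    num_output_tokens += len(r["output_token_ids"])

print(num_prompt_tokens, num_output_tokens)  # 300 55
```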
@@ -158,15 +171,26 @@ def __init__(self, *args, **kwargs):
         self.parser = get_streamable_parser_for_assistant()
         self.encoding = get_encoding()
         self.last_tok = None
+        self.first_tok_of_message = True

     @property
     def messages(self) -> list:
         return self.parser.messages

     def append_output(self, output) -> None:
         if isinstance(output, RequestOutput):
+            # append_output is called for each output token in streaming case,
+            # so we only want to add the prompt tokens once for each message.
+            if self.first_tok_of_message:
+                self._update_num_prompt_tokens(output)
+            # Reset self.first_tok_of_message if needed:
+            # if the current token is the last one of the current message
+            # (finished=True), then the next token processed will mark the
+            # beginning of a new message
+            self.first_tok_of_message = output.finished
             tok = output.outputs[0].token_ids[0]
             self.parser.process(tok)
+            self._update_num_output_tokens(output.outputs[0].token_ids)
             self.last_tok = tok
         else:
             # Handle the case of tool output in direct message format
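
In the streaming path `append_output` fires once per generated token, so counting prompt tokens on every call would overcount by a factor of the message length; the `first_tok_of_message` flag gates the prompt-side update to the first token of each message. A toy simulation of that gating, using stand-in classes (`FakeRequestOutput`, `FakeCompletion` are hypothetical) that only assume each streamed output carries one token, repeats the same `prompt_token_ids`, and sets `finished=True` on the message's last token:

```python
from dataclasses import dataclass


@dataclass
class FakeCompletion:
    token_ids: list


@dataclass
class FakeRequestOutput:
    prompt_token_ids: list
    outputs: list
    finished: bool


def stream(prompt_len: int, n_tokens: int):
    # One RequestOutput per generated token, as in the streaming path.
    for i in range(n_tokens):
        yield FakeRequestOutput(
            prompt_token_ids=list(range(prompt_len)),
            outputs=[FakeCompletion(token_ids=[i])],
            finished=(i == n_tokens - 1),
        )


num_prompt_tokens = num_output_tokens = 0
first_tok_of_message = True
for out in stream(prompt_len=100, n_tokens=5):
    if first_tok_of_message:
        # Prompt tokens counted once per message, not once per token.
        num_prompt_tokens += len(out.prompt_token_ids)
    first_tok_of_message = out.finished
    num_output_tokens += len(out.outputs[0].token_ids)

print(num_prompt_tokens, num_output_tokens)  # 100 5
```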
