
Commit 636d847

pandada8 authored and zhanghao.smooth committed
Prevent returning partial stop string in vllm worker (lm-sys#2780)
1 parent a2723ef commit 636d847

File tree: 1 file changed (+11 −2 lines)


fastchat/serve/vllm_worker.py

Lines changed: 11 additions & 2 deletions
@@ -22,7 +22,7 @@
     logger,
     worker_id,
 )
-from fastchat.utils import get_context_length
+from fastchat.utils import get_context_length, is_partial_stop
 
 
 app = FastAPI()
@@ -119,7 +119,12 @@ async def generate_stream(self, params):
             else:
                 text_outputs = [output.text for output in request_output.outputs]
             text_outputs = " ".join(text_outputs)
-            # Note: usage is not supported yet
+
+            partial_stop = any(is_partial_stop(text_outputs, i) for i in stop)
+            # prevent yielding partial stop sequence
+            if partial_stop:
+                continue
+
             prompt_tokens = len(request_output.prompt_token_ids)
             completion_tokens = sum(
                 len(output.token_ids) for output in request_output.outputs
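
Note on the hunk above: is_partial_stop (imported from fastchat.utils in the first hunk) reports whether the tail of the generated text could still grow into one of the stop strings, which is why the worker skips the yield with `continue` rather than streaming a fragment of a stop sequence to the client. A minimal sketch of the helper, assuming it matches the definition in fastchat/utils.py at this commit:

    # Sketch of the helper; the real definition lives in fastchat/utils.py.
    def is_partial_stop(output: str, stop_str: str) -> bool:
        """Check whether `output` ends with a (possibly partial) prefix of `stop_str`."""
        for i in range(min(len(output), len(stop_str))):
            # Compare the last i+1 characters of the output against the
            # start of the stop string.
            if stop_str.startswith(output[-i - 1 :]):
                return True
        return False

    # Example: with stop string "</s>", an in-flight output ending in "</"
    # is a partial stop, so that chunk is withheld until more tokens arrive.
    assert is_partial_stop("Hello world </", "</s>")
    assert not is_partial_stop("Hello world", "</s>")
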
@@ -139,6 +144,10 @@ async def generate_stream(self, params):
                 if len(request_output.outputs) == 1
                 else [output.finish_reason for output in request_output.outputs],
             }
+            # Emit twice here to ensure a 'finish_reason' with empty content in the OpenAI API response.
+            # This aligns with the behavior of model_worker.
+            if request_output.finished:
+                yield (json.dumps(ret | {"finish_reason": None}) + "\0").encode()
             yield (json.dumps(ret) + "\0").encode()
 
     async def generate(self, params):
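
Note on the final hunk: `ret | {"finish_reason": None}` uses the dict-union operator (Python 3.9+) to build a copy of the payload with finish_reason cleared, so a finished request streams two terminal chunks: first the text without a finish_reason, then the same payload with it set. A small illustration of what a downstream reader sees (illustrative values only; the real `ret` also carries usage and logprob fields):

    import json

    # Hypothetical payload standing in for the worker's `ret` dict.
    ret = {"text": "Hello!", "error_code": 0, "finish_reason": "stop"}

    # `ret | {...}` returns a new dict with the override applied;
    # `ret` itself is left untouched.
    chunks = [
        (json.dumps(ret | {"finish_reason": None}) + "\0").encode(),
        (json.dumps(ret) + "\0").encode(),
    ]

    # The OpenAI-compatible server can therefore emit a final delta whose
    # content is empty but whose finish_reason is set, matching model_worker.
    for chunk in chunks:
        print(json.loads(chunk.decode().rstrip("\0"))["finish_reason"])  # None, then "stop"
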
