Skip to content

Commit 4351a39

Browse files
authored
[Fix] Fix reasoning content (#797)
* fix reasoning

* fix reasoning
1 parent 6f0616c commit 4351a39

File tree

5 files changed

+17
-13
lines changed

5 files changed

+17
-13
lines changed

evalscope/benchmarks/bfcl/generation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def generate_turn(model: Model, row: dict[str, Any]):
7878
if isinstance(message, str):
7979
result = message
8080
else:
81-
result = message.content
81+
result = message.text
8282

8383
logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
8484
current_responses.append(result)
@@ -186,7 +186,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
186186
logger.error(f'Error converting tool calls to function call strings: {e}')
187187
tool_call_strs = None
188188
else:
189-
model_responses = [message.content]
189+
model_responses = [message.text]
190190
tool_call_strs = None
191191

192192
current_responses.extend(model_responses)

evalscope/benchmarks/tau_bench/generation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def patched_solve(
4545
input=[dict_to_chat_message(msg) for msg in messages],
4646
tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
4747
)
48-
oai_res = openai_chat_choices(res.choices)
48+
oai_res = openai_chat_choices(res.choices, include_reasoning=False)
4949

5050
next_message = oai_res[0].message.model_dump(exclude_none=True)
5151

evalscope/benchmarks/tau_bench/tau_bench_adapter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,10 +79,10 @@ def new_generate_next_message(self, messages):
7979

8080
res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])
8181

82-
message = res.message.model_dump(exclude_none=True)
82+
message = {'role': 'assistant', 'content': res.completion}
8383
self.messages.append(message)
8484
self.total_cost = 0
85-
return message['content']
85+
return res.completion
8686

8787
# get the current instance of TauBenchAdapter
8888
adapter_instance = self

evalscope/models/utils/openai.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
209209
return params
210210

211211

212-
def openai_assistant_content(message: ChatMessageAssistant) -> str:
212+
def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
213213
# In agent bridge scenarios, we could encounter concepts such as reasoning and
214214
# .internal use in the ChatMessageAssistant that are not supported by the OpenAI
215215
# choices API. This code smuggles that data into the plain text so that it
@@ -220,7 +220,7 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
220220
else:
221221
content = ''
222222
for c in message.content:
223-
if c.type == 'reasoning':
223+
if c.type == 'reasoning' and include_reasoning:
224224
attribs = ''
225225
if c.signature is not None:
226226
attribs = f'{attribs} signature="{c.signature}"'
@@ -239,11 +239,14 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
239239
return content
240240

241241

242-
def openai_chat_choices(choices: List[ChatCompletionChoice]) -> List[Choice]:
242+
def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
243243
oai_choices: List[Choice] = []
244244

245245
for index, choice in enumerate(choices):
246-
content = openai_assistant_content(choice.message)
246+
# Handle content
247+
content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
248+
249+
# Handle tool calls
247250
if choice.message.tool_calls:
248251
tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
249252
else:

tests/benchmark/test_eval.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -364,21 +364,22 @@ def test_bfcl(self):
364364
'underscore_to_dot': True
365365
}
366366
}
367-
self._run_dataset_test('bfcl_v3', dataset_args)
367+
self._run_dataset_test('bfcl_v3', dataset_args, model='qwq-plus', stream=True)
368368

369369
def test_tau_bench(self):
370370
dataset_args = {
371371
'extra_params': {
372-
'user_model': 'qwen-plus',
372+
'user_model': 'qwq-plus',
373373
'api_key': env.get('DASHSCOPE_API_KEY'),
374374
'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
375375
'generation_config': {
376376
'temperature': 0.7,
377-
'max_tokens': 12000
377+
'max_tokens': 12000,
378+
'stream': True
378379
}
379380
}
380381
}
381-
self._run_dataset_test('tau_bench', dataset_args, limit=1)
382+
self._run_dataset_test('tau_bench', dataset_args, limit=1, model='qwq-plus', stream=True)
382383

383384
if __name__ == '__main__':
384385
# Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k

0 commit comments

Comments (0)