Skip to content

Commit 4351a39

Browse files
authored
[Fix] Fix reasoning content (#797)
* fix reasoning

* fix reasoning
1 parent 6f0616c commit 4351a39

File tree

5 files changed

+17
-13
lines changed

5 files changed

+17
-13
lines changed

evalscope/benchmarks/bfcl/generation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def generate_turn(model: Model, row: dict[str, Any]):
7878
if isinstance(message, str):
7979
result = message
8080
else:
81-
result = message.content
81+
result = message.text
8282

8383
logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
8484
current_responses.append(result)
@@ -186,7 +186,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
186186
logger.error(f'Error converting tool calls to function call strings: {e}')
187187
tool_call_strs = None
188188
else:
189-
model_responses = [message.content]
189+
model_responses = [message.text]
190190
tool_call_strs = None
191191

192192
current_responses.extend(model_responses)

evalscope/benchmarks/tau_bench/generation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def patched_solve(
4545
input=[dict_to_chat_message(msg) for msg in messages],
4646
tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
4747
)
48-
oai_res = openai_chat_choices(res.choices)
48+
oai_res = openai_chat_choices(res.choices, include_reasoning=False)
4949

5050
next_message = oai_res[0].message.model_dump(exclude_none=True)
5151

evalscope/benchmarks/tau_bench/tau_bench_adapter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,10 +79,10 @@ def new_generate_next_message(self, messages):
7979

8080
res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])
8181

82-
message = res.message.model_dump(exclude_none=True)
82+
message = {'role': 'assistant', 'content': res.completion}
8383
self.messages.append(message)
8484
self.total_cost = 0
85-
return message['content']
85+
return res.completion
8686

8787
# get the current instance of TauBenchAdapter
8888
adapter_instance = self

evalscope/models/utils/openai.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
209209
return params
210210

211211

212-
def openai_assistant_content(message: ChatMessageAssistant) -> str:
212+
def openai_assistant_content(message: ChatMessageAssistant, include_reasoning=True) -> str:
213213
# In agent bridge scenarios, we could encounter concepts such as reasoning and
214214
# .internal use in the ChatMessageAssistant that are not supported by the OpenAI
215215
# choices API. This code smuggles that data into the plain text so that it
@@ -220,7 +220,7 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
220220
else:
221221
content = ''
222222
for c in message.content:
223-
if c.type == 'reasoning':
223+
if c.type == 'reasoning' and include_reasoning:
224224
attribs = ''
225225
if c.signature is not None:
226226
attribs = f'{attribs} signature="{c.signature}"'
@@ -239,11 +239,14 @@ def openai_assistant_content(message: ChatMessageAssistant) -> str:
239239
return content
240240

241241

242-
def openai_chat_choices(choices: List[ChatCompletionChoice]) -> List[Choice]:
242+
def openai_chat_choices(choices: List[ChatCompletionChoice], include_reasoning: bool = True) -> List[Choice]:
243243
oai_choices: List[Choice] = []
244244

245245
for index, choice in enumerate(choices):
246-
content = openai_assistant_content(choice.message)
246+
# Handle content
247+
content = openai_assistant_content(choice.message, include_reasoning=include_reasoning)
248+
249+
# Handle tool calls
247250
if choice.message.tool_calls:
248251
tool_calls = [openai_chat_tool_call(tc) for tc in choice.message.tool_calls]
249252
else:

tests/benchmark/test_eval.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -364,21 +364,22 @@ def test_bfcl(self):
364364
'underscore_to_dot': True
365365
}
366366
}
367-
self._run_dataset_test('bfcl_v3', dataset_args)
367+
self._run_dataset_test('bfcl_v3', dataset_args, model='qwq-plus', stream=True)
368368

369369
def test_tau_bench(self):
370370
dataset_args = {
371371
'extra_params': {
372-
'user_model': 'qwen-plus',
372+
'user_model': 'qwq-plus',
373373
'api_key': env.get('DASHSCOPE_API_KEY'),
374374
'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
375375
'generation_config': {
376376
'temperature': 0.7,
377-
'max_tokens': 12000
377+
'max_tokens': 12000,
378+
'stream': True
378379
}
379380
}
380381
}
381-
self._run_dataset_test('tau_bench', dataset_args, limit=1)
382+
self._run_dataset_test('tau_bench', dataset_args, limit=1, model='qwq-plus', stream=True)
382383

383384
if __name__ == '__main__':
384385
# Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k

0 commit comments

Comments (0)