Skip to content

Commit b6e8dfd

Browse files
authored
[BFCL] Omit Reasoning Content from Chat History for Function-Calling Models (#1064)
OpenAI models do not return reasoning content within API responses. However, other models using the OpenAI-compatible interface, such as DeepSeek, include reasoning details in their responses. These reasoning contents are typically not intended for inclusion in subsequent chat turns. This PR addresses this behavior by updating the handler to store any available reasoning content into response_data (primarily for local result logging), while ensuring that reasoning content does not propagate into the chat history. This approach has previously been implemented for prompt-based models. This PR extends that logic to also support function-calling models.
1 parent a925e13 commit b6e8dfd

File tree

5 files changed

+80
-18
lines changed

5 files changed

+80
-18
lines changed

berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/deepseek.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,20 @@ def _pre_query_processing_prompting(self, test_entry: dict) -> dict:
118118

119119
@override
def _parse_query_response_prompting(self, api_response: any) -> dict:
    """Parse a prompting-mode response, keeping reasoning content out of chat history.

    DeepSeek rejects `reasoning_content` echoed back in the next turn, for both
    prompting and function-calling mode. Sending it back yields:
    Error code: 400 - {'error': {'message': 'The reasoning_content is an intermediate result for display purposes only and will not be included in the context for inference. Please remove the reasoning_content from your message to reduce network traffic.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
    """
    parsed = super()._parse_query_response_prompting(api_response)
    # Log the reasoning trace locally without letting it reach the chat history.
    self._add_reasoning_content_if_available_prompting(api_response, parsed)
    return parsed
128+
129+
@override
def _parse_query_response_FC(self, api_response: any) -> dict:
    """Parse a function-calling response, keeping reasoning content out of chat history.

    DeepSeek rejects `reasoning_content` echoed back in the next turn, for both
    prompting and function-calling mode. Sending it back yields:
    Error code: 400 - {'error': {'message': 'The reasoning_content is an intermediate result for display purposes only and will not be included in the context for inference. Please remove the reasoning_content from your message to reduce network traffic.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
    """
    parsed = super()._parse_query_response_FC(api_response)
    # Log the reasoning trace locally and store a stripped assistant message
    # for the chat history (handled by the shared OpenAIHandler helper).
    self._add_reasoning_content_if_available_FC(api_response, parsed)
    return parsed

berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/grok.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ def __init__(self, model_name, temperature) -> None:
1717
@override
def _parse_query_response_prompting(self, api_response: any) -> dict:
    """Parse a prompting-mode Grok response.

    Grok (unlike OpenAI) may include `reasoning_content` in its responses;
    capture it into response_data for local result logging.
    """
    parsed = super()._parse_query_response_prompting(api_response)
    self._add_reasoning_content_if_available_prompting(api_response, parsed)
    return parsed
2222

2323
@override
def _parse_query_response_FC(self, api_response: any) -> dict:
    """Parse a function-calling Grok response.

    Fix: this previously called `_add_reasoning_content_if_available_prompting`,
    which only copies `reasoning_content` into response_data for logging. In
    FC mode the assistant message stored for chat history must also have
    `reasoning_content` stripped (while preserving tool_calls), which is what
    the `_FC` helper does — mirroring DeepSeekAPIHandler._parse_query_response_FC.
    """
    response_data = super()._parse_query_response_FC(api_response)
    self._add_reasoning_content_if_available_FC(api_response, response_data)
    return response_data

berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ling.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,13 @@
1313
from openai import OpenAI, RateLimitError
1414
from overrides import override
1515

16+
1617
class LingAPIHandler(OpenAIHandler):
1718
def __init__(self, model_name, temperature) -> None:
    """Set up the Ling handler on top of the generic OpenAI-compatible handler."""
    super().__init__(model_name, temperature)
    self.model_style = ModelStyle.OpenAI
    # Ling serves an OpenAI-compatible API from a dedicated endpoint;
    # credentials come from the LING_API_KEY environment variable.
    self.client = OpenAI(
        base_url="https://bailingchat.alipay.com",
        api_key=os.getenv("LING_API_KEY"),
    )
2523

2624
@retry_with_backoff(error_type=[RateLimitError, json.JSONDecodeError])
2725
def generate_with_backoff(self, **kwargs):
@@ -41,7 +39,7 @@ def _query_prompting(self, inference_data: dict):
4139
inference_data["inference_input_log"] = {"message": repr(message)}
4240

4341
if "Ling/ling-lite-v1.5" in self.model_name:
44-
api_name="Ling-lite-1.5-250604"
42+
api_name = "Ling-lite-1.5-250604"
4543
else:
4644
raise ValueError(
4745
f"Model name {self.model_name} not yet supported in this method"
@@ -73,5 +71,5 @@ def _pre_query_processing_prompting(self, test_entry: dict) -> dict:
7371
@override
def _parse_query_response_prompting(self, api_response: any) -> dict:
    """Parse a prompting-mode Ling response.

    Ling may include `reasoning_content` in the API response; capture it
    into response_data for local result logging.
    """
    parsed = super()._parse_query_response_prompting(api_response)
    self._add_reasoning_content_if_available_prompting(api_response, parsed)
    return parsed

berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,51 @@ def _add_execution_results_FC(
162162

163163
return inference_data
164164

165+
def _add_reasoning_content_if_available_FC(
166+
self, api_response: any, response_data: dict
167+
) -> None:
168+
"""
169+
OpenAI models don't show reasoning content in the api response,
170+
but many other models that use the OpenAI interface do, such as DeepSeek and Grok.
171+
This method is included here to avoid code duplication.
172+
173+
These models often don't take reasoning content in the chat history for next turn.
174+
Thus, this method saves reasoning content to response_data (for local result file) if present in the response,
175+
but does not include it in the chat history.
176+
"""
177+
# Original assistant message object (contains `reasoning_content` on DeepSeek).
178+
message = api_response.choices[0].message
179+
180+
# Preserve tool_call information but strip the unsupported `reasoning_content` field before inserting into chat history.
181+
if getattr(message, "tool_calls", None):
182+
assistant_message = {
183+
"role": "assistant",
184+
"content": message.content,
185+
"tool_calls": [
186+
{
187+
"id": tool_call.id,
188+
"type": tool_call.type,
189+
"function": {
190+
"name": tool_call.function.name,
191+
"arguments": tool_call.function.arguments,
192+
},
193+
}
194+
for tool_call in message.tool_calls
195+
],
196+
}
197+
response_data["model_responses_message_for_chat_history"] = assistant_message
198+
199+
# If no tool_calls, we still need to strip reasoning_content.
200+
elif hasattr(message, "reasoning_content"):
201+
response_data["model_responses_message_for_chat_history"] = {
202+
"role": "assistant",
203+
"content": message.content,
204+
}
205+
206+
# Capture the reasoning trace so it can be logged to the local result file.
207+
if hasattr(message, "reasoning_content"):
208+
response_data["reasoning_content"] = message.reasoning_content
209+
165210
#### Prompting methods ####
166211

167212
def _query_prompting(self, inference_data: dict):
@@ -233,13 +278,18 @@ def _add_execution_results_prompting(
233278

234279
return inference_data
235280

236-
# Adds reasoning content to response_data if present in the response.
237-
# OpenAI models don't show reasoning content in the api response,
238-
# but many other models that use the OpenAI interface do, such as DeepSeek and Grok.
239-
# So this method is included here to avoid code duplication.
240-
def _add_reasoning_content_if_available(
281+
def _add_reasoning_content_if_available_prompting(
241282
self, api_response: any, response_data: dict
242283
) -> None:
284+
"""
285+
OpenAI models don't show reasoning content in the api response,
286+
but many other models that use the OpenAI interface do, such as DeepSeek and Grok.
287+
This method is included here to avoid code duplication.
288+
289+
These models often don't take reasoning content in the chat history for next turn.
290+
Thus, this method saves reasoning content to response_data (for local result file) if present in the response,
291+
but does not include it in the chat history.
292+
"""
243293
message = api_response.choices[0].message
244294
if hasattr(message, "reasoning_content"):
245295
response_data["reasoning_content"] = message.reasoning_content

berkeley-function-call-leaderboard/pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ name = "bfcl_eval"
77
dynamic = ["version"]
88
description = "Berkeley Function Calling Leaderboard (BFCL)"
99
authors = [
10-
{ name = "Huanzhi Mao", email = "huanzhimao@cs.berkeley.edu" },
11-
{ name = "Shishir Patil", email = "shishirpatil@cs.berkeley.edu" },
10+
{ name = "Huanzhi Mao", email = "huanzhimao@eecs.berkeley.edu" },
11+
{ name = "Shishir Patil", email = "shishirpatil@eecs.berkeley.edu" },
1212
{ name = "Sky Computing Lab", email = "[email protected]" },
1313
]
1414
readme = "README.md"

0 commit comments

Comments
 (0)