Skip to content

Commit b6e8dfd

Browse files
authored
[BFCL] Omit Reasoning Content from Chat History for Function-Calling Models (#1064)
OpenAI models do not return reasoning content within API responses. However, other models using the OpenAI-compatible interface, such as DeepSeek, include reasoning details in their responses. These reasoning contents are typically not intended for inclusion in subsequent chat turns. This PR addresses this behavior by updating the handler to store any available reasoning content into response_data (primarily for local result logging), while ensuring that reasoning content does not propagate into the chat history. This approach has previously been implemented for prompt-based models. This PR extends that logic to also support function-calling models.
1 parent a925e13 commit b6e8dfd

File tree

5 files changed

+80
-18
lines changed

5 files changed

+80
-18
lines changed

berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/deepseek.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,20 @@ def _pre_query_processing_prompting(self, test_entry: dict) -> dict:
118118

119119
@override
def _parse_query_response_prompting(self, api_response: any) -> dict:
    """Parse a prompting-mode response, keeping reasoning content out of chat history.

    DeepSeek rejects `reasoning_content` echoed back in the next turn, for both
    prompting and function-calling mode. Sending it back yields:
    Error code: 400 - {'error': {'message': 'The reasoning_content is an intermediate result for display purposes only and will not be included in the context for inference. Please remove the reasoning_content from your message to reduce network traffic.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
    """
    parsed = super()._parse_query_response_prompting(api_response)
    # Log the reasoning trace locally without letting it reach the chat history.
    self._add_reasoning_content_if_available_prompting(api_response, parsed)
    return parsed
128+
129+
@override
def _parse_query_response_FC(self, api_response: any) -> dict:
    """Parse a function-calling response, keeping reasoning content out of chat history.

    DeepSeek rejects `reasoning_content` echoed back in the next turn, for both
    prompting and function-calling mode. Sending it back yields:
    Error code: 400 - {'error': {'message': 'The reasoning_content is an intermediate result for display purposes only and will not be included in the context for inference. Please remove the reasoning_content from your message to reduce network traffic.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
    """
    parsed = super()._parse_query_response_FC(api_response)
    # Log the reasoning trace locally and store a stripped assistant message
    # for the chat history (handled by the shared OpenAIHandler helper).
    self._add_reasoning_content_if_available_FC(api_response, parsed)
    return parsed

berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/grok.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ def __init__(self, model_name, temperature) -> None:
1717
@override
def _parse_query_response_prompting(self, api_response: any) -> dict:
    """Parse a prompting-mode Grok response.

    Grok (unlike OpenAI) may include `reasoning_content` in its responses;
    capture it into response_data for local result logging.
    """
    parsed = super()._parse_query_response_prompting(api_response)
    self._add_reasoning_content_if_available_prompting(api_response, parsed)
    return parsed
2222

2323
@override
def _parse_query_response_FC(self, api_response: any) -> dict:
    """Parse a function-calling Grok response.

    Fix: this previously called `_add_reasoning_content_if_available_prompting`,
    which only copies `reasoning_content` into response_data for logging. In
    FC mode the assistant message stored for chat history must also have
    `reasoning_content` stripped (while preserving tool_calls), which is what
    the `_FC` helper does — mirroring DeepSeekAPIHandler._parse_query_response_FC.
    """
    response_data = super()._parse_query_response_FC(api_response)
    self._add_reasoning_content_if_available_FC(api_response, response_data)
    return response_data

berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ling.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,13 @@
1313
from openai import OpenAI, RateLimitError
1414
from overrides import override
1515

16+
1617
class LingAPIHandler(OpenAIHandler):
1718
def __init__(self, model_name, temperature) -> None:
    """Set up the Ling handler on top of the generic OpenAI-compatible handler."""
    super().__init__(model_name, temperature)
    self.model_style = ModelStyle.OpenAI
    # Ling serves an OpenAI-compatible API from a dedicated endpoint;
    # credentials come from the LING_API_KEY environment variable.
    self.client = OpenAI(
        base_url="https://bailingchat.alipay.com",
        api_key=os.getenv("LING_API_KEY"),
    )
2523

2624
@retry_with_backoff(error_type=[RateLimitError, json.JSONDecodeError])
2725
def generate_with_backoff(self, **kwargs):
@@ -41,7 +39,7 @@ def _query_prompting(self, inference_data: dict):
4139
inference_data["inference_input_log"] = {"message": repr(message)}
4240

4341
if "Ling/ling-lite-v1.5" in self.model_name:
44-
api_name="Ling-lite-1.5-250604"
42+
api_name = "Ling-lite-1.5-250604"
4543
else:
4644
raise ValueError(
4745
f"Model name {self.model_name} not yet supported in this method"
@@ -73,5 +71,5 @@ def _pre_query_processing_prompting(self, test_entry: dict) -> dict:
7371
@override
def _parse_query_response_prompting(self, api_response: any) -> dict:
    """Parse a prompting-mode Ling response.

    Ling may include `reasoning_content` in the API response; capture it
    into response_data for local result logging.
    """
    parsed = super()._parse_query_response_prompting(api_response)
    self._add_reasoning_content_if_available_prompting(api_response, parsed)
    return parsed

berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,51 @@ def _add_execution_results_FC(
162162

163163
return inference_data
164164

165+
def _add_reasoning_content_if_available_FC(
166+
self, api_response: any, response_data: dict
167+
) -> None:
168+
"""
169+
OpenAI models don't show reasoning content in the api response,
170+
but many other models that use the OpenAI interface do, such as DeepSeek and Grok.
171+
This method is included here to avoid code duplication.
172+
173+
These models often don't take reasoning content in the chat history for next turn.
174+
Thus, this method saves reasoning content to response_data (for local result file) if present in the response,
175+
but does not include it in the chat history.
176+
"""
177+
# Original assistant message object (contains `reasoning_content` on DeepSeek).
178+
message = api_response.choices[0].message
179+
180+
# Preserve tool_call information but strip the unsupported `reasoning_content` field before inserting into chat history.
181+
if getattr(message, "tool_calls", None):
182+
assistant_message = {
183+
"role": "assistant",
184+
"content": message.content,
185+
"tool_calls": [
186+
{
187+
"id": tool_call.id,
188+
"type": tool_call.type,
189+
"function": {
190+
"name": tool_call.function.name,
191+
"arguments": tool_call.function.arguments,
192+
},
193+
}
194+
for tool_call in message.tool_calls
195+
],
196+
}
197+
response_data["model_responses_message_for_chat_history"] = assistant_message
198+
199+
# If no tool_calls, we still need to strip reasoning_content.
200+
elif hasattr(message, "reasoning_content"):
201+
response_data["model_responses_message_for_chat_history"] = {
202+
"role": "assistant",
203+
"content": message.content,
204+
}
205+
206+
# Capture the reasoning trace so it can be logged to the local result file.
207+
if hasattr(message, "reasoning_content"):
208+
response_data["reasoning_content"] = message.reasoning_content
209+
165210
#### Prompting methods ####
166211

167212
def _query_prompting(self, inference_data: dict):
@@ -233,13 +278,18 @@ def _add_execution_results_prompting(
233278

234279
return inference_data
235280

236-
# Adds reasoning content to response_data if present in the response.
237-
# OpenAI models don't show reasoning content in the api response,
238-
# but many other models that use the OpenAI interface do, such as DeepSeek and Grok.
239-
# So this method is included here to avoid code duplication.
240-
def _add_reasoning_content_if_available(
281+
def _add_reasoning_content_if_available_prompting(
241282
self, api_response: any, response_data: dict
242283
) -> None:
284+
"""
285+
OpenAI models don't show reasoning content in the api response,
286+
but many other models that use the OpenAI interface do, such as DeepSeek and Grok.
287+
This method is included here to avoid code duplication.
288+
289+
These models often don't take reasoning content in the chat history for next turn.
290+
Thus, this method saves reasoning content to response_data (for local result file) if present in the response,
291+
but does not include it in the chat history.
292+
"""
243293
message = api_response.choices[0].message
244294
if hasattr(message, "reasoning_content"):
245295
response_data["reasoning_content"] = message.reasoning_content

berkeley-function-call-leaderboard/pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ name = "bfcl_eval"
77
dynamic = ["version"]
88
description = "Berkeley Function Calling Leaderboard (BFCL)"
99
authors = [
10-
{ name = "Huanzhi Mao", email = "huanzhimao@cs.berkeley.edu" },
11-
{ name = "Shishir Patil", email = "shishirpatil@cs.berkeley.edu" },
10+
{ name = "Huanzhi Mao", email = "huanzhimao@eecs.berkeley.edu" },
11+
{ name = "Shishir Patil", email = "shishirpatil@eecs.berkeley.edu" },
1212
{ name = "Sky Computing Lab", email = "[email protected]" },
1313
]
1414
readme = "README.md"

0 commit comments

Comments
 (0)