
Commit 30994aa

[BFCL] Migrate Gemini Inference to Google AI Studio (#1099)
This PR updates the inference mechanism for Google Gemini models, replacing the use of Google Vertex AI with Google AI Studio. In addition, this PR downgrades `tenacity` from 9.0.0 → 8.5.0 because `google-genai` currently pins `tenacity<9.0`.

---

**Compatibility note on tenacity**

Our code does exercise the retry path affected by [jd/tenacity #425](jd/tenacity#425), but the issue has no functional impact on our evaluation accuracy. Therefore, the temporary downgrade is considered safe. We will revert to tenacity ≥9.0 once python-genai removes the <9.0 pin (tracked in [googleapis/python-genai #1005](googleapis/python-genai#1005)).
1 parent c753d3c commit 30994aa
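At a high level, the handler now calls Google AI Studio through the `google-genai` SDK instead of `vertexai`. A minimal sketch of the new call path, assuming the SDK is installed and `GOOGLE_API_KEY` is exported; the model id here is illustrative only, not taken from this PR:

```python
# Minimal sketch of the Google AI Studio call path adopted by this PR.
# Assumes the google-genai SDK and a GOOGLE_API_KEY in the environment;
# the model id is illustrative.
import os

from google import genai
from google.genai.types import GenerateContentConfig

client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))  # replaces vertexai.init(...)

response = client.models.generate_content(
    model="gemini-2.5-pro",
    contents="Say hello.",
    config=GenerateContentConfig(temperature=0.7),
)
print(response.text)
```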


9 files changed: +92 −84 lines changed

berkeley-function-call-leaderboard/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
  
  All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file.
  
+ - [Jul 6, 2025] [#1099](https://github.com/ShishirPatil/gorilla/pull/1099): Migrate Gemini inference to Google AI Studio.
  - [Jul 2, 2025] [#1090](https://github.com/ShishirPatil/gorilla/pull/1090): Updated OpenAI models to use `developer` role instead of `system` role, following OpenAI's documentation recommendations. This change affects only the OpenAI Responses handler.
  - [Jul 2, 2025] [#1062](https://github.com/ShishirPatil/gorilla/pull/1062): Introduce OpenAI Responses handler, and add support for `o3-2025-04-16` and `o4-mini-2025-04-16`.
  - [Jun 30, 2025] [#956](https://github.com/ShishirPatil/gorilla/pull/956): Fix typo in ground truth for multi_turn_base.

berkeley-function-call-leaderboard/SUPPORTED_MODELS.md

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ For model names containing `{...}`, multiple versions are available. For example
  ## Additional Requirements for Certain Models
  
  - **Gemini Models:**
-   For `Gemini` models, we use the Google Vertex AI endpoint for inference. Ensure you have set the `VERTEX_AI_PROJECT_ID` and `VERTEX_AI_LOCATION` in your `.env` file.
+   For `Gemini` models, we use the Google AI Studio API for inference. Ensure you have set the `GOOGLE_API_KEY` in your `.env` file.
  
  - **Databricks Models:**
    For `databrick-dbrx-instruct`, you must create an Azure Databricks workspace and set up a dedicated inference endpoint. Provide the endpoint URL via `DATABRICKS_AZURE_ENDPOINT_URL` in `.env`.

berkeley-function-call-leaderboard/bfcl_eval/.env.example

Lines changed: 2 additions & 3 deletions
@@ -17,9 +17,8 @@ MINING_API_KEY=sk-XXXXXX
  DMCITO_BASE_URL=XXXXXX
  DMCITO_API_KEY=sk-XXXXXX
  
- # We use Vertex AI to inference Google Gemini models
- VERTEX_AI_PROJECT_ID=
- VERTEX_AI_LOCATION=
+ # We use Google AI Studio to inference Google Gemini models
+ GOOGLE_API_KEY=
  
  AWS_ACCESS_KEY_ID=
  AWS_SECRET_ACCESS_KEY=
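For context, a sketch of how such an entry might be consumed, assuming `python-dotenv`; the way the BFCL codebase actually loads `.env` is not shown in this diff:

```python
# Hypothetical loader sketch (assumes python-dotenv). The handler below only
# reads GOOGLE_API_KEY from the environment; .env loading is not part of this diff.
import os

from dotenv import load_dotenv

load_dotenv()  # copies entries from .env into os.environ
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("GOOGLE_API_KEY environment variable must be set for Gemini models")
```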

berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py

Lines changed: 1 addition & 1 deletion
@@ -205,7 +205,7 @@ def multi_threaded_inference(handler, test_case, include_input_log, exclude_stat
              "❗️❗️ Error occurred during inference. Maximum reties reached for rate limit or other error. Continuing to next test case."
          )
          print(f"❗️❗️ Test case ID: {test_case['id']}, Error: {str(e)}")
-         traceback.print_exc()
+         traceback.print_exc(limit=10)
          print("-" * 100)
  
          return {
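For reference, `limit=10` caps how many stack-trace entries `traceback.print_exc` prints, which keeps batch logs readable when an error surfaces deep inside nested calls. A tiny stdlib-only illustration, not repo code:

```python
# Stdlib-only illustration of traceback.print_exc(limit=...): at most `limit`
# stack-trace entries are printed instead of the full traceback.
import traceback

def recurse(n):
    if n == 0:
        raise ValueError("boom")
    recurse(n - 1)

try:
    recurse(50)
except ValueError:
    traceback.print_exc(limit=10)  # truncated trace, easier to scan in logs
```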

berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/gemini.py

Lines changed: 71 additions & 69 deletions
@@ -1,7 +1,6 @@
  import os
  import time
  
- import vertexai
  from bfcl_eval.constants.type_mappings import GORILLA_TO_OPENAPI
  from bfcl_eval.model_handler.base_handler import BaseHandler
  from bfcl_eval.model_handler.model_style import ModelStyle
@@ -15,38 +14,37 @@
      retry_with_backoff,
      system_prompt_pre_processing_chat_model,
  )
- from google.api_core.exceptions import ResourceExhausted, TooManyRequests
- from vertexai.generative_models import (
+ from google import genai
+ from google.genai import errors as genai_errors
+ from google.genai.types import (
+     AutomaticFunctionCallingConfig,
      Content,
-     FunctionDeclaration,
-     GenerationConfig,
-     GenerativeModel,
+     GenerateContentConfig,
      Part,
+     ThinkingConfig,
      Tool,
  )
  
  
  class GeminiHandler(BaseHandler):
      def __init__(self, model_name, temperature) -> None:
          super().__init__(model_name, temperature)
-         self.model_style = ModelStyle.Google
-         # Initialize Vertex AI
-         vertexai.init(
-             project=os.getenv("VERTEX_AI_PROJECT_ID"),
-             location=os.getenv("VERTEX_AI_LOCATION"),
-         )
-         self.client = GenerativeModel(self.model_name.replace("-FC", ""))
+         self.model_style = ModelStyle.GOOGLE
+         api_key = os.getenv("GOOGLE_API_KEY")
+         if not api_key:
+             raise ValueError(
+                 "GOOGLE_API_KEY environment variable must be set for Gemini models"
+             )
+         self.client = genai.Client(api_key=api_key)
  
      @staticmethod
      def _substitute_prompt_role(prompts: list[dict]) -> list[dict]:
-         # Allowed roles: user, model, function
+         # Allowed roles: user, model
          for prompt in prompts:
              if prompt["role"] == "user":
                  prompt["role"] = "user"
              elif prompt["role"] == "assistant":
                  prompt["role"] = "model"
-             elif prompt["role"] == "tool":
-                 prompt["role"] = "function"
  
          return prompts

@@ -72,58 +70,41 @@ def decode_execute(self, result):
          )
          return func_call_list
  
-     @retry_with_backoff(error_type=[ResourceExhausted, TooManyRequests])
-     def generate_with_backoff(self, client, **kwargs):
+     # We can't retry on ClientError because it's too broad.
+     # Both rate limit and invalid function description will trigger google.genai.errors.ClientError
+     @retry_with_backoff(error_message_pattern=r".*RESOURCE_EXHAUSTED.*")
+     def generate_with_backoff(self, **kwargs):
          start_time = time.time()
-         api_response = client.generate_content(**kwargs)
+         api_response = self.client.models.generate_content(**kwargs)
          end_time = time.time()
  
          return api_response, end_time - start_time
  
      #### FC methods ####
  
      def _query_FC(self, inference_data: dict):
-         # Gemini models needs to first conver the function doc to FunctionDeclaration and Tools objects.
-         # We do it here to avoid json serialization issues.
-         func_declarations = []
-         for function in inference_data["tools"]:
-             func_declarations.append(
-                 FunctionDeclaration(
-                     name=function["name"],
-                     description=function["description"],
-                     parameters=function["parameters"],
-                 )
-             )
-
-         if func_declarations:
-             tools = [Tool(function_declarations=func_declarations)]
-         else:
-             tools = None
-
          inference_data["inference_input_log"] = {
              "message": repr(inference_data["message"]),
              "tools": inference_data["tools"],
              "system_prompt": inference_data.get("system_prompt", None),
          }
  
-         # messages are already converted to Content object
+         config = GenerateContentConfig(
+             temperature=self.temperature,
+             automatic_function_calling=AutomaticFunctionCallingConfig(disable=True),
+             thinking_config=ThinkingConfig(include_thoughts=True),
+         )
+
          if "system_prompt" in inference_data:
-             # We re-instantiate the GenerativeModel object with the system prompt
-             # We cannot reassign the self.client object as it will affect other entries
-             client = GenerativeModel(
-                 self.model_name.replace("-FC", ""),
-                 system_instruction=inference_data["system_prompt"],
-             )
-         else:
-             client = self.client
+             config.system_instruction = inference_data["system_prompt"]
+
+         if len(inference_data["tools"]) > 0:
+             config.tools = [Tool(function_declarations=inference_data["tools"])]
  
          return self.generate_with_backoff(
-             client=client,
+             model=self.model_name.replace("-FC", ""),
              contents=inference_data["message"],
-             generation_config=GenerationConfig(
-                 temperature=self.temperature,
-             ),
-             tools=tools,
+             config=config,
          )
  
      def _pre_query_processing_FC(self, inference_data: dict, test_entry: dict) -> dict:
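The retry decorator now keys on the error message rather than the exception type because `google-genai` raises the broad `genai_errors.ClientError` for both rate limits and malformed function docs; only `RESOURCE_EXHAUSTED` responses should be retried. A hypothetical sketch of what a message-pattern retry could look like; the repo's actual `retry_with_backoff` helper is imported above and may differ:

```python
# Hypothetical message-pattern retry with exponential backoff; not the repo's
# actual retry_with_backoff implementation.
import functools
import random
import re
import time

def retry_on_message(pattern: str, max_attempts: int = 5, base_delay: float = 1.0):
    regex = re.compile(pattern)

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_attempts):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    # Retry only when the message matches (e.g. RESOURCE_EXHAUSTED);
                    # other ClientErrors (bad function schema, etc.) surface immediately.
                    if attempt == max_attempts - 1 or not regex.search(str(e)):
                        raise
                    time.sleep(base_delay * (2 ** attempt) + random.random())
        return wrapper

    return decorator
```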
@@ -155,9 +136,12 @@ def _parse_query_response_FC(self, api_response: any) -> dict:
          tool_call_func_names = []
          fc_parts = []
          text_parts = []
+         reasoning_content = []
  
          if (
              len(api_response.candidates) > 0
+             and api_response.candidates[0].content
+             and api_response.candidates[0].content.parts
              and len(api_response.candidates[0].content.parts) > 0
          ):
              response_function_call_content = api_response.candidates[0].content
@@ -172,13 +156,17 @@ def _parse_query_response_FC(self, api_response: any) -> dict:
  
                  fc_parts.append({part_func_name: part_func_args_dict})
                  tool_call_func_names.append(part_func_name)
+             # Aggregate reasoning content
+             elif part.thought:
+                 reasoning_content.append(part.text)
              else:
                  text_parts.append(part.text)
+
          else:
              response_function_call_content = Content(
                  role="model",
                  parts=[
-                     Part.from_text("The model did not return any response."),
+                     Part(text="The model did not return any response."),
                  ],
              )
@@ -188,6 +176,7 @@ def _parse_query_response_FC(self, api_response: any) -> dict:
              "model_responses": model_responses,
              "model_responses_message_for_chat_history": response_function_call_content,
              "tool_call_func_names": tool_call_func_names,
+             "reasoning_content": "\n".join(reasoning_content),
              "input_token": api_response.usage_metadata.prompt_token_count,
              "output_token": api_response.usage_metadata.candidates_token_count,
          }
@@ -200,7 +189,7 @@ def add_first_turn_message_FC(
                  Content(
                      role=message["role"],
                      parts=[
-                         Part.from_text(message["content"]),
+                         Part(text=message["content"]),
                      ],
                  )
              )
@@ -235,12 +224,12 @@ def _add_execution_results_FC(
                  Part.from_function_response(
                      name=tool_call_func_name,
                      response={
-                         "content": execution_result,
+                         "result": execution_result,
                      },
                  )
              )
  
-         tool_response_content = Content(parts=tool_response_parts)
+         tool_response_content = Content(role="user", parts=tool_response_parts)
          inference_data["message"].append(tool_response_content)
  
          return inference_data
@@ -253,20 +242,18 @@ def _query_prompting(self, inference_data: dict):
              "system_prompt": inference_data.get("system_prompt", None),
          }
  
-         # messages are already converted to Content object
+         config = GenerateContentConfig(
+             temperature=self.temperature,
+             thinking_config=ThinkingConfig(include_thoughts=True),
+         )
+
          if "system_prompt" in inference_data:
-             client = GenerativeModel(
-                 self.model_name.replace("-FC", ""),
-                 system_instruction=inference_data["system_prompt"],
-             )
-         else:
-             client = self.client
+             config.system_instruction = inference_data["system_prompt"]
+
          api_response = self.generate_with_backoff(
-             client=client,
+             model=self.model_name.replace("-FC", ""),
              contents=inference_data["message"],
-             generation_config=GenerationConfig(
-                 temperature=self.temperature,
-             ),
+             config=config,
          )
          return api_response

@@ -295,13 +282,28 @@ def _pre_query_processing_prompting(self, test_entry: dict) -> dict:
      def _parse_query_response_prompting(self, api_response: any) -> dict:
          if (
              len(api_response.candidates) > 0
+             and api_response.candidates[0].content
+             and api_response.candidates[0].content.parts
              and len(api_response.candidates[0].content.parts) > 0
          ):
-             model_responses = api_response.text
+             assert (
+                 len(api_response.candidates[0].content.parts) == 2
+             ), api_response.candidates[0].content.parts
+
+             model_responses = ""
+             reasoning_content = ""
+             for part in api_response.candidates[0].content.parts:
+                 if part.thought:
+                     reasoning_content = part.text
+                 else:
+                     model_responses = part.text
+
          else:
              model_responses = "The model did not return any response."
+
          return {
              "model_responses": model_responses,
+             "reasoning_content": reasoning_content,
              "input_token": api_response.usage_metadata.prompt_token_count,
              "output_token": api_response.usage_metadata.candidates_token_count,
          }
@@ -314,7 +316,7 @@ def add_first_turn_message_prompting(
                  Content(
                      role=message["role"],
                      parts=[
-                         Part.from_text(message["content"]),
+                         Part(text=message["content"]),
                      ],
                  )
              )
@@ -332,7 +334,7 @@ def _add_assistant_message_prompting(
              Content(
                  role="model",
                  parts=[
-                     Part.from_text(model_response_data["model_responses"]),
+                     Part(text=model_response_data["model_responses"]),
                  ],
              )
          )
@@ -347,7 +349,7 @@ def _add_execution_results_prompting(
          tool_message = Content(
              role="user",
              parts=[
-                 Part.from_text(formatted_results_message),
+                 Part(text=formatted_results_message),
              ],
          )
          inference_data["message"].append(tool_message)
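Putting the handler changes together, a minimal sketch of the function-calling round trip under the new SDK; the model id and the weather tool are illustrative only and are not taken from this PR:

```python
# Illustrative function-calling round trip with google-genai, mirroring the
# handler's pattern above. Model id and tool schema are hypothetical.
import os

from google import genai
from google.genai.types import (
    AutomaticFunctionCallingConfig,
    Content,
    GenerateContentConfig,
    Part,
    Tool,
)

client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

weather_tool = {
    "name": "get_weather",
    "description": "Get the current weather for a city.",
    "parameters": {
        "type": "OBJECT",
        "properties": {"city": {"type": "STRING"}},
        "required": ["city"],
    },
}

config = GenerateContentConfig(
    temperature=0.001,
    automatic_function_calling=AutomaticFunctionCallingConfig(disable=True),
    tools=[Tool(function_declarations=[weather_tool])],
)

messages = [Content(role="user", parts=[Part(text="What's the weather in Berkeley?")])]
response = client.models.generate_content(
    model="gemini-2.5-flash", contents=messages, config=config
)

# Function-call parts carry the tool name and arguments chosen by the model.
for part in response.candidates[0].content.parts:
    if part.function_call:
        print(part.function_call.name, dict(part.function_call.args))

# The tool's output is then appended as a user-role Content with a
# function_response Part, as in _add_execution_results_FC above.
messages.append(response.candidates[0].content)
messages.append(
    Content(
        role="user",
        parts=[Part.from_function_response(name="get_weather", response={"result": "72F, sunny"})],
    )
)
```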

berkeley-function-call-leaderboard/bfcl_eval/model_handler/local_inference/base_oss_handler.py

Lines changed: 1 addition & 1 deletion
@@ -285,7 +285,7 @@ def _multi_threaded_inference(
              "❗️❗️ Error occurred during inference. Maximum reties reached for rate limit or other error. Continuing to next test case."
          )
          print(f"❗️❗️ Test case ID: {test_case['id']}, Error: {str(e)}")
-         traceback.print_exc()
+         traceback.print_exc(limit=10)
          print("-" * 100)
  
          model_responses = f"Error during inference: {str(e)}"

berkeley-function-call-leaderboard/bfcl_eval/model_handler/model_style.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ class ModelStyle(Enum):
      OpenAI_Responses = "openai-responses"
      Anthropic = "claude"
      Mistral = "mistral"
-     Google = "google"
+     GOOGLE = "google"
      AMAZON = "amazon"
      FIREWORK_AI = "firework_ai"
      NEXUS = "nexus"
