
Commit 40b07a5

[BFCL] Patch Gemini Handler (#421)
Building on the work of @vandyxiaowei in #406, this PR fixes a bug in the generation pipeline for the Gemini models ([Gemini-1.5-Pro (FC)](https://deepmind.google/technologies/gemini/#introduction) and [Gemini-1.0-Pro (FC)](https://deepmind.google/technologies/gemini/#introduction)) to make it more robust. The Gemini model output `result["candidates"][0]` does not always contain the key `"content"`, which caused the previous generation pipeline to error out. This PR **DOES NOT** change the leaderboard score.
1 parent 2fc82a9 commit 40b07a5
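
For context, the failure mode is the one named above: `result["candidates"][0]` sometimes has no `"content"` key, so the old parsing loop raised an uncaught `KeyError`. Below is a minimal sketch of the guard this patch adopts, run against a hypothetical response dict (the `finishReason` field is illustrative and not taken from this PR):

```python
import json

# Hypothetical Gemini REST response whose first candidate has no "content" key.
result = {"candidates": [{"finishReason": "SAFETY"}]}

try:
    parts = []
    for part in result["candidates"][0]["content"]["parts"]:
        if "functionCall" in part:
            # Serialize each function call as {name: json-encoded args}.
            parts.append({part["functionCall"]["name"]: json.dumps(part["functionCall"]["args"])})
        else:
            parts.append(part["text"])
    result = parts
except Exception as e:
    # Previously this KeyError propagated and errored out the generation run;
    # the patch records it as a parsing error instead.
    result = f"Parsing error: {e}"

print(result)  # Parsing error: 'content'
```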

3 files changed: +37 lines, -20 lines

berkeley-function-call-leaderboard/README.md

Lines changed: 3 additions & 0 deletions
@@ -231,7 +231,10 @@ For inferencing `Databrick-DBRX-instruct`, you need to create a Databrick Azure


 ## Changelog
+
+* [May 8, 2024] [#406](https://github.com/ShishirPatil/gorilla/pull/406) and [#421](https://github.com/ShishirPatil/gorilla/pull/421): Update the `gemini_handler.py` to better handle parallel function calls for Gemini models.
 * [May 6, 2024] [#412](https://github.com/ShishirPatil/gorilla/pull/412): Bug fix in evaluation dataset for AST categories. This includes updates to both prompts and function docs.
+* [May 2, 2024] [#405](https://github.com/ShishirPatil/gorilla/pull/405): Bug fix in the possible answers for the AST Simple evaluation dataset. Prompt and function docs are not affected.
 * [April 28, 2024] [#397](https://github.com/ShishirPatil/gorilla/pull/397): Add new model `snowflake/arctic` to the leaderboard. Note that there are multiple ways to inference the model, and we choose to do it via Nvidia API catalog.
 * [April 27, 2024] [#390](https://github.com/ShishirPatil/gorilla/pull/390): Bug fix in cost and latency calculation for open-source models, which are now all calculated when serving the model with [vLLM](https://github.com/vllm-project/vllm) using 8 V100 GPUs for consistency. $$\text{Cost} = \text{Latency per 1000 function call} * (\text{8xV100 azure-pay-as-you-go-price per hour / 3600})$$
 * [April 25, 2024] [#386](https://github.com/ShishirPatil/gorilla/pull/386): Add 5 new models to the leaderboard: `meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `gemini-1.5-pro-preview-0409`, `command-r-plus`, `command-r-plus-FC`.

berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py

Lines changed: 4 additions & 2 deletions
@@ -681,9 +681,11 @@ def record_cost_latency(leaderboard_table, model_name, model_output_data):
                 )
                 print("*" * 100)
         if "input_token_count" in data:
-            input_token.append(data["input_token_count"])
+            if data["input_token_count"] != 0:
+                input_token.append(data["input_token_count"])
         if "output_token_count" in data:
-            output_token.append(data["output_token_count"])
+            if data["output_token_count"] != 0:
+                output_token.append(data["output_token_count"])

     leaderboard_table[model_name]["cost"]["input_data"].extend(input_token)
     leaderboard_table[model_name]["cost"]["output_data"].extend(output_token)
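
The `!= 0` checks above pair with the fallback added in `gemini_handler.py` below: a token count of 0 is a sentinel for "`usageMetadata` could not be parsed" and is kept out of the cost averages rather than dragging them down. A minimal sketch of that intent, using a hypothetical `mean_nonzero` helper that is not part of the repository:

```python
def mean_nonzero(token_counts: list[int]) -> float:
    """Average token counts, treating 0 as 'count unavailable' rather than a measurement."""
    valid = [c for c in token_counts if c != 0]
    return sum(valid) / len(valid) if valid else 0.0

# Zeros come from responses whose usageMetadata could not be parsed.
print(mean_nonzero([120, 0, 80, 100]))  # 100.0
```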

berkeley-function-call-leaderboard/model_handler/gemini_handler.py

Lines changed: 30 additions & 18 deletions
@@ -62,26 +62,38 @@ def _query_gemini(self, user_query, functions):
                 "output_tokens": 0,
                 "latency": latency,
             }
-        parts = []
-        for part in result["candidates"][0]["content"]["parts"]:
-            if "functionCall" in part:
-                if (
-                    "name" in part["functionCall"]
-                    and "args" in part["functionCall"]
-                ):
-                    parts.append({part["functionCall"]["name"]: json.dumps(part["functionCall"]["args"])})
+        try:
+            parts = []
+            for part in result["candidates"][0]["content"]["parts"]:
+                if "functionCall" in part:
+                    if (
+                        "name" in part["functionCall"]
+                        and "args" in part["functionCall"]
+                    ):
+                        parts.append({part["functionCall"]["name"]: json.dumps(part["functionCall"]["args"])})
+                    else:
+                        parts.append("Parsing error: " + json.dumps(part["functionCall"]))
                 else:
-                    parts.append("Parsing error: " + json.dumps(part["functionCall"]))
-            else:
-                parts.append(part["text"])
-        result = parts
+                    parts.append(part["text"])
+            result = parts
+        # This try-except is necessary because sometimes `result["candidates"][0]` does not have the key "content"
+        except Exception as e:
+            result = f"Parsing error: {e}"
+
         metatdata = {}
-        metatdata["input_tokens"] = json.loads(response.content)["usageMetadata"][
-            "promptTokenCount"
-        ]
-        metatdata["output_tokens"] = json.loads(response.content)["usageMetadata"][
-            "candidatesTokenCount"
-        ]
+        try:
+            metatdata["input_tokens"] = json.loads(response.content)["usageMetadata"][
+                "promptTokenCount"
+            ]
+        except:
+            metatdata["input_tokens"] = 0  # We special handle the 0 value when aggregating the results. 0 token will be ignored and not be counted in the average.
+        try:
+            metatdata["output_tokens"] = json.loads(response.content)["usageMetadata"][
+                "candidatesTokenCount"
+            ]
+        except:
+            metatdata["output_tokens"] = 0  # We special handle the 0 value when aggregating the results. 0 token will be ignored and not be counted in the average.
+
         metatdata["latency"] = latency
         return result, metatdata
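
A note on the design choice in the `usageMetadata` fallback: the same behaviour could also be expressed with `dict.get()` defaults rather than bare `except:` clauses, which avoids masking unrelated errors. This is only a sketch, under the assumption that `response.content` holds the raw JSON body of the REST call; the sample body below is made up:

```python
import json

# Made-up response body with no usageMetadata field.
raw_body = b'{"candidates": [{"finishReason": "SAFETY"}]}'

usage = json.loads(raw_body).get("usageMetadata", {})
metadata = {
    # 0 is the sentinel the aggregator skips (see eval_runner_helper.py above).
    "input_tokens": usage.get("promptTokenCount", 0),
    "output_tokens": usage.get("candidatesTokenCount", 0),
}
print(metadata)  # {'input_tokens': 0, 'output_tokens': 0}
```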
