
Commit 65a1c94

Gnav3852 and HuanzhiMao authored
[BFCL] Reintroduce latency stats for local models, update cost calculation (#1098)
Resolves #1035

- Re-introduce latency statistics for locally hosted models
- Update cost calculation to cover the entire dataset batch, instead of the average cost per 1k function calls
- For locally hosted models, total cost is computed by summing all latencies and multiplying by the hourly GPU price

Co-authored-by: Huanzhi Mao <[email protected]>
1 parent a59b66c commit 65a1c94

File tree

4 files changed (+34, -19 lines)

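As a rough illustration of the batch-level cost described in the commit message (the figures here are hypothetical, not taken from the leaderboard): an API model priced at $3 per million input tokens and $15 per million output tokens that consumes 2,000,000 input tokens and 400,000 output tokens over the full run is now reported as 2.0 × 3 + 0.4 × 15 = $12.00 total, rather than as an average cost per 1k function calls.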

berkeley-function-call-leaderboard/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,9 @@
 
 All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file.
 
+- [Jul 8, 2025] [#1098](https://github.com/ShishirPatil/gorilla/pull/1098):
+  - Re-introduce latency statistics for locally hosted models
+  - Update cost calculation to cover the entire dataset batch, instead of the average cost per 1k function calls
 - [Jul 6, 2025] [#1100](https://github.com/ShishirPatil/gorilla/pull/1100): Add the following new models to the leaderboard:
   - `gemini-2.5-pro-FC`
   - `gemini-2.5-pro`

berkeley-function-call-leaderboard/bfcl_eval/constants/column_headers.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
     "Overall Acc",
     "Model",
     "Model Link",
-    "Cost ($ Per 1k Function Calls)",
+    "Total Cost ($)",
     "Latency Mean (s)",
     "Latency Standard Deviation (s)",
     "Latency 95th Percentile (s)",

berkeley-function-call-leaderboard/bfcl_eval/constants/eval_config.py

Lines changed: 4 additions & 0 deletions
@@ -3,6 +3,10 @@
 
 VLLM_PORT = 1053
 
+# Price got from Lambda Cloud, 23.92 per hour for 8x H100, on-demand pay as you go total price
+# Reference: https://lambda.ai/pricing
+H100_X8_PRICE_PER_HOUR = 23.92
+
 # Directory of the installed package
 PACKAGE_ROOT = Path(__file__).resolve().parents[1]
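As a hypothetical example of how H100_X8_PRICE_PER_HOUR enters the local-model cost (see the eval_runner_helper.py diff below): if the per-request latencies sum to 7,200 seconds, that is 2 hours of wall time; at $23.92 per hour with the ÷100 adjustment for 100x parallel inference, the reported total cost is 2 × 23.92 / 100 ≈ $0.48.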

berkeley-function-call-leaderboard/bfcl_eval/eval_checker/eval_runner_helper.py

Lines changed: 26 additions & 18 deletions
@@ -112,24 +112,28 @@ def get_cost_latency_info(model_name, cost_data, latency_data):
     cost, mean_latency, std_latency, percentile_95_latency = "N/A", "N/A", "N/A", "N/A"
     model_config = MODEL_CONFIG_MAPPING[model_name]
 
-    if model_config.input_price is None or model_config.output_price is None:
-        # Open source models should not have a cost or latency
-        return "N/A", "N/A", "N/A", "N/A"
-
-    if (
-        model_config.input_price is not None
-        and len(cost_data["input_data"]) > 0
-        and len(cost_data["output_data"]) > 0
-    ):
-
-        mean_input_token = statistics.mean(cost_data["input_data"])
-        mean_output_token = statistics.mean(cost_data["output_data"])
-        cost = (
-            mean_input_token * model_config.input_price
-            + mean_output_token * model_config.output_price
-        ) / 1000
+    # For API models, we use the input and output token counts to calculate the cost
+    if model_config.input_price is not None and model_config.output_price is not None:
+        if len(cost_data["input_data"]) > 0 and len(cost_data["output_data"]) > 0:
+            total_input_tokens = sum(cost_data["input_data"])
+            total_output_tokens = sum(cost_data["output_data"])
+            # price is in USD per million tokens
+            cost = (
+                total_input_tokens * model_config.input_price / 1000000
+                + total_output_tokens * model_config.output_price / 1000000
+            )
+            cost = round(cost, 2)
+
+    # For local-hosted models, we calculate the total GPU cost by summing all latencies and multiplying by the hourly GPU price.
+    elif len(latency_data["data"]) > 0:
+        total_latency_seconds = sum(latency_data["data"])
+        total_latency_hours = total_latency_seconds / 3600
+
+        # Divide by 100 since we are doing 100x parallel inference; this is an approximation to the GPU up-time.
+        cost = total_latency_hours * H100_X8_PRICE_PER_HOUR / 100
         cost = round(cost, 2)
 
+    # Calculate latency statistics for ALL models (both API and local)
     if len(latency_data["data"]) != 0:
         mean_latency = statistics.mean(latency_data["data"])
         std_latency = statistics.stdev(latency_data["data"])
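For reference, here is a minimal, self-contained sketch of the updated cost/latency logic that can be run against toy data. The helper name estimate_cost_latency, the simplified price arguments, the rounding of the latency statistics, and the statistics.quantiles call for the 95th percentile are assumptions made for this sketch; the dictionary fields (input_data, output_data, data) and H100_X8_PRICE_PER_HOUR mirror the diff above.

import statistics

# Mirrors eval_config.py above: 8x H100 on-demand price per hour, in USD.
H100_X8_PRICE_PER_HOUR = 23.92

def estimate_cost_latency(input_price, output_price, cost_data, latency_data):
    # Simplified stand-in for get_cost_latency_info: prices are passed directly
    # instead of being read from MODEL_CONFIG_MAPPING.
    cost = mean_latency = std_latency = p95_latency = "N/A"

    if input_price is not None and output_price is not None:
        # API models: total cost over the whole batch; prices are USD per million tokens.
        if cost_data["input_data"] and cost_data["output_data"]:
            cost = round(
                sum(cost_data["input_data"]) * input_price / 1_000_000
                + sum(cost_data["output_data"]) * output_price / 1_000_000,
                2,
            )
    elif latency_data["data"]:
        # Locally hosted models: summed latency (in hours) times the hourly GPU price,
        # divided by 100 to approximate 100x parallel inference.
        total_hours = sum(latency_data["data"]) / 3600
        cost = round(total_hours * H100_X8_PRICE_PER_HOUR / 100, 2)

    if latency_data["data"]:
        # Latency statistics are reported for all models (API and local).
        data = latency_data["data"]
        mean_latency = round(statistics.mean(data), 2)
        std_latency = round(statistics.stdev(data), 2)
        p95_latency = round(statistics.quantiles(data, n=100)[94], 2)

    return cost, mean_latency, std_latency, p95_latency

# Toy usage: a locally hosted model (no per-token prices) with 1,800 requests
# whose latencies sum to 54,000 seconds.
latencies = [20.0, 30.0, 40.0] * 600
print(estimate_cost_latency(None, None,
                            {"input_data": [], "output_data": []},
                            {"data": latencies}))
# Prints approximately: (3.59, 30.0, 8.17, 40.0)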
@@ -208,7 +212,9 @@ def generate_leaderboard_csv(
         python_simple_ast_non_live = get_category_score(value, "simple")
         python_multiple_ast_non_live = get_category_score(value, "multiple")
         python_parallel_ast_non_live = get_category_score(value, "parallel")
-        python_parallel_multiple_ast_non_live = get_category_score(value, "parallel_multiple")
+        python_parallel_multiple_ast_non_live = get_category_score(
+            value, "parallel_multiple"
+        )
         java_simple_ast_non_live = get_category_score(value, "java")
         javascript_simple_ast_non_live = get_category_score(value, "javascript")
         irrelevance_non_live = get_category_score(value, "irrelevance")
@@ -264,7 +270,9 @@ def generate_leaderboard_csv(
         python_simple_ast_live = get_category_score(value, "live_simple")
         python_multiple_ast_live = get_category_score(value, "live_multiple")
         python_parallel_ast_live = get_category_score(value, "live_parallel")
-        python_parallel_multiple_ast_live = get_category_score(value, "live_parallel_multiple")
+        python_parallel_multiple_ast_live = get_category_score(
+            value, "live_parallel_multiple"
+        )
         irrelevance_live = get_category_score(value, "live_irrelevance")
         relevance_live = get_category_score(value, "live_relevance")
         summary_ast_live = calculate_weighted_accuracy(
