@@ -112,24 +112,28 @@ def get_cost_latency_info(model_name, cost_data, latency_data):
112112 cost , mean_latency , std_latency , percentile_95_latency = "N/A" , "N/A" , "N/A" , "N/A"
113113 model_config = MODEL_CONFIG_MAPPING [model_name ]
114114
115- if model_config .input_price is None or model_config .output_price is None :
116- # Open source models should not have a cost or latency
117- return "N/A" , "N/A" , "N/A" , "N/A"
118-
119- if (
120- model_config .input_price is not None
121- and len (cost_data ["input_data" ]) > 0
122- and len (cost_data ["output_data" ]) > 0
123- ):
124-
125- mean_input_token = statistics .mean (cost_data ["input_data" ])
126- mean_output_token = statistics .mean (cost_data ["output_data" ])
127- cost = (
128- mean_input_token * model_config .input_price
129- + mean_output_token * model_config .output_price
130- ) / 1000
115+ # For API models, we use the input and output token counts to calculate the cost
116+ if model_config .input_price is not None and model_config .output_price is not None :
117+ if len (cost_data ["input_data" ]) > 0 and len (cost_data ["output_data" ]) > 0 :
118+ total_input_tokens = sum (cost_data ["input_data" ])
119+ total_output_tokens = sum (cost_data ["output_data" ])
120+ # price is in USD per million tokens
121+ cost = (
122+ total_input_tokens * model_config .input_price / 1000000
123+ + total_output_tokens * model_config .output_price / 1000000
124+ )
125+ cost = round (cost , 2 )
126+
127+ # For locally hosted models, we calculate the total GPU cost by summing all latencies and multiplying by the hourly GPU price.
128+ elif len (latency_data ["data" ]) > 0 :
129+ total_latency_seconds = sum (latency_data ["data" ])
130+ total_latency_hours = total_latency_seconds / 3600
131+
132+ # Divide by 100 since we are doing 100x parallel inference; this approximates the actual GPU uptime.
133+ cost = total_latency_hours * H100_X8_PRICE_PER_HOUR / 100
131134 cost = round (cost , 2 )
132135
136+ # Calculate latency statistics for ALL models (both API and local)
133137 if len (latency_data ["data" ]) != 0 :
134138 mean_latency = statistics .mean (latency_data ["data" ])
135139 std_latency = statistics .stdev (latency_data ["data" ])
@@ -208,7 +212,9 @@ def generate_leaderboard_csv(
208212 python_simple_ast_non_live = get_category_score (value , "simple" )
209213 python_multiple_ast_non_live = get_category_score (value , "multiple" )
210214 python_parallel_ast_non_live = get_category_score (value , "parallel" )
211- python_parallel_multiple_ast_non_live = get_category_score (value , "parallel_multiple" )
215+ python_parallel_multiple_ast_non_live = get_category_score (
216+ value , "parallel_multiple"
217+ )
212218 java_simple_ast_non_live = get_category_score (value , "java" )
213219 javascript_simple_ast_non_live = get_category_score (value , "javascript" )
214220 irrelevance_non_live = get_category_score (value , "irrelevance" )
@@ -264,7 +270,9 @@ def generate_leaderboard_csv(
264270 python_simple_ast_live = get_category_score (value , "live_simple" )
265271 python_multiple_ast_live = get_category_score (value , "live_multiple" )
266272 python_parallel_ast_live = get_category_score (value , "live_parallel" )
267- python_parallel_multiple_ast_live = get_category_score (value , "live_parallel_multiple" )
273+ python_parallel_multiple_ast_live = get_category_score (
274+ value , "live_parallel_multiple"
275+ )
268276 irrelevance_live = get_category_score (value , "live_irrelevance" )
269277 relevance_live = get_category_score (value , "live_relevance" )
270278 summary_ast_live = calculate_weighted_accuracy (
0 commit comments