Skip to content

Commit bced21a

Browse files
committed
1
1 parent a15f4fc commit bced21a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+2697
-594
lines changed
Lines changed: 62 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -1,113 +1,69 @@
11
VERSION_PREFIX = "BFCL_v3"
22

3-
# These are in the PROMPT_PATH
4-
# Commented out ones are not used in the current version of benchmarking
5-
TEST_FILE_MAPPING = {
6-
# V1 Non-Live Dataset
7-
# "exec_simple": f"{VERSION_PREFIX}_exec_simple.json",
8-
# "exec_parallel": f"{VERSION_PREFIX}_exec_parallel.json",
9-
# "exec_multiple": f"{VERSION_PREFIX}_exec_multiple.json",
10-
# "exec_parallel_multiple": f"{VERSION_PREFIX}_exec_parallel_multiple.json",
11-
"simple": f"{VERSION_PREFIX}_simple.json",
12-
"irrelevance": f"{VERSION_PREFIX}_irrelevance.json",
13-
"parallel": f"{VERSION_PREFIX}_parallel.json",
14-
"multiple": f"{VERSION_PREFIX}_multiple.json",
15-
"parallel_multiple": f"{VERSION_PREFIX}_parallel_multiple.json",
16-
"java": f"{VERSION_PREFIX}_java.json",
17-
"javascript": f"{VERSION_PREFIX}_javascript.json",
18-
# "rest": f"{VERSION_PREFIX}_rest.json",
19-
# "sql": f"{VERSION_PREFIX}_sql.json",
20-
# "chatable": f"{VERSION_PREFIX}_chatable.json",
213

22-
# V2 Live Datasets
23-
"live_simple": f"{VERSION_PREFIX}_live_simple.json",
24-
"live_multiple": f"{VERSION_PREFIX}_live_multiple.json",
25-
"live_parallel": f"{VERSION_PREFIX}_live_parallel.json",
26-
"live_parallel_multiple": f"{VERSION_PREFIX}_live_parallel_multiple.json",
27-
"live_irrelevance": f"{VERSION_PREFIX}_live_irrelevance.json",
28-
"live_relevance": f"{VERSION_PREFIX}_live_relevance.json",
4+
ALL_AVAILABLE_MEMORY_BACKENDS = [
5+
"kv",
6+
"vector",
7+
"rec_sum",
8+
"knowledge_graph",
9+
]
2910

30-
# V3 Multi-turn Datasets
31-
"multi_turn_base": f"{VERSION_PREFIX}_multi_turn_base.json",
32-
"multi_turn_miss_func": f"{VERSION_PREFIX}_multi_turn_miss_func.json",
33-
"multi_turn_miss_param": f"{VERSION_PREFIX}_multi_turn_miss_param.json",
34-
"multi_turn_long_context": f"{VERSION_PREFIX}_multi_turn_long_context.json",
35-
# "multi_turn_composite": f"{VERSION_PREFIX}_multi_turn_composite.json",
36-
}
11+
NON_LIVE_CATEGORY = [
12+
"simple",
13+
"java",
14+
"javascript",
15+
"multiple",
16+
"parallel",
17+
"parallel_multiple",
18+
"irrelevance",
19+
# "exec_simple",
20+
# "exec_parallel",
21+
# "exec_multiple",
22+
# "exec_parallel_multiple",
23+
# "rest",
24+
# "sql",
25+
# "chatable",
26+
]
27+
LIVE_CATEGORY = [
28+
"live_simple",
29+
"live_multiple",
30+
"live_parallel",
31+
"live_parallel_multiple",
32+
"live_irrelevance",
33+
"live_relevance",
34+
]
35+
MULTI_TURN_CATEGORY = [
36+
"multi_turn_base",
37+
"multi_turn_miss_func",
38+
"multi_turn_miss_param",
39+
"multi_turn_long_context",
40+
# "multi_turn_composite",
41+
]
42+
WEB_SEARCH_CATEGORY = [
43+
"web_search",
44+
]
45+
46+
MEMORY_CATEGORY = [f"memory_{backend}" for backend in ALL_AVAILABLE_MEMORY_BACKENDS]
47+
MEMORY_SCENARIO_NAME = [
48+
"student",
49+
"customer",
50+
"finance",
51+
"healthcare",
52+
"notetaker",
53+
]
54+
55+
56+
SINGLE_TURN_CATEGORY = NON_LIVE_CATEGORY + LIVE_CATEGORY
57+
AGENTIC_CATEGORY = MEMORY_CATEGORY + WEB_SEARCH_CATEGORY
58+
59+
ALL_CATEGORIES = SINGLE_TURN_CATEGORY + MULTI_TURN_CATEGORY + AGENTIC_CATEGORY
3760

3861
TEST_COLLECTION_MAPPING = {
39-
"all": [
40-
"simple",
41-
"irrelevance",
42-
"parallel",
43-
"multiple",
44-
"parallel_multiple",
45-
"java",
46-
"javascript",
47-
"live_simple",
48-
"live_multiple",
49-
"live_parallel",
50-
"live_parallel_multiple",
51-
"live_irrelevance",
52-
"live_relevance",
53-
"multi_turn_base",
54-
"multi_turn_miss_func",
55-
"multi_turn_miss_param",
56-
"multi_turn_long_context",
57-
],
58-
"multi_turn": [
59-
"multi_turn_base",
60-
"multi_turn_miss_func",
61-
"multi_turn_miss_param",
62-
"multi_turn_long_context",
63-
],
64-
"single_turn": [
65-
"simple",
66-
"irrelevance",
67-
"parallel",
68-
"multiple",
69-
"parallel_multiple",
70-
"java",
71-
"javascript",
72-
"live_simple",
73-
"live_multiple",
74-
"live_parallel",
75-
"live_parallel_multiple",
76-
"live_irrelevance",
77-
"live_relevance",
78-
],
79-
"live": [
80-
"live_simple",
81-
"live_multiple",
82-
"live_parallel",
83-
"live_parallel_multiple",
84-
"live_irrelevance",
85-
"live_relevance",
86-
],
87-
"non_live": [
88-
"simple",
89-
"irrelevance",
90-
"parallel",
91-
"multiple",
92-
"parallel_multiple",
93-
"java",
94-
"javascript",
95-
],
96-
"ast": [
97-
"simple",
98-
"irrelevance",
99-
"parallel",
100-
"multiple",
101-
"parallel_multiple",
102-
"java",
103-
"javascript",
104-
"live_simple",
105-
"live_multiple",
106-
"live_parallel",
107-
"live_parallel_multiple",
108-
"live_irrelevance",
109-
"live_relevance",
110-
],
62+
"all": ALL_CATEGORIES,
63+
"multi_turn": MULTI_TURN_CATEGORY,
64+
"single_turn": SINGLE_TURN_CATEGORY,
65+
"live": LIVE_CATEGORY,
66+
"non_live": NON_LIVE_CATEGORY,
11167
"non_python": [
11268
"java",
11369
"javascript",
@@ -125,15 +81,6 @@
12581
"live_irrelevance",
12682
"live_relevance",
12783
],
128-
}
129-
130-
MULTI_TURN_FUNC_DOC_FILE_MAPPING = {
131-
"GorillaFileSystem": "gorilla_file_system.json",
132-
"MathAPI": "math_api.json",
133-
"MessageAPI": "message_api.json",
134-
"TwitterAPI": "posting_api.json",
135-
"TicketAPI": "ticket_api.json",
136-
"TradingBot": "trading_bot.json",
137-
"TravelAPI": "travel_booking.json",
138-
"VehicleControlAPI": "vehicle_control.json",
84+
"memory": MEMORY_CATEGORY,
85+
"agentic": AGENTIC_CATEGORY,
13986
}

berkeley-function-call-leaderboard/bfcl/constants/column_headers.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,19 @@
3838
]
3939

4040

41+
COLUMNS_AGENTIC = [
42+
"Rank",
43+
"Model",
44+
"Agentic Overall Acc",
45+
"Web Search",
46+
"Memory Summary",
47+
"Memory KV",
48+
"Memory Vector",
49+
"Memory Recursive Summarization",
50+
"Memory Knowledge Graph",
51+
]
52+
53+
4154
COLUMNS_OVERALL = [
4255
"Rank",
4356
"Overall Acc",
@@ -62,6 +75,9 @@
6275
"Multi Turn Miss Func",
6376
"Multi Turn Miss Param",
6477
"Multi Turn Long Context",
78+
"Agentic Acc",
79+
"Agentic Web Search",
80+
"Agentic Memory Summary",
6581
"Relevance Detection",
6682
"Irrelevance Detection",
6783
"Organization",

berkeley-function-call-leaderboard/bfcl/constants/default_prompts.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,45 @@
1717
"""
1818
)
1919

20-
DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC = "I have updated some more functions you can choose from. What about now?"
20+
DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC = (
21+
"I have updated some more functions you can choose from. What about now?"
22+
)
23+
24+
DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING = (
25+
"{functions}\n" + DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC
26+
)
27+
28+
ADDITIONAL_SYSTEM_PROMPT_FOR_AGENTIC_RESPONSE_FORMAT = """For your final answer to the user, you must respond in this format: {'answer': A short and precise answer to the question, 'context': A brief explanation of how you arrived at this answer or why it is correct}. If you do not know the answer, respond with {'answer': 'I do not know', 'context': 'I do not know'}. If you think the question cannot be properly answered, response with {'answer': 'I cannot answer this question', 'context': A short reason explaining why this question cannot be answered}.
29+
"""
30+
31+
MEMORY_AGENT_SETTINGS = {
32+
"student": "You are an academic-support assistant for college student. Remember key personal and academic details discussed across sessions, and draw on them to answer questions or give guidance.",
33+
"customer": "You are a general customer support assistant for an e-commerce platform. Your task is to understand and remember information that can be used to provide information about user inquiries, preferences, and offer consistent, helpful assistance over multiple interactions.",
34+
"finance": "You are a high-level executive assistant supporting a senior finance professional. Retain and synthesize both personal and professional information including facts, goals, prior decisions, and family life across sessions to provide strategic, context-rich guidance and continuity.",
35+
"healthcare": "You are a healthcare assistant supporting a patient across appointments. Retain essential medical history, treatment plans, and personal preferences to offer coherent, context-aware guidance and reminders.",
36+
"notetaker": "You are a personal organization assistant. Capture key information from conversations, like tasks, deadlines, and preferences, and use it to give reliable reminders and answers in future sessions.",
37+
}
38+
39+
40+
MEMORY_BACKEND_INSTRUCTION_CORE_ARCHIVAL = """{scenario_setting}
41+
42+
You have access to an advanced memory system, consisting of two memory types 'Core Memory' and 'Archival Memory'. Both type of memory is persistent across multiple conversations with the user, and can be accessed in a later interactions. You should actively manage your memory data to keep track of important information, ensure that it is up-to-date and easy to retrieve to provide personalized responses to the user later.
2143
22-
DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING = "{functions}\n" + DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC
44+
The Core memory is limited in size, but always visible to you in context. The Archival Memory has a much larger capacity, but will be held outside of your immediate context due to its size.
45+
46+
Here is the content of your Core Memory from previous interactions:
47+
{memory_content}
48+
"""
49+
50+
MEMORY_BACKEND_INSTRUCTION_UNIFIED_WITHOUT_CONTENT = """{scenario_setting}
51+
52+
You have access to an advanced memory system, which is persistent across multiple conversations with the user, and can be accessed in a later interactions. You should actively manage your memory data to keep track of important information, ensure that it is up-to-date and easy to retrieve to provide personalized responses to the user later.
53+
"""
54+
55+
MEMORY_BACKEND_INSTRUCTION_UNIFIED = (
56+
MEMORY_BACKEND_INSTRUCTION_UNIFIED_WITHOUT_CONTENT
57+
+ """
58+
Here is the content of your memory system from previous interactions:
59+
{memory_content}
60+
"""
61+
)

berkeley-function-call-leaderboard/bfcl/constants/eval_config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
DOTENV_PATH = "./.env"
1515
UTILS_PATH = "./utils/"
1616
TEST_IDS_TO_GENERATE_PATH = "./test_case_ids_to_generate.json"
17-
17+
MEMORY_PREREQ_CONVERSATION_PATH = "./data/memory_prereq_conversation/"
1818

1919

2020
RED_FONT = "\033[91m"
@@ -32,6 +32,7 @@
3232
DOTENV_PATH = (PROJECT_ROOT / DOTENV_PATH).resolve()
3333
UTILS_PATH = (PROJECT_ROOT / UTILS_PATH).resolve()
3434
TEST_IDS_TO_GENERATE_PATH = (PROJECT_ROOT / TEST_IDS_TO_GENERATE_PATH).resolve()
35+
MEMORY_PREREQ_CONVERSATION_PATH = (PROJECT_ROOT / MEMORY_PREREQ_CONVERSATION_PATH).resolve()
3536

3637
RESULT_PATH.mkdir(parents=True, exist_ok=True)
3738
SCORE_PATH.mkdir(parents=True, exist_ok=True)
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Dotted package path that contains one backend module per tool class.
BACKEND_PATH_PREFIX = "bfcl.eval_checker.multi_turn_eval.func_source_code"

# Suffixes of the MemoryAPI_* classes, in the order they appear in the mappings below.
_MEMORY_BACKEND_NAMES = ("kv", "vector", "rec_sum", "knowledge_graph")

# Class name -> stem shared by its function-doc file ("<stem>.json") and its
# backend module ("<BACKEND_PATH_PREFIX>.<stem>"). Insertion order is preserved
# in both derived mappings.
_MODULE_STEM_BY_CLASS = {
    "GorillaFileSystem": "gorilla_file_system",
    "MathAPI": "math_api",
    "MessageAPI": "message_api",
    "TwitterAPI": "posting_api",
    "TicketAPI": "ticket_api",
    "TradingBot": "trading_bot",
    "TravelAPI": "travel_booking",
    "VehicleControlAPI": "vehicle_control",
    # The following classes are not part of the multi-turn categories suite, but they share the same evaluation pipeline for simplicity
    "WebSearchAPI": "web_search",
    **{
        f"MemoryAPI_{backend}": f"memory_{backend}"
        for backend in _MEMORY_BACKEND_NAMES
    },
}

# Class name -> JSON file holding that class's function documentation.
MULTI_TURN_FUNC_DOC_FILE_MAPPING = {
    class_name: f"{stem}.json" for class_name, stem in _MODULE_STEM_BY_CLASS.items()
}

# Class name -> importable module path of that class's backend implementation.
CLASS_FILE_PATH_MAPPING = {
    class_name: f"{BACKEND_PATH_PREFIX}.{stem}"
    for class_name, stem in _MODULE_STEM_BY_CLASS.items()
}

# These classes are stateless and do not require any initial configuration
STATELESS_CLASSES = ["MathAPI", "WebSearchAPI"]

# These classes are stateful, but their state is too verbose to include in the inference log
# Their state will be displayed and stored in separate files
OMIT_STATE_INFO_CLASSES = [
    f"MemoryAPI_{backend}" for backend in _MEMORY_BACKEND_NAMES
]

berkeley-function-call-leaderboard/bfcl/eval_checker/agentic_eval/__init__.py

Whitespace-only changes.
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# from bfcl.eval_checker.ast_eval.ast_checker import standardize_string
2+
import re
3+
4+
#### Main functions ####
5+
6+
7+
def agentic_checker(model_response: str, possible_answer_list: list[str]) -> dict:
    """
    Check whether at least one expected answer appears in the model response.

    Both the response and every expected answer are normalized with
    `standardize_string` before they are compared. An answer only counts as
    found when it is not embedded in a longer word or number, so an expected
    "12" does not match a response containing "312".

    Args:
        model_response: The model's final answer. A list is reduced to its
            first element (an empty list is treated as an empty response), and
            any other non-string value is converted with `str()`.
        possible_answer_list: Acceptable answers; one match is enough.

    Returns:
        `{"valid": True, "error": []}` when an answer is found. Otherwise a
        dict with `"valid": False`, an `"error_message"`, the `"error_type"`
        `"agentic:answer_not_found"`, and a `"details"` dict holding the raw
        and standardized response and answers.
    """
    # Sometimes the model response is a list of one string. An empty list
    # carries no answer, so treat it as an empty response instead of crashing.
    if isinstance(model_response, list):
        model_response = model_response[0] if model_response else ""
    if not isinstance(model_response, str):
        model_response = str(model_response)

    standardized_model_response = standardize_string(model_response)
    standardized_possible_answer_list = [
        standardize_string(possible_answer) for possible_answer in possible_answer_list
    ]

    for possible_answer in standardized_possible_answer_list:
        # An answer that is blank after standardization (e.g. "" or "-") would
        # produce a pattern that matches nearly every response.
        if not possible_answer.strip():
            continue

        # `\b` is only meaningful next to a word character. Next to "$", "%",
        # "+" etc. it would instead require a word character on the *other*
        # side, rejecting e.g. "costs $5" for the answer "$5".
        left_anchor = r"\b" if re.match(r"\w", possible_answer[0]) else ""
        right_anchor = r"\b" if re.match(r"\w", possible_answer[-1]) else ""
        pattern = f"{left_anchor}{re.escape(possible_answer)}{right_anchor}"

        if re.search(pattern, standardized_model_response):
            return {"valid": True, "error": []}

    return {
        "valid": False,
        "error_message": "None of the expected answers were found in the model response.",
        "error_type": "agentic:answer_not_found",
        "details": {
            "model_response": model_response,
            "possible_answers": possible_answer_list,
            "standardized_model_response": standardized_model_response,
            "standardized_possible_answers": standardized_possible_answer_list,
        },
    }
37+
38+
39+
#### Helper functions ####
40+
41+
42+
def standardize_string(input_string: str):
    """
    Normalize a string so answers can be compared leniently.

    Three steps are applied in order:
      1. every occurrence of the punctuation characters ",./-_*^()" is deleted
         (nothing is inserted in its place),
      2. the text is lowercased,
      3. single quotes are converted to double quotes.

    Whitespace is left untouched. For example "April 1, 2024" becomes
    "april 1 2024", while "April 1,2024" becomes "april 12024".
    """
    regex_string = r"[\,\.\/\-\_\*\^\(\)]"
    without_punctuation = re.sub(regex_string, "", input_string)
    lowered = without_punctuation.lower()
    return lowered.replace("'", '"')

0 commit comments

Comments
 (0)