HuanzhiMao
diff --git a/berkeley-function-call-leaderboard/bfcl/constants/category_mapping.py b/berkeley-function-call-leaderboard/bfcl/constants/category_mapping.py
Lines changed: 62 additions & 115 deletions
diff --git a/berkeley-function-call-leaderboard/bfcl/constants/column_headers.py b/berkeley-function-call-leaderboard/bfcl/constants/column_headers.py
Lines changed: 16 additions & 0 deletions
diff --git a/berkeley-function-call-leaderboard/bfcl/constants/default_prompts.py b/berkeley-function-call-leaderboard/bfcl/constants/default_prompts.py
Lines changed: 41 additions & 2 deletions
diff --git a/berkeley-function-call-leaderboard/bfcl/constants/eval_config.py b/berkeley-function-call-leaderboard/bfcl/constants/eval_config.py
Lines changed: 2 additions & 1 deletion
diff --git a/berkeley-function-call-leaderboard/bfcl/constants/executable_backend_config.py b/berkeley-function-call-leaderboard/bfcl/constants/executable_backend_config.py
Lines changed: 49 additions & 0 deletions
diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/agentic_eval/__init__.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/agentic_eval/__init__.py
diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/agentic_eval/agentic_checker.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/agentic_eval/agentic_checker.py
Lines changed: 50 additions & 0 deletions
@@ -1,113 +1,69 @@
 VERSION_PREFIX = "BFCL_v3"
 
-# These are in the PROMPT_PATH
-# Commented out ones are not used in the current version of benchmarking
-TEST_FILE_MAPPING = {
-    # V1 Non-Live Dataset
-    # "exec_simple": f"{VERSION_PREFIX}_exec_simple.json",
-    # "exec_parallel": f"{VERSION_PREFIX}_exec_parallel.json",
-    # "exec_multiple": f"{VERSION_PREFIX}_exec_multiple.json",
-    # "exec_parallel_multiple": f"{VERSION_PREFIX}_exec_parallel_multiple.json",
-    "simple": f"{VERSION_PREFIX}_simple.json",
-    "irrelevance": f"{VERSION_PREFIX}_irrelevance.json",
-    "parallel": f"{VERSION_PREFIX}_parallel.json",
-    "multiple": f"{VERSION_PREFIX}_multiple.json",
-    "parallel_multiple": f"{VERSION_PREFIX}_parallel_multiple.json",
-    "java": f"{VERSION_PREFIX}_java.json",
-    "javascript": f"{VERSION_PREFIX}_javascript.json",
-    # "rest": f"{VERSION_PREFIX}_rest.json",
-    # "sql": f"{VERSION_PREFIX}_sql.json",
-    # "chatable": f"{VERSION_PREFIX}_chatable.json",
 
-    # V2 Live Datasets
-    "live_simple": f"{VERSION_PREFIX}_live_simple.json",
-    "live_multiple": f"{VERSION_PREFIX}_live_multiple.json",
-    "live_parallel": f"{VERSION_PREFIX}_live_parallel.json",
-    "live_parallel_multiple": f"{VERSION_PREFIX}_live_parallel_multiple.json",
-    "live_irrelevance": f"{VERSION_PREFIX}_live_irrelevance.json",
-    "live_relevance": f"{VERSION_PREFIX}_live_relevance.json",
+# Identifiers of the available memory backends. Each entry yields one
+# "memory_<backend>" test category (see MEMORY_CATEGORY).
+ALL_AVAILABLE_MEMORY_BACKENDS = [
+    "kv",
+    "vector",
+    "rec_sum",
+    "knowledge_graph",
+]
 
-    # V3 Multi-turn Datasets
-    "multi_turn_base": f"{VERSION_PREFIX}_multi_turn_base.json",
-    "multi_turn_miss_func": f"{VERSION_PREFIX}_multi_turn_miss_func.json",
-    "multi_turn_miss_param": f"{VERSION_PREFIX}_multi_turn_miss_param.json",
-    "multi_turn_long_context": f"{VERSION_PREFIX}_multi_turn_long_context.json",
-    # "multi_turn_composite": f"{VERSION_PREFIX}_multi_turn_composite.json",
-}
+# Single-turn categories without the "live_" prefix.
+# Commented-out entries are not part of the list (i.e. not benchmarked here).
+NON_LIVE_CATEGORY = [
+    "simple",
+    "java",
+    "javascript",
+    "multiple",
+    "parallel",
+    "parallel_multiple",
+    "irrelevance",
+    # "exec_simple",
+    # "exec_parallel",
+    # "exec_multiple",
+    # "exec_parallel_multiple",
+    # "rest",
+    # "sql",
+    # "chatable",
+]
+# Single-turn categories with the "live_" prefix.
+LIVE_CATEGORY = [
+    "live_simple",
+    "live_multiple",
+    "live_parallel",
+    "live_parallel_multiple",
+    "live_irrelevance",
+    "live_relevance",
+]
+# Multi-turn categories; the commented-out entry is not part of the list.
+MULTI_TURN_CATEGORY = [
+    "multi_turn_base",
+    "multi_turn_miss_func",
+    "multi_turn_miss_param",
+    "multi_turn_long_context",
+    # "multi_turn_composite",
+]
+# Web-search category; combined with the memory categories into AGENTIC_CATEGORY.
+WEB_SEARCH_CATEGORY = [
+    "web_search",
+]
+
+# One category per memory backend, e.g. "memory_kv".
+MEMORY_CATEGORY = [f"memory_{backend}" for backend in ALL_AVAILABLE_MEMORY_BACKENDS]
+# Memory scenario names.
+# NOTE(review): these equal the keys of MEMORY_AGENT_SETTINGS; keep the two in sync.
+MEMORY_SCENARIO_NAME = [
+    "student",
+    "customer",
+    "finance",
+    "healthcare",
+    "notetaker",
+]
+
+
+# Aggregate groups, built by list concatenation (element order is preserved).
+SINGLE_TURN_CATEGORY = NON_LIVE_CATEGORY + LIVE_CATEGORY
+AGENTIC_CATEGORY = MEMORY_CATEGORY + WEB_SEARCH_CATEGORY
+
+# Every category: single-turn, then multi-turn, then agentic.
+ALL_CATEGORIES = SINGLE_TURN_CATEGORY + MULTI_TURN_CATEGORY + AGENTIC_CATEGORY
 
 TEST_COLLECTION_MAPPING = {
-    "all": [
-        "simple",
-        "irrelevance",
-        "parallel",
-        "multiple",
-        "parallel_multiple",
-        "java",
-        "javascript",
-        "live_simple",
-        "live_multiple",
-        "live_parallel",
-        "live_parallel_multiple",
-        "live_irrelevance",
-        "live_relevance",
-        "multi_turn_base",
-        "multi_turn_miss_func",
-        "multi_turn_miss_param",
-        "multi_turn_long_context",
-    ],
-    "multi_turn": [
-        "multi_turn_base",
-        "multi_turn_miss_func",
-        "multi_turn_miss_param",
-        "multi_turn_long_context",
-    ],
-    "single_turn": [
-        "simple",
-        "irrelevance",
-        "parallel",
-        "multiple",
-        "parallel_multiple",
-        "java",
-        "javascript",
-        "live_simple",
-        "live_multiple",
-        "live_parallel",
-        "live_parallel_multiple",
-        "live_irrelevance",
-        "live_relevance",
-    ],
-    "live": [
-        "live_simple",
-        "live_multiple",
-        "live_parallel",
-        "live_parallel_multiple",
-        "live_irrelevance",
-        "live_relevance",
-    ],
-    "non_live": [
-        "simple",
-        "irrelevance",
-        "parallel",
-        "multiple",
-        "parallel_multiple",
-        "java",
-        "javascript",
-    ],
-    "ast": [
-        "simple",
-        "irrelevance",
-        "parallel",
-        "multiple",
-        "parallel_multiple",
-        "java",
-        "javascript",
-        "live_simple",
-        "live_multiple",
-        "live_parallel",
-        "live_parallel_multiple",
-        "live_irrelevance",
-        "live_relevance",
-    ],
+    "all": ALL_CATEGORIES,
+    "multi_turn": MULTI_TURN_CATEGORY,
+    "single_turn": SINGLE_TURN_CATEGORY,
+    "live": LIVE_CATEGORY,
+    "non_live": NON_LIVE_CATEGORY,
     "non_python": [
         "java",
         "javascript",
@@ -125,15 +81,6 @@
         "live_irrelevance",
         "live_relevance",
     ],
-}
-
-MULTI_TURN_FUNC_DOC_FILE_MAPPING = {
-    "GorillaFileSystem": "gorilla_file_system.json",
-    "MathAPI": "math_api.json",
-    "MessageAPI": "message_api.json",
-    "TwitterAPI": "posting_api.json",
-    "TicketAPI": "ticket_api.json",
-    "TradingBot": "trading_bot.json",
-    "TravelAPI": "travel_booking.json",
-    "VehicleControlAPI": "vehicle_control.json",
+    "memory": MEMORY_CATEGORY,
+    "agentic": AGENTIC_CATEGORY,
 }
@@ -38,6 +38,19 @@
 ]
 
 
+# Column headers for the agentic results: rank, model, overall agentic accuracy,
+# web search, then the memory summary followed by one column per memory backend.
+# NOTE(review): presumably the header row of the agentic score table; confirm
+# against the code that writes it.
+COLUMNS_AGENTIC = [
+    "Rank",
+    "Model",
+    "Agentic Overall Acc",
+    "Web Search",
+    "Memory Summary",
+    "Memory KV",
+    "Memory Vector",
+    "Memory Recursive Summarization",
+    "Memory Knowledge Graph",
+]
+
+
 COLUMNS_OVERALL = [
     "Rank",
     "Overall Acc",
@@ -62,6 +75,9 @@
     "Multi Turn Miss Func",
     "Multi Turn Miss Param",
     "Multi Turn Long Context",
+    "Agentic Acc",
+    "Agentic Web Search",
+    "Agentic Memory Summary",
     "Relevance Detection",
     "Irrelevance Detection",
     "Organization",
 
@@ -17,6 +17,45 @@
 """
 )
 
-DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC = "I have updated some more functions you can choose from. What about now?"
+# User-turn text announcing that more functions are now available.
+# NOTE(review): the "_FC" suffix suggests the function-calling mode; confirm at the call site.
+DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC = (
+    "I have updated some more functions you can choose from. What about now?"
+)
+
+# Same text preceded by a "{functions}" placeholder and a newline; the caller is
+# expected to fill in the placeholder (presumably via str.format; confirm).
+DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING = (
+    "{functions}\n" + DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC
+)
+
+# Extra system-prompt text describing the required final-answer format.
+# The text contains literal braces ({'answer': ...}), so it cannot be passed
+# through str.format as-is.
+ADDITIONAL_SYSTEM_PROMPT_FOR_AGENTIC_RESPONSE_FORMAT = """For your final answer to the user, you must respond in this format: {'answer': A short and precise answer to the question, 'context': A brief explanation of how you arrived at this answer or why it is correct}. If you do not know the answer, respond with {'answer': 'I do not know', 'context': 'I do not know'}. If you think the question cannot be properly answered, response with {'answer': 'I cannot answer this question', 'context': A short reason explaining why this question cannot be answered}.
+"""
+
+# Scenario name -> persona/instruction text for that memory scenario.
+# NOTE(review): presumably substituted for the "{scenario_setting}" placeholder
+# of the MEMORY_BACKEND_INSTRUCTION_* templates; confirm at the call site.
+MEMORY_AGENT_SETTINGS = {
+    "student": "You are an academic-support assistant for college student. Remember key personal and academic details discussed across sessions, and draw on them to answer questions or give guidance.",
+    "customer": "You are a general customer support assistant for an e-commerce platform. Your task is to understand and remember information that can be used to provide information about user inquiries, preferences, and offer consistent, helpful assistance over multiple interactions.",
+    "finance": "You are a high-level executive assistant supporting a senior finance professional. Retain and synthesize both personal and professional information including facts, goals, prior decisions, and family life across sessions to provide strategic, context-rich guidance and continuity.",
+    "healthcare": "You are a healthcare assistant supporting a patient across appointments. Retain essential medical history, treatment plans, and personal preferences to offer coherent, context-aware guidance and reminders.",
+    "notetaker": "You are a personal organization assistant. Capture key information from conversations, like tasks, deadlines, and preferences, and use it to give reliable reminders and answers in future sessions.",
+}
+
+
+MEMORY_BACKEND_INSTRUCTION_CORE_ARCHIVAL = """{scenario_setting}
+
+You have access to an advanced memory system, consisting of two memory types 'Core Memory' and 'Archival Memory'. Both type of memory is persistent across multiple conversations with the user, and can be accessed in a later interactions. You should actively manage your memory data to keep track of important information, ensure that it is up-to-date and easy to retrieve to provide personalized responses to the user later.
 
-DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING = "{functions}\n" + DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC
+The Core memory is limited in size, but always visible to you in context. The Archival Memory has a much larger capacity, but will be held outside of your immediate context due to its size.
+
+Here is the content of your Core Memory from previous interactions:
+{memory_content}
+"""
+
+# Instruction template for a single (unified) memory system, without any stored
+# content. Exposes one placeholder: {scenario_setting}.
+MEMORY_BACKEND_INSTRUCTION_UNIFIED_WITHOUT_CONTENT = """{scenario_setting}
+
+You have access to an advanced memory system, which is persistent across multiple conversations with the user, and can be accessed in a later interactions. You should actively manage your memory data to keep track of important information, ensure that it is up-to-date and easy to retrieve to provide personalized responses to the user later.
+"""
+
+# The template above followed by the stored memory.
+# Exposes two placeholders: {scenario_setting} and {memory_content}.
+MEMORY_BACKEND_INSTRUCTION_UNIFIED = (
+    MEMORY_BACKEND_INSTRUCTION_UNIFIED_WITHOUT_CONTENT
+    + """
+Here is the content of your memory system from previous interactions:
+{memory_content}
+"""
+)
@@ -14,7 +14,7 @@
 DOTENV_PATH = "./.env"
 UTILS_PATH = "./utils/"
 TEST_IDS_TO_GENERATE_PATH = "./test_case_ids_to_generate.json"
-
+MEMORY_PREREQ_CONVERSATION_PATH = "./data/memory_prereq_conversation/"
 
 
 RED_FONT = "\033[91m"
@@ -32,6 +32,7 @@
 DOTENV_PATH = (PROJECT_ROOT / DOTENV_PATH).resolve()
 UTILS_PATH = (PROJECT_ROOT / UTILS_PATH).resolve()
 TEST_IDS_TO_GENERATE_PATH = (PROJECT_ROOT / TEST_IDS_TO_GENERATE_PATH).resolve()
+MEMORY_PREREQ_CONVERSATION_PATH = (PROJECT_ROOT / MEMORY_PREREQ_CONVERSATION_PATH).resolve()
 
 RESULT_PATH.mkdir(parents=True, exist_ok=True)
 SCORE_PATH.mkdir(parents=True, exist_ok=True)
@@ -0,0 +1,49 @@
+# Backend class name -> file name of the JSON document describing its functions.
+# Uses the same keys as CLASS_FILE_PATH_MAPPING.
+# NOTE(review): the directory these file names are resolved against is not
+# visible here; confirm at the call site.
+MULTI_TURN_FUNC_DOC_FILE_MAPPING = {
+    "GorillaFileSystem": "gorilla_file_system.json",
+    "MathAPI": "math_api.json",
+    "MessageAPI": "message_api.json",
+    "TwitterAPI": "posting_api.json",
+    "TicketAPI": "ticket_api.json",
+    "TradingBot": "trading_bot.json",
+    "TravelAPI": "travel_booking.json",
+    "VehicleControlAPI": "vehicle_control.json",
+    "WebSearchAPI": "web_search.json",
+    "MemoryAPI_kv": "memory_kv.json",
+    "MemoryAPI_vector": "memory_vector.json",
+    "MemoryAPI_rec_sum": "memory_rec_sum.json",
+    "MemoryAPI_knowledge_graph": "memory_knowledge_graph.json",
+}
+
+# Dotted package prefix shared by every backend module listed below.
+BACKEND_PATH_PREFIX = "bfcl.eval_checker.multi_turn_eval.func_source_code"
+
+# Backend class name -> dotted module path of the module that implements it.
+# NOTE(review): presumably consumed by a dynamic import; confirm at the call site.
+CLASS_FILE_PATH_MAPPING = {
+    "GorillaFileSystem": f"{BACKEND_PATH_PREFIX}.gorilla_file_system",
+    "MathAPI": f"{BACKEND_PATH_PREFIX}.math_api",
+    "MessageAPI": f"{BACKEND_PATH_PREFIX}.message_api",
+    "TwitterAPI": f"{BACKEND_PATH_PREFIX}.posting_api",
+    "TicketAPI": f"{BACKEND_PATH_PREFIX}.ticket_api",
+    "TradingBot": f"{BACKEND_PATH_PREFIX}.trading_bot",
+    "TravelAPI": f"{BACKEND_PATH_PREFIX}.travel_booking",
+    "VehicleControlAPI": f"{BACKEND_PATH_PREFIX}.vehicle_control",
+    # The following classes are not part of the multi-turn categories suite, but they share the same evaluation pipeline for simplicity
+    "WebSearchAPI": f"{BACKEND_PATH_PREFIX}.web_search",
+    "MemoryAPI_kv": f"{BACKEND_PATH_PREFIX}.memory_kv",
+    "MemoryAPI_vector": f"{BACKEND_PATH_PREFIX}.memory_vector",
+    "MemoryAPI_rec_sum": f"{BACKEND_PATH_PREFIX}.memory_rec_sum",
+    "MemoryAPI_knowledge_graph": f"{BACKEND_PATH_PREFIX}.memory_knowledge_graph",
+}
+
+# These classes are stateless and do not require any initial configuration.
+# Every entry is a key of CLASS_FILE_PATH_MAPPING.
+STATELESS_CLASSES = [
+    "MathAPI",
+    "WebSearchAPI",
+]
+
+# These classes are stateful, but their state is too verbose to include in the inference log.
+# Their state will be displayed and stored in separate files.
+# The list covers every "MemoryAPI_*" key of CLASS_FILE_PATH_MAPPING.
+OMIT_STATE_INFO_CLASSES = [
+    "MemoryAPI_kv",
+    "MemoryAPI_vector",
+    "MemoryAPI_rec_sum",
+    "MemoryAPI_knowledge_graph",
+]
@@ -0,0 +1,50 @@
+# from bfcl.eval_checker.ast_eval.ast_checker import standardize_string
+import re
+
+#### Main functions ####
+
+
+def agentic_checker(model_response: str, possible_answer_list: list[str]) -> dict:
+    """
+    Check whether any of the possible answers appears in the model response.
+
+    Both the response and every possible answer are normalized with
+    `standardize_string` (lowercased, the punctuation ",./-_*^()" removed,
+    single quotes turned into double quotes). Whitespace is kept. An answer
+    counts as found only when it matches on regex word boundaries, so "12" is
+    not found inside "2012".
+
+    Args:
+        model_response: The model's final answer. Normally a string; a list is
+            reduced to its first element (an empty list is treated as an empty
+            response) and any other non-string value is converted with `str()`.
+        possible_answer_list: The accepted answers; one match is enough.
+
+    Returns:
+        `{"valid": True, "error": []}` when an answer is found. Otherwise a dict
+        with `"valid": False` plus `"error_message"`, `"error_type"` and a
+        `"details"` dict holding the raw and normalized inputs.
+    """
+    standardized_possible_answer_list = [
+        standardize_string(possible_answer) for possible_answer in possible_answer_list
+    ]
+    # Sometimes the model response is a list of one string. An empty list means
+    # the model produced nothing; score it as a miss instead of raising IndexError.
+    if isinstance(model_response, list):
+        model_response = model_response[0] if model_response else ""
+    if not isinstance(model_response, str):
+        model_response = str(model_response)
+
+    standardized_model_response = standardize_string(model_response)
+
+    # re.escape keeps characters that survive normalization (e.g. "+", "$", "?")
+    # from being interpreted as regex syntax.
+    if any(
+        re.search(rf"\b{re.escape(possible_answer)}\b", standardized_model_response)
+        for possible_answer in standardized_possible_answer_list
+    ):
+        return {"valid": True, "error": []}
+
+    return {
+        "valid": False,
+        "error_message": "None of the expected answers were found in the model response.",
+        "error_type": "agentic:answer_not_found",
+        "details": {
+            "model_response": model_response,
+            "possible_answers": possible_answer_list,
+            "standardized_model_response": standardized_model_response,
+            "standardized_possible_answers": standardized_possible_answer_list,
+        },
+    }
+
+
+#### Helper functions ####
+
+
+def standardize_string(input_string: str):
+    """
+    Normalize a string before comparing the model output with the possible answers.
+
+    Removes the punctuation characters ",./-_*^()", converts the result to
+    lowercase, and then converts all the single quotes to double quotes.
+
+    Whitespace is NOT removed (the character class below contains none), so
+    "April 1, 2024" and "April 1 2024" both become "april 1 2024", while
+    "April 1,2024" becomes "april 12024" and therefore does not equal them.
+    """
+    regex_string = r"[\,\.\/\-\_\*\^\(\)]"
+    return re.sub(regex_string, "", input_string).lower().replace("'", '"')