50 changes: 50 additions & 0 deletions environments/medxpertqa/README.md
@@ -0,0 +1,50 @@
# medxpertqa


### Overview
- **Environment ID**: `medxpertqa`
- **Short description**: MedXpertQA is a highly challenging and comprehensive benchmark designed to evaluate expert-level medical knowledge and advanced reasoning capabilities. We only use the text subset for now.
- **Tags**: mcq

### Datasets
- **Primary dataset(s)**: TsinghuaC3I/MedXpertQA
- **Source links**: [HuggingFace](https://huggingface.co/datasets/TsinghuaC3I/MedXpertQA)
- **Split sizes**: test subset - 2.45k rows

### Task
- **Type**: single-turn
- **Parser**: `ThinkParser` with `extract_boxed_answer` when `use_think=true`, otherwise a plain `Parser` with `extract_boxed_answer`
- **Rubric overview**: a reward of 1.0 is awarded if the boxed answer matches the correct option label (case-insensitive), else 0.0. For example, a completion ending in `\boxed{B}` scores 1.0 when the label is `B`.

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval medxpertqa
```

Configure model and sampling:

```bash
uv run vf-eval medxpertqa -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7
```

Notes:
- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.

### Environment Arguments
Supported environment arguments:

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `use_think` | bool | `false` | If `true`, use a `<think>`-style system prompt with `ThinkParser`; otherwise use a plain boxed-answer prompt |
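
For example, to enable the `<think>`-style system prompt, pass `use_think` as JSON via `-a`:

```bash
uv run vf-eval medxpertqa -a '{"use_think": true}'
```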

### Metrics
The rubric emits the following metrics:

| Metric | Meaning |
| ------ | ------- |
| `reward` | Main scalar reward: 1.0 if the parsed boxed answer matches the dataset label (case-insensitive), else 0.0 |
| `medxpertqa_reward_func` | The single rubric criterion; identical to `reward` since it is the only function and has weight 1.0 |

83 changes: 83 additions & 0 deletions environments/medxpertqa/medxpertqa.py
@@ -0,0 +1,83 @@
import verifiers as vf
from datasets import load_dataset
from verifiers.utils.data_utils import extract_boxed_answer


def _get_system_prompt(use_think: bool) -> str:
    think_system_prompt = "You are a helpful medical assistant. Think step-by-step inside <think>...</think> tags. Put your final answer within \\boxed{}."
    no_think_system_prompt = (
        "You are a helpful medical assistant. Think step-by-step and put your final answer within \\boxed{}."
    )
    system_prompt = think_system_prompt if use_think else no_think_system_prompt
    return system_prompt


def _format_question_with_options(question_with_options: str, options) -> str:
    """
    Rebuild the composite question string from the standalone stem and options.
    This keeps the current formatting while letting us randomize the options later.
    """
    if not options:
        return question_with_options

    if isinstance(options, dict):
        option_items = list(options.items())
    elif isinstance(options, list):
        option_items = [(chr(ord("A") + idx), value) for idx, value in enumerate(options)]
    else:
        return question_with_options

    question, sep, _ = question_with_options.partition("Answer Choices:")
    question = question.strip() if sep else question_with_options.strip()
    formatted_options = " ".join(f"({key}) {value}" for key, value in option_items)
    if not formatted_options:
        return question
    return f"{question}\nAnswer Choices: {formatted_options}"


def load_environment(
    use_think: bool = False,
) -> vf.Environment:
    """
    MedXpertQA environment using equality with the "label" column as the eval criterion.
    Loads the text subset of MedXpertQA and compares the model's boxed answer with the
    ground truth in the "label" column.
    """
    # Only the text-only "Text" configuration is used for now.
    full_dataset = load_dataset("TsinghuaC3I/MedXpertQA", "Text")

    def _map(ex):
        # Rebuild the question text from the stem and options, and expose the
        # ground-truth option letter as "answer".
        return {
            "question": _format_question_with_options(ex["question"], ex.get("options")),
            "answer": ex["label"],
            "task": "medxpertqa",
        }

    mapped = full_dataset["test"].map(_map)

    # Build the parser once; it extracts the \boxed{...} answer from the completion.
    parser = vf.ThinkParser(extract_boxed_answer) if use_think else vf.Parser(extract_boxed_answer)

    async def medxpertqa_reward_func(
        completion,
        answer: str,
    ) -> float:
        """
        Reward function for the MedXpertQA environment.
        Compares the parsed boxed answer with the ground-truth option label.
        Returns 1.0 if they match (case-insensitive), else 0.0.
        """
        final_answer = parser.parse_answer(completion)
        if final_answer is None:
            # No boxed answer could be extracted from the completion.
            return 0.0
        return 1.0 if final_answer.strip().lower() == answer.strip().lower() else 0.0

    rubric = vf.Rubric(funcs=[medxpertqa_reward_func], weights=[1.0])

    vf_env = vf.SingleTurnEnv(eval_dataset=mapped, system_prompt=_get_system_prompt(use_think), rubric=rubric)

    return vf_env
23 changes: 23 additions & 0 deletions environments/medxpertqa/pyproject.toml
@@ -0,0 +1,23 @@
[project]
name = "medxpertqa"
description = "MedXpertQA is a highly challenging and comprehensive benchmark designed to evaluate expert-level medical knowledge and advanced reasoning capabilities. We use the text subset for now."
tags = ["eval"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"verifiers>=0.1.4",
"datasets>=4.0.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["medxpertqa.py"]

[tool.prime.environment]
# lets Prime/vf-eval know where the loader lives in a flat repo
loader = "medxpertqa:load_environment"
display_name = "MedXpertQA"
visibility = "PUBLIC"