50 changes: 50 additions & 0 deletions environments/medxpertqa/README.md
@@ -0,0 +1,50 @@
# medxpertqa


### Overview
- **Environment ID**: `medxpertqa`
- **Short description**: MedXpertQA is a highly challenging and comprehensive benchmark designed to evaluate expert-level medical knowledge and advanced reasoning capabilities. We only use the text subset for now.
- **Tags**: mcq

### Datasets
- **Primary dataset(s)**: TsinghuaC3I/MedXpertQA
- **Source links**: [HuggingFace](https://huggingface.co/datasets/TsinghuaC3I/MedXpertQA)
- **Split sizes**: test subset - 2.45k rows

### Task
- **Type**: single-turn
- **Parser**: `ThinkParser` with `extract_boxed_answer` when `use_think=true`, otherwise a plain `Parser` with `extract_boxed_answer`
- **Rubric overview**: a reward of 1.0 is awarded if the boxed answer matches the correct option label (case-insensitive), else 0.0. For example, a completion ending in `\boxed{B}` scores 1.0 when the label is `B`.

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval medxpertqa
```

Configure model and sampling:

```bash
uv run vf-eval medxpertqa -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7
```

Notes:
- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.

### Environment Arguments
Supported environment arguments:

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `use_think` | bool | `false` | If `true`, use a `<think>`-style system prompt with `ThinkParser`; otherwise use a plain boxed-answer prompt |
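
For example, to enable the `<think>`-style system prompt, pass `use_think` as JSON via `-a`:

```bash
uv run vf-eval medxpertqa -a '{"use_think": true}'
```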

### Metrics
The rubric emits the following metrics:

| Metric | Meaning |
| ------ | ------- |
| `reward` | Main scalar reward: 1.0 if the parsed boxed answer matches the dataset label (case-insensitive), else 0.0 |
| `medxpertqa_reward_func` | The single rubric criterion; identical to `reward` since it is the only function and has weight 1.0 |

83 changes: 83 additions & 0 deletions environments/medxpertqa/medxpertqa.py
@@ -0,0 +1,83 @@
import verifiers as vf
from datasets import load_dataset
from verifiers.utils.data_utils import extract_boxed_answer


def _get_system_prompt(use_think: bool) -> str:
    think_system_prompt = "You are a helpful medical assistant. Think step-by-step inside <think>...</think> tags. Put your final answer within \\boxed{}."
    no_think_system_prompt = (
        "You are a helpful medical assistant. Think step-by-step and put your final answer within \\boxed{}."
    )
    system_prompt = think_system_prompt if use_think else no_think_system_prompt
    return system_prompt


def _format_question_with_options(question_with_options: str, options) -> str:
    """
    Rebuild the composite question string from the standalone stem and options.
    This keeps the current formatting while letting us randomize the options later.
    """
    if not options:
        return question_with_options

    if isinstance(options, dict):
        option_items = list(options.items())
    elif isinstance(options, list):
        option_items = [(chr(ord("A") + idx), value) for idx, value in enumerate(options)]
    else:
        return question_with_options

    question, sep, _ = question_with_options.partition("Answer Choices:")
    question = question.strip() if sep else question_with_options.strip()
    formatted_options = " ".join(f"({key}) {value}" for key, value in option_items)
    if not formatted_options:
        return question
    return f"{question}\nAnswer Choices: {formatted_options}"


def load_environment(
    use_think: bool = False,
) -> vf.Environment:
    """
    MedXpertQA environment using equality with the "label" column as the eval criterion.
    Loads the text subset of MedXpertQA and compares the model's boxed answer with the
    ground truth in the "label" column.
    """
    # Only the text-only "Text" configuration is used for now.
    full_dataset = load_dataset("TsinghuaC3I/MedXpertQA", "Text")

    def _map(ex):
        # Rebuild the question text from the stem and options, and expose the
        # ground-truth option letter as "answer".
        return {
            "question": _format_question_with_options(ex["question"], ex.get("options")),
            "answer": ex["label"],
            "task": "medxpertqa",
        }

    mapped = full_dataset["test"].map(_map)

    # Build the parser once; it extracts the \boxed{...} answer from the completion.
    parser = vf.ThinkParser(extract_boxed_answer) if use_think else vf.Parser(extract_boxed_answer)

    async def medxpertqa_reward_func(
        completion,
        answer: str,
    ) -> float:
        """
        Reward function for the MedXpertQA environment.
        Compares the parsed boxed answer with the ground-truth option label.
        Returns 1.0 if they match (case-insensitive), else 0.0.
        """
        final_answer = parser.parse_answer(completion)
        if final_answer is None:
            # No boxed answer could be extracted from the completion.
            return 0.0
        return 1.0 if final_answer.strip().lower() == answer.strip().lower() else 0.0

    rubric = vf.Rubric(funcs=[medxpertqa_reward_func], weights=[1.0])

    vf_env = vf.SingleTurnEnv(eval_dataset=mapped, system_prompt=_get_system_prompt(use_think), rubric=rubric)

    return vf_env
23 changes: 23 additions & 0 deletions environments/medxpertqa/pyproject.toml
@@ -0,0 +1,23 @@
[project]
name = "medxpertqa"
description = "MedXpertQA is a highly challenging and comprehensive benchmark designed to evaluate expert-level medical knowledge and advanced reasoning capabilities. We use the text subset for now."
tags = ["eval"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"verifiers>=0.1.4",
"datasets>=4.0.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["medxpertqa.py"]

[tool.prime.environment]
# lets Prime/vf-eval know where the loader lives in a flat repo
loader = "medxpertqa:load_environment"
display_name = "MedXpertQA"
visibility = "PUBLIC"