Commit 2d62efc

Merge pull request #221 from OpenPipe/ruler-docs
ruler docs
2 parents 1f1165f + 6bd33e6 commit 2d62efc

12 files changed: +2149 −1538 lines

docs/docs.json

Lines changed: 5 additions & 9 deletions
```diff
@@ -56,21 +56,17 @@
       "pages": [
         "fundamentals/training-loop",
         "fundamentals/art-client",
-        "fundamentals/art-backend"
+        "fundamentals/art-backend",
+        "fundamentals/ruler"
       ]
     },
     {
       "group": "Tutorials",
-      "pages": [
-        "tutorials/summarizer"
-      ]
+      "pages": ["tutorials/summarizer"]
     },
     {
       "group": "Resources",
-      "pages": [
-        "resources/models",
-        "resources/glossary"
-      ]
+      "pages": ["resources/models", "resources/glossary"]
     }
   ]
 },
@@ -81,4 +77,4 @@
     "bluesky": "https://bsky.app/profile/openpipe.bsky.social",
     "github": "https://github.com/openpipe/ART"
   }
-}
+}
```

docs/fundamentals/ruler.mdx

Lines changed: 299 additions & 0 deletions
---
title: "RULER"
icon: "ruler"
description: "Learn how to use RULER to automatically reward your agents."
---

# 📏 RULER: Relative Universal LLM-Elicited Rewards

RULER (Relative Universal LLM-Elicited Rewards) is a general-purpose reward function that uses an LLM-as-judge to rank multiple agent trajectories. It requires no labeled data, expert feedback, or hand-crafted reward functions, yet reliably improves agent performance.

<div align="center">
  <img src="/images/ruler-results.png" alt="RULER Performance Results" style={{maxWidth: "100%", height: "auto"}} />
  <p><em>RULER performance across multiple tasks at launch. In 3 out of 4 tasks, models trained with RULER slightly outperform those trained with hand-crafted reward functions. See the full <a href="https://openpipe.ai/blog/ruler">launch announcement</a> for details.</em></p>
</div>

## Key Benefits

- **No labeled data required**: RULER works by comparing trajectories against each other
- **General-purpose**: Can be applied to a wide variety of RL tasks without modification
- **Fast development**: Can reduce implementation time by 2-3x compared to hand-crafted rewards
- **Strong performance**: Often matches or exceeds hand-crafted reward functions

## How RULER Works

RULER leverages two key insights:

1. **Relative scoring is easier than absolute scoring**: It's easier for an LLM to rank several solutions relative to each other than to score them in isolation
2. **GRPO only needs relative scores**: Since GRPO normalizes scores within each group, only the relative rankings matter, not absolute values

The process:

1. Generate N trajectories for a given scenario
2. Pass all N trajectories to RULER
3. RULER deduplicates common prefixes (e.g., identical system messages)
4. An LLM judge scores each trajectory from 0 to 1 based on goal achievement
5. These scores are used directly as rewards in GRPO training (see the sketch below)
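Because GRPO standardizes rewards within each group, only the judge's ordering matters. Here is a minimal sketch of that normalization (illustrative only; the real computation happens inside the trainer):

```python
# Illustrative sketch: GRPO turns group rewards into advantages by
# normalizing within the group. Shifting or linearly rescaling every
# score leaves the advantages unchanged, so the judge never needs
# absolute calibration, only a sensible relative ranking.
import statistics

ruler_scores = [0.9, 0.5, 0.1]  # judge's scores for one group of 3 trajectories

mean = statistics.mean(ruler_scores)
stdev = statistics.pstdev(ruler_scores)
advantages = [(s - mean) / stdev for s in ruler_scores]

print(advantages)  # approximately [1.22, 0.0, -1.22]
```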
## Basic Usage

```python
import art
from art.rewards import ruler_score_group

# Create a TrajectoryGroup from your trajectories
group = art.TrajectoryGroup([...])  # List of art.Trajectory objects

# Use RULER to score them
judged_group = await ruler_score_group(
    group,
    "openai/o3",
    debug=True,  # Shows the judge's reasoning
)

# Access the scores
if judged_group:  # Check if judging succeeded
    for traj in judged_group.trajectories:
        print(f"Reward: {traj.reward}")
        print(f"RULER explanation: {traj.logs[-1]}")
```
## Complete Example: Joke Generation

Here's a toy example showing how RULER ranks trajectories of varying quality:

```python
import asyncio

import art
from art.rewards import ruler_score_group
from openai.types.chat.chat_completion import Choice
from openai.types.chat import ChatCompletionMessage


async def main():
    # Initial messages shared by all trajectories
    initial_messages = [
        {"role": "system", "content": "You are a comedy writer. Generate funny jokes based on the given topic."},
        {"role": "user", "content": "Tell me a funny joke about computers"},
    ]

    # Create three trajectories with different quality responses
    good_trajectory = art.Trajectory(
        messages_and_choices=[
            *initial_messages,
            Choice(finish_reason="stop", index=0, message=ChatCompletionMessage(
                role="assistant",
                content="Why don't computers ever get invited to parties?\n\nBecause they always crash! 🥁\n\nBut seriously, have you tried turning them off and on again?",
            )),
        ],
        reward=0.0,
    )

    mediocre_trajectory = art.Trajectory(
        messages_and_choices=[
            *initial_messages,
            Choice(finish_reason="stop", index=0, message=ChatCompletionMessage(
                role="assistant",
                content="What do you call a computer that doesn't work?\n\nBroken.",
            )),
        ],
        reward=0.0,
    )

    off_topic_trajectory = art.Trajectory(
        messages_and_choices=[
            *initial_messages,
            Choice(finish_reason="stop", index=0, message=ChatCompletionMessage(
                role="assistant",
                content="I don't really know jokes about computers, but here's a fact: The sky is blue because of Rayleigh scattering.",
            )),
        ],
        reward=0.0,
    )

    # Create a TrajectoryGroup and use RULER to score it
    group = art.TrajectoryGroup([good_trajectory, mediocre_trajectory, off_topic_trajectory])
    judged_group = await ruler_score_group(group, "openai/o3", debug=True)

    # Display rankings
    if judged_group:
        sorted_trajectories = sorted(judged_group.trajectories, key=lambda t: t.reward, reverse=True)
        for rank, traj in enumerate(sorted_trajectories, 1):
            messages = traj.messages()
            print(f"Rank {rank}: Score {traj.reward:.3f}")
            print(f"  Response: {messages[-1]['content'][:50]}...")


asyncio.run(main())
```

### Example Output

```
[RULER] Pretty-printed LLM choice JSON:
{
  'scores': [
    {
      'trajectory_id': '1',
      'explanation': 'This joke cleverly connects computer crashes with social situations, making it relatable and humorous. It also includes a common tech support line for added humor.',
      'score': 0.9
    },
    {
      'trajectory_id': '2',
      'explanation': "While this joke is straightforward and a pun, it's quite simple and lacks depth. Still, it stays relevant to the computer theme.",
      'score': 0.5
    },
    {
      'trajectory_id': '3',
      'explanation': 'This trajectory fails to deliver a joke about computers, instead providing an unrelated fact, resulting in a very low score.',
      'score': 0.1
    }
  ]
}

Rank 1: Score 0.900
  Response: Why don't computers ever get invited to parties?...
Rank 2: Score 0.500
  Response: What do you call a computer that doesn't work?...
Rank 3: Score 0.100
  Response: I don't really know jokes about computers, but h...
```
## Customization

### Judge Model

You can use any LLM supported by LiteLLM as the judge:

```python
# Using o4-mini
await ruler_score_group(group, "openai/o4-mini")

# Using Claude
await ruler_score_group(group, "anthropic/claude-sonnet-4-20250514")

# Using local models
await ruler_score_group(group, "ollama/qwen3:32b")
```
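The judge is called through LiteLLM, so hosted providers are authenticated the usual LiteLLM way. A minimal sketch, assuming LiteLLM's default configuration and its standard environment variable names:

```python
import os

# Assumption: LiteLLM's default setup reads provider keys from standard
# environment variables; set whichever your chosen judge needs.
os.environ["OPENAI_API_KEY"] = "sk-..."         # for openai/* judges
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."  # for anthropic/* judges
```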
### Extra LiteLLM Parameters

You can pass additional parameters to LiteLLM to fine-tune the judge's behavior:

```python
# Adjust temperature and max tokens
await ruler_score_group(
    group,
    "openai/o3",
    extra_litellm_params={"temperature": 0.7, "max_tokens": 1000},
)

# Use a custom API base for local models
await ruler_score_group(
    group,
    "openai/gpt-4",
    extra_litellm_params={"api_base": "http://localhost:8000"},
)
```
### Custom Rubric

While the default rubric works well for most tasks, you can provide a custom one:

```python
custom_rubric = """
- Prioritize responses that are concise and clear
- Penalize responses that include emojis or informal language
- Reward responses that cite sources
"""

await ruler_score_group(
    group,
    "openai/o3",
    rubric=custom_rubric,
)
```
### Using Raw Message Lists

If you're not using `art.Trajectory` objects, you can use the lower-level `ruler` function:

```python
from art.rewards import ruler

# Each message list is a list of ChatCompletionMessageParam dicts
message_lists = [
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is 2+2?"},
        {"role": "assistant", "content": "2+2 equals 4."},
    ],
    # ... more trajectories
]

scores = await ruler(
    message_lists,
    "openai/o3",
)

for score in scores:
    print(f"Trajectory {score.trajectory_id}: {score.score} - {score.explanation}")
```
## Best Practices

1. **Clear system prompts**: RULER uses the system prompt to understand the agent's goal, so make sure your system prompts clearly describe what the agent should do (see the illustrative prompts below).

2. **Group size**: Use 4-8 trajectories per group for a good balance between diversity and cost. Very large groups are not recommended because they can confuse the judge.

3. **Debug mode**: Enable `debug=True` to see the judge's reasoning, which helps identify scoring patterns.

4. **Judge selection**: Cheaper models like Qwen3 32B often work well and are more cost-effective than larger models.
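To make point 1 concrete, here is a hypothetical pair of system prompts (both invented for this illustration); the second gives RULER explicit criteria to rank trajectories against:

```python
# Too vague: the judge cannot infer what a successful trajectory looks like.
vague_system_prompt = "You are a helpful assistant."

# Better: states the task, the expected output, and the success criteria.
clear_system_prompt = (
    "You are an email research agent. Given a user's question, search their "
    "inbox and reply with a concise answer that cites the source message."
)
```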
## Integration with Training

RULER integrates into ART's training loop using the `gather_trajectory_groups` helper with an `after_each` callback:

```python
import art
from art.rewards import ruler_score_group

# In your training loop
groups = await art.gather_trajectory_groups(
    (
        art.TrajectoryGroup(
            rollout(model, scenario) for _ in range(4)  # 4 trajectories per group
        )
        for scenario in batch_scenarios
    ),
    after_each=lambda group: ruler_score_group(
        group,
        "openai/o3",
        swallow_exceptions=True,  # Return None on error, filtering out the group
    ),
)

# Train on the judged groups
await model.train(groups)
```

The `swallow_exceptions=True` parameter is recommended in production to handle judge API failures gracefully: groups that fail to be judged are simply filtered out rather than crashing the training loop.
## Performance Tips

- **Caching**: RULER automatically caches judge responses to disk to avoid redundant API calls
- **Batch processing**: Process multiple groups in parallel when possible (see the sketch after this list)
- **Token efficiency**: Common prefixes are automatically deduplicated to save tokens
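A minimal sketch of scoring several groups concurrently, assuming each group can be judged independently (`score_groups_in_parallel` is a hypothetical helper, not part of the ART API):

```python
import asyncio

import art
from art.rewards import ruler_score_group


async def score_groups_in_parallel(
    groups: list[art.TrajectoryGroup],
) -> list[art.TrajectoryGroup]:
    # Each judge call is an independent API request, so run them concurrently.
    judged = await asyncio.gather(
        *(ruler_score_group(g, "openai/o3", swallow_exceptions=True) for g in groups)
    )
    # swallow_exceptions=True yields None for groups that failed judging.
    return [g for g in judged if g is not None]
```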
## Troubleshooting

### Low scores for all trajectories

- Check that your system prompt clearly defines the task
- Ensure trajectories are actually attempting the task
- Try the default rubric before customizing

### Inconsistent rankings

- Increase group size for more stable relative rankings
- Use a more capable judge model
- Add more specific criteria to your rubric

### High API costs

- Use cheaper judge models (e.g., Qwen3 32B)
- Reduce group size

docs/images/ruler-results.png

127 KB

examples/art-e/all_experiments.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -146,7 +146,7 @@

 # Model 217: like 206 but with Qwen/Qwen3-14B base model and nothink option enabled
 models["217"] = models["206"].model_copy(deep=True)
-models["217"].name = "email-agent-217-2"
+models["217"].name = "email-agent-217-3"
 models["217"].base_model = "Qwen/Qwen3-14B"
 models["217"].config.include_qwen3_nothink = True

@@ -203,4 +203,4 @@

 models["224"] = models["223"].model_copy(deep=True)
 models["224"].name = "email-agent-224"
-models["224"].config.learning_rate = 2e-6
+models["224"].config.learning_rate = 1e-6
```

examples/art-e/art_e/test_ruler.py

Lines changed: 11 additions & 7 deletions
```diff
@@ -5,7 +5,7 @@
 from art_e.data.query_iterators import load_synthetic_queries
 from art_e.rollout import rollout
 from tqdm.asyncio import tqdm
-from art.rewards import art_ruler
+from art.rewards import ruler_score_group

 load_dotenv()

@@ -49,15 +49,19 @@ async def main():
     for m, t in zip(models, rollouts):
         print(f"  {m.name:10s}: {t.reward:.3f}")

-    judged_rollouts = await art_ruler(
-        rollouts,
-        {"model": "openai/o3"},
+    # Create a TrajectoryGroup from the rollouts
+    group = art.TrajectoryGroup(rollouts)
+
+    judged_group = await ruler_score_group(
+        group,
+        "openai/o3",
         debug=True,
     )

-    print("\nGroup-judge rewards:")
-    for m, t in zip(models, judged_rollouts):
-        print(f"  {m.name:10s}: {t.reward:.3f}")
+    if judged_group:
+        print("\nRULER rewards:")
+        for m, t in zip(models, judged_group.trajectories):
+            print(f"  {m.name:10s}: {t.reward:.3f}")


 asyncio.run(main())
```
