
Commit 311bbf9

Merge pull request #371 from trycua/chore/hud-upgrade
[Agent] Upgrade HUD SDK to 0.4.12
2 parents 6866b5a + 5fafe86

17 files changed: +1469, -110702 lines changed

docs/content/docs/agent-sdk/integrations/hud.mdx

Lines changed: 19 additions & 21 deletions

@@ -10,37 +10,35 @@ The HUD integration allows you to use ComputerAgent with the [HUD benchmarking f
 ```bash
 pip install "cua-agent[hud]"
 ## or install hud-python directly
-# pip install hud-python==0.2.10
+# pip install hud-python==0.4.12
 ```

 ## Usage

 ```python
-from agent.integrations.hud import run_job
-from hud import load_taskset
-from hud.taskset import TaskSet
-import logging
+# Quick single-task smoke test
+from agent.integrations.hud import run_single_task

-# Load taskset
-taskset = await load_taskset("OSWorld-Verified")
-taskset = TaskSet(tasks=taskset[:10]) # limit to 10 tasks instead of all 370
+await run_single_task(
+    dataset="hud-evals/OSWorld-Verified-XLang", # or another HUD dataset
+    model="openai/computer-use-preview+openai/gpt-5-nano", # any supported model string
+    task_id=155, # e.g., reopen last closed tab
+)
+
+# Run a small split of OSWorld-Verified in parallel
+from agent.integrations.hud import run_full_dataset

-# Run benchmark job
-job = await run_job(
+results = await run_full_dataset(
+    dataset="hud-evals/OSWorld-Verified-XLang", # can also pass a Dataset or list[dict]
     model="openai/computer-use-preview",
-    # model="anthropic/claude-3-5-sonnet-20241022",
-    # model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5",
-    task_or_taskset=taskset,
-    job_name="test-computeragent-job",
-    max_concurrent_tasks=5,
-    # add any extra ComputerAgent kwargs:
-    verbosity=logging.INFO, # Enable logging
-    # trajectory_dir=".." # Save trajectories locally
+    split="train[:3]", # try a few tasks to start
+    max_concurrent=20, # tune to your infra
+    max_steps=50 # safety cap per task
 )

-# Get results OR view them at app.hud.so
-print(await job.get_analytics())
-print(f"View results at: https://app.hud.so/jobs/{job.id}")
+# Environment variables required:
+# - HUD_API_KEY (HUD access)
+# - OPENAI_API_KEY or ANTHROPIC_API_KEY depending on your chosen model(s)
 ```

 **Available Benchmarks:**

libs/python/agent/agent/agent.py

Lines changed: 4 additions & 0 deletions

@@ -30,6 +30,7 @@
     TrajectorySaverCallback,
     BudgetManagerCallback,
     TelemetryCallback,
+    OperatorNormalizerCallback
 )
 from .computers import (
     AsyncComputerHandler,
@@ -202,6 +203,9 @@ def __init__(

         # == Add built-in callbacks ==

+        # Prepend operator normalizer callback
+        self.callbacks.insert(0, OperatorNormalizerCallback())
+
         # Add telemetry callback if telemetry_enabled is set
         if self.telemetry_enabled:
             if isinstance(self.telemetry_enabled, bool):

libs/python/agent/agent/callbacks/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -8,6 +8,7 @@
 from .trajectory_saver import TrajectorySaverCallback
 from .budget_manager import BudgetManagerCallback
 from .telemetry import TelemetryCallback
+from .operator_validator import OperatorNormalizerCallback

 __all__ = [
     "AsyncCallbackHandler",
@@ -16,4 +17,5 @@
     "TrajectorySaverCallback",
     "BudgetManagerCallback",
     "TelemetryCallback",
+    "OperatorNormalizerCallback",
 ]

libs/python/agent/agent/callbacks/operator_validator.py

Lines changed: 138 additions & 0 deletions

@@ -0,0 +1,138 @@
+"""
+OperatorValidatorCallback
+
+Ensures agent output actions conform to expected schemas by fixing common issues:
+- click: add default button='left' if missing
+- keypress: wrap keys string into a list
+- etc.
+
+This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
+The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible.
+"""
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+from .base import AsyncCallbackHandler
+
+
+class OperatorNormalizerCallback(AsyncCallbackHandler):
+    """Normalizes common computer call hallucinations / errors in computer call syntax."""
+
+    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        # Mutate in-place as requested, but still return the list for chaining
+        for item in output or []:
+            if item.get("type") != "computer_call":
+                continue
+            action = item.get("action")
+            if not isinstance(action, dict):
+                continue
+
+            # rename mouse click actions to "click"
+            for mouse_btn in ["left", "right", "wheel", "back", "forward"]:
+                if action.get("type", "") == f"{mouse_btn}_click":
+                    action["type"] = "click"
+                    action["button"] = mouse_btn
+            # rename hotkey actions to "keypress"
+            for alias in ["hotkey", "key", "press", "key_press"]:
+                if action.get("type", "") == alias:
+                    action["type"] = "keypress"
+            # assume click actions
+            if "button" in action and "type" not in action:
+                action["type"] = "click"
+            if "click" in action and "type" not in action:
+                action["type"] = "click"
+            if ("scroll_x" in action or "scroll_y" in action) and "type" not in action:
+                action["type"] = "scroll"
+            if "text" in action and "type" not in action:
+                action["type"] = "type"
+
+            action_type = action.get("type")
+            def _keep_keys(action: Dict[str, Any], keys_to_keep: List[str]):
+                """Keep only the provided keys on action; delete everything else.
+                Always ensures required 'type' is present if listed in keys_to_keep.
+                """
+                for key in list(action.keys()):
+                    if key not in keys_to_keep:
+                        del action[key]
+            # rename "coordinate" to "x", "y"
+            if "coordinate" in action:
+                action["x"] = action["coordinate"][0]
+                action["y"] = action["coordinate"][1]
+                del action["coordinate"]
+            if action_type == "click":
+                # convert "click" to "button"
+                if "button" not in action and "click" in action:
+                    action["button"] = action["click"]
+                    del action["click"]
+                # default button to "left"
+                action["button"] = action.get("button", "left")
+            # add default scroll x, y if missing
+            if action_type == "scroll":
+                action["scroll_x"] = action.get("scroll_x", 0)
+                action["scroll_y"] = action.get("scroll_y", 0)
+            # ensure keys arg is a list (normalize aliases first)
+            if action_type == "keypress":
+                keys = action.get("keys")
+                for keys_alias in ["keypress", "key", "press", "key_press", "text"]:
+                    if keys_alias in action:
+                        action["keys"] = action[keys_alias]
+                        del action[keys_alias]
+                keys = action.get("keys")
+                if isinstance(keys, str):
+                    action["keys"] = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
+            required_keys_by_type = {
+                # OpenAI actions
+                "click": ["type", "button", "x", "y"],
+                "double_click": ["type", "x", "y"],
+                "drag": ["type", "path"],
+                "keypress": ["type", "keys"],
+                "move": ["type", "x", "y"],
+                "screenshot": ["type"],
+                "scroll": ["type", "scroll_x", "scroll_y", "x", "y"],
+                "type": ["type", "text"],
+                "wait": ["type"],
+                # Anthropic actions
+                "left_mouse_down": ["type", "x", "y"],
+                "left_mouse_up": ["type", "x", "y"],
+                "triple_click": ["type", "button", "x", "y"],
+            }
+            keep = required_keys_by_type.get(action_type or "")
+            if keep:
+                _keep_keys(action, keep)
+
+
+        # Second pass: if an assistant message is immediately followed by a computer_call,
+        # replace the assistant message itself with a reasoning message with summary text.
+        if isinstance(output, list):
+            for i, item in enumerate(output):
+                # AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] }
+                if item.get("type") == "message" and item.get("role") == "assistant":
+                    next_idx = i + 1
+                    if next_idx >= len(output):
+                        continue
+                    next_item = output[next_idx]
+                    if not isinstance(next_item, dict):
+                        continue
+                    if next_item.get("type") != "computer_call":
+                        continue
+                    contents = item.get("content") or []
+                    # Extract text from OutputContent[]
+                    text_parts: List[str] = []
+                    if isinstance(contents, list):
+                        for c in contents:
+                            if isinstance(c, dict) and c.get("type") == "output_text" and isinstance(c.get("text"), str):
+                                text_parts.append(c["text"])
+                    text_content = "\n".join(text_parts).strip()
+                    # Replace assistant message with reasoning message
+                    output[i] = {
+                        "type": "reasoning",
+                        "summary": [
+                            {
+                                "type": "summary_text",
+                                "text": text_content,
+                            }
+                        ],
+                    }
+
+        return output

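For illustration, here is a minimal sketch (not part of the commit) of how the new normalizer rewrites a malformed `computer_call`, assuming `OperatorNormalizerCallback` is importable from `agent.callbacks` as exported above:

```python
import asyncio

from agent.callbacks import OperatorNormalizerCallback

# A model emitted a non-standard action: "left_click" with a "coordinate" pair.
output = [
    {"type": "computer_call", "action": {"type": "left_click", "coordinate": [100, 200]}},
]

fixed = asyncio.run(OperatorNormalizerCallback().on_llm_end(output))

# The action dict is mutated in place to the expected schema:
# {"type": "click", "button": "left", "x": 100, "y": 200}
print(fixed[0]["action"])
```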
libs/python/agent/agent/callbacks/trajectory_saver.py

Lines changed: 5 additions & 1 deletion

@@ -94,6 +94,10 @@ def _save_artifact(self, name: str, artifact: Union[str, bytes, Dict[str, Any]])
         # format: turn_000/0000_name.json
         artifact_filename = f"{self.current_artifact:04d}_{name}"
         artifact_path = turn_dir / f"{artifact_filename}.json"
+        # add created_at
+        if isinstance(artifact, dict):
+            artifact = artifact.copy()
+            artifact["created_at"] = str(uuid.uuid1().time)
         with open(artifact_path, "w") as f:
             json.dump(sanitize_image_urls(artifact), f, indent=2)
         self.current_artifact += 1
@@ -171,7 +175,7 @@ async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any
             "status": "completed",
             "completed_at": str(uuid.uuid1().time),
             "total_usage": self.total_usage,
-            "new_items": sanitize_image_urls(new_items),
+            "new_items": new_items,
             "total_turns": self.current_turn
         })

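As an aside, the `created_at` value (like the existing `completed_at`) is `uuid.uuid1().time`, a count of 100-nanosecond intervals since the UUID epoch (1582-10-15). A short sketch of decoding such a stamp, using a hypothetical helper that is not part of the repo:

```python
import uuid
from datetime import datetime, timezone

# Offset between the UUID epoch (1582-10-15) and the Unix epoch, in 100-ns ticks.
UUID_TO_UNIX_OFFSET = 0x01B21DD213814000

def uuid1_time_to_datetime(stamp: int) -> datetime:
    """Convert a uuid1 .time stamp (100-ns ticks since 1582-10-15) to a UTC datetime."""
    return datetime.fromtimestamp((stamp - UUID_TO_UNIX_OFFSET) / 1e7, tz=timezone.utc)

print(uuid1_time_to_datetime(uuid.uuid1().time))  # roughly "now" in UTC
```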