trycua
diff --git a/‎docs/content/docs/agent-sdk/integrations/hud.mdx‎
Lines changed: 19 additions & 21 deletions b/‎docs/content/docs/agent-sdk/integrations/hud.mdx‎
Lines changed: 19 additions & 21 deletions
diff --git a/‎libs/python/agent/agent/agent.py‎
Lines changed: 4 additions & 0 deletions b/‎libs/python/agent/agent/agent.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎libs/python/agent/agent/callbacks/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎libs/python/agent/agent/callbacks/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎libs/python/agent/agent/callbacks/operator_validator.py‎
Lines changed: 138 additions & 0 deletions b/‎libs/python/agent/agent/callbacks/operator_validator.py‎
Lines changed: 138 additions & 0 deletions
diff --git a/‎libs/python/agent/agent/callbacks/trajectory_saver.py‎
Lines changed: 5 additions & 1 deletion b/‎libs/python/agent/agent/callbacks/trajectory_saver.py‎
Lines changed: 5 additions & 1 deletion
@@ -10,37 +10,35 @@ The HUD integration allows you to use ComputerAgent with the [HUD benchmarking f
 ```bash
 pip install "cua-agent[hud]"
 ## or install hud-python directly
-# pip install hud-python==0.2.10
+# pip install hud-python==0.4.12
 ```
 
 ## Usage
 
 ```python
-from agent.integrations.hud import run_job
-from hud import load_taskset
-from hud.taskset import TaskSet
-import logging
+# Quick single-task smoke test
+from agent.integrations.hud import run_single_task
 
-# Load taskset
-taskset = await load_taskset("OSWorld-Verified")
-taskset = TaskSet(tasks=taskset[:10]) # limit to 10 tasks instead of all 370
+await run_single_task(
+    dataset="hud-evals/OSWorld-Verified-XLang",   # or another HUD dataset
+    model="openai/computer-use-preview+openai/gpt-5-nano",  # any supported model string
+    task_id=155,  # e.g., reopen last closed tab
+)
+
+# Run a small split of OSWorld-Verified in parallel
+from agent.integrations.hud import run_full_dataset
 
-# Run benchmark job
-job = await run_job(
+results = await run_full_dataset(
+    dataset="hud-evals/OSWorld-Verified-XLang",   # can also pass a Dataset or list[dict]
     model="openai/computer-use-preview",
-    # model="anthropic/claude-3-5-sonnet-20241022",
-    # model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5",
-    task_or_taskset=taskset,
-    job_name="test-computeragent-job",
-    max_concurrent_tasks=5,
-    # add any extra ComputerAgent kwargs:
-    verbosity=logging.INFO,  # Enable logging
-    # trajectory_dir=".."       # Save trajectories locally
+    split="train[:3]",           # try a few tasks to start
+    max_concurrent=20,            # tune to your infra
+    max_steps=50                  # safety cap per task
 )
 
-# Get results OR view them at app.hud.so
-print(await job.get_analytics())
-print(f"View results at: https://app.hud.so/jobs/{job.id}")
+# Environment variables required:
+# - HUD_API_KEY (HUD access)
+# - OPENAI_API_KEY or ANTHROPIC_API_KEY depending on your chosen model(s)
 ```
 
 **Available Benchmarks:**
 
@@ -30,6 +30,7 @@
     TrajectorySaverCallback, 
     BudgetManagerCallback,
     TelemetryCallback,
+    OperatorNormalizerCallback
 )
 from .computers import (
     AsyncComputerHandler,
@@ -202,6 +203,9 @@ def __init__(
 
         # == Add built-in callbacks ==
 
+        # Prepend operator normalizer callback
+        self.callbacks.insert(0, OperatorNormalizerCallback())
+
         # Add telemetry callback if telemetry_enabled is set
         if self.telemetry_enabled:
             if isinstance(self.telemetry_enabled, bool):
 
@@ -8,6 +8,7 @@
 from .trajectory_saver import TrajectorySaverCallback
 from .budget_manager import BudgetManagerCallback
 from .telemetry import TelemetryCallback
+from .operator_validator import OperatorNormalizerCallback
 
 __all__ = [
     "AsyncCallbackHandler",
@@ -16,4 +17,5 @@
     "TrajectorySaverCallback",
     "BudgetManagerCallback",
     "TelemetryCallback",
+    "OperatorNormalizerCallback",
 ]
@@ -0,0 +1,138 @@
+"""
+OperatorNormalizerCallback
+
+Ensures agent output actions conform to expected schemas by fixing common issues:
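+- click: rename "<button>_click" action types to "click" and default button='left' if missing
+- keypress: map type/argument aliases (hotkey, key, press, ...) to "keypress"/"keys" and
+  split a key string such as "ctrl+c" into a list
+- scroll: default scroll_x / scroll_y to 0 if missing
+- coordinate: expand a "coordinate" pair into separate "x" and "y" fields
+- known action types: drop any keys that are not part of the expected schema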
+
+This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
+The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible.
+"""
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+from .base import AsyncCallbackHandler
+
+
+class OperatorNormalizerCallback(AsyncCallbackHandler):
+    """Normalizes common computer call hallucinations / errors in computer call syntax.
+
+    Fixes are applied in place to the ``action`` dict of every ``computer_call``
+    item so that a recoverable syntax slip does not cost another LLM call.
+    """
+
+    # "<button>_click" action types that are rewritten to "click" + that button.
+    _CLICK_BUTTONS = ("left", "right", "wheel", "back", "forward")
+    # Action types treated as aliases of "keypress".
+    _KEYPRESS_TYPE_ALIASES = ("hotkey", "key", "press", "key_press")
+    # Argument names accepted in place of "keys"; a later alias overrides an earlier one.
+    _KEYS_ARG_ALIASES = ("keypress", "key", "press", "key_press", "text")
+    # Keys kept for each known action type; every other key is removed.
+    _KEEP_KEYS_BY_TYPE = {
+        # OpenAI actions
+        "click": ["type", "button", "x", "y"],
+        "double_click": ["type", "x", "y"],
+        "drag": ["type", "path"],
+        "keypress": ["type", "keys"],
+        "move": ["type", "x", "y"],
+        "screenshot": ["type"],
+        "scroll": ["type", "scroll_x", "scroll_y", "x", "y"],
+        "type": ["type", "text"],
+        "wait": ["type"],
+        # Anthropic actions
+        "left_mouse_down": ["type", "x", "y"],
+        "left_mouse_up": ["type", "x", "y"],
+        "triple_click": ["type", "button", "x", "y"],
+    }
+
+    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Repair computer-call actions and demote pre-call assistant messages.
+
+        Args:
+            output: Output items of one LLM turn, as dicts. Mutated in place.
+
+        Returns:
+            The same ``output`` object, returned for chaining.
+        """
+        # First pass: normalize the action of every computer_call item.
+        for item in output or []:
+            # Non-dict entries are skipped instead of raising on ``.get``.
+            if not isinstance(item, dict) or item.get("type") != "computer_call":
+                continue
+            action = item.get("action")
+            if isinstance(action, dict):
+                self._normalize_action(action)
+
+        # Second pass: if an assistant message is immediately followed by a computer_call,
+        # replace the assistant message itself with a reasoning message with summary text.
+        if isinstance(output, list):
+            self._messages_to_reasoning(output)
+
+        return output
+
+    @classmethod
+    def _normalize_action(cls, action: Dict[str, Any]) -> None:
+        """Rewrite one computer-call ``action`` dict in place to the expected schema."""
+        # rename mouse click actions to "click"
+        raw_type = action.get("type", "")
+        for mouse_btn in cls._CLICK_BUTTONS:
+            if raw_type == f"{mouse_btn}_click":
+                action["type"] = "click"
+                action["button"] = mouse_btn
+                break
+        # rename hotkey actions to "keypress"
+        if action.get("type", "") in cls._KEYPRESS_TYPE_ALIASES:
+            action["type"] = "keypress"
+        # infer a missing type from the arguments that are present
+        if "type" not in action:
+            if "button" in action or "click" in action:
+                action["type"] = "click"
+            elif "scroll_x" in action or "scroll_y" in action:
+                action["type"] = "scroll"
+            elif "text" in action:
+                action["type"] = "type"
+
+        action_type = action.get("type")
+
+        # rename "coordinate" to "x", "y"; a malformed value is left untouched
+        # rather than raising and aborting normalization of the whole output
+        coordinate = action.get("coordinate")
+        if isinstance(coordinate, (list, tuple)) and len(coordinate) >= 2:
+            action["x"] = coordinate[0]
+            action["y"] = coordinate[1]
+            del action["coordinate"]
+
+        if action_type == "click":
+            # convert "click" to "button"
+            if "button" not in action and "click" in action:
+                action["button"] = action.pop("click")
+            # default button to "left"
+            action.setdefault("button", "left")
+        elif action_type == "scroll":
+            # add default scroll x, y if missing
+            action.setdefault("scroll_x", 0)
+            action.setdefault("scroll_y", 0)
+        elif action_type == "keypress":
+            # ensure keys arg is a list (normalize aliases first)
+            for keys_alias in cls._KEYS_ARG_ALIASES:
+                if keys_alias in action:
+                    action["keys"] = action.pop(keys_alias)
+            keys = action.get("keys")
+            if isinstance(keys, str):
+                # a single character (e.g. "+" or "-") is a key, not a separator
+                action["keys"] = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
+
+        # strip keys that are not part of the schema for known action types;
+        # the str check keeps an unhashable type value from raising in the lookup
+        keep = cls._KEEP_KEYS_BY_TYPE.get(action_type) if isinstance(action_type, str) else None
+        if keep:
+            for key in list(action):
+                if key not in keep:
+                    del action[key]
+
+    @staticmethod
+    def _messages_to_reasoning(output: List[Dict[str, Any]]) -> None:
+        """Replace each assistant message directly before a computer_call with a reasoning item."""
+        for i, item in enumerate(output):
+            # AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] }
+            if not isinstance(item, dict):
+                continue
+            if item.get("type") != "message" or item.get("role") != "assistant":
+                continue
+            next_item = output[i + 1] if i + 1 < len(output) else None
+            if not isinstance(next_item, dict) or next_item.get("type") != "computer_call":
+                continue
+            contents = item.get("content") or []
+            # Extract text from OutputContent[]
+            text_parts: List[str] = []
+            if isinstance(contents, list):
+                text_parts = [
+                    c["text"]
+                    for c in contents
+                    if isinstance(c, dict)
+                    and c.get("type") == "output_text"
+                    and isinstance(c.get("text"), str)
+                ]
+            # Replace assistant message with reasoning message
+            output[i] = {
+                "type": "reasoning",
+                "summary": [
+                    {
+                        "type": "summary_text",
+                        "text": "\n".join(text_parts).strip(),
+                    }
+                ],
+            }
@@ -94,6 +94,10 @@ def _save_artifact(self, name: str, artifact: Union[str, bytes, Dict[str, Any]])
             # format: turn_000/0000_name.json
             artifact_filename = f"{self.current_artifact:04d}_{name}"
             artifact_path = turn_dir / f"{artifact_filename}.json"
+            # add created_at
+            if isinstance(artifact, dict):
+                artifact = artifact.copy()
+                artifact["created_at"] = str(uuid.uuid1().time)
             with open(artifact_path, "w") as f:
                 json.dump(sanitize_image_urls(artifact), f, indent=2)
         self.current_artifact += 1
@@ -171,7 +175,7 @@ async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any
             "status": "completed",
             "completed_at": str(uuid.uuid1().time),
             "total_usage": self.total_usage,
-            "new_items": sanitize_image_urls(new_items),
+            "new_items": new_items,
             "total_turns": self.current_turn
         })