From 0e146002d5ff0744b0411a239c9d0a914df7ccad Mon Sep 17 00:00:00 2001 From: Sahar Mor Date: Wed, 8 Apr 2026 12:22:17 -0700 Subject: [PATCH] Claude Optimize: Use structured outputs to guarantee valid JSON from Claude, +3 more --- libs/cua-bench/cua_bench/agents/cua_agent.py | 2 +- libs/python/agent/agent/loops/anthropic.py | 12 ++-- libs/python/agent/agent/ui/gradio/app.py | 8 +-- libs/python/cua-cli/cua_cli/commands/do.py | 68 +++++++++++++------- 4 files changed, 58 insertions(+), 32 deletions(-) diff --git a/libs/cua-bench/cua_bench/agents/cua_agent.py b/libs/cua-bench/cua_bench/agents/cua_agent.py index e4a783a4a..cb2523e69 100644 --- a/libs/cua-bench/cua_bench/agents/cua_agent.py +++ b/libs/cua-bench/cua_bench/agents/cua_agent.py @@ -44,7 +44,7 @@ class CuaAgent(BaseAgent): def __init__(self, **kwargs): super().__init__(**kwargs) - self.model = kwargs.get("model", "anthropic/claude-sonnet-4-20250514") + self.model = kwargs.get("model", "anthropic/claude-sonnet-4-6") self.max_steps = kwargs.get("max_steps", 100) # Number of times to retry the entire task when a transient API error occurs. 
# Task-level retry restarts agent.run() from scratch but does NOT reset the diff --git a/libs/python/agent/agent/loops/anthropic.py b/libs/python/agent/agent/loops/anthropic.py index e0e087008..cdf416ed0 100644 --- a/libs/python/agent/agent/loops/anthropic.py +++ b/libs/python/agent/agent/loops/anthropic.py @@ -1667,14 +1667,13 @@ def _convert_completion_to_responses_items( return responses_items -def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: +def _add_cache_control(completion_messages: List[Dict[str, Any]], max_breakpoints: int = 4) -> List[Dict[str, Any]]: """Add cache control to completion messages""" num_writes = 0 for message in completion_messages: message["cache_control"] = {"type": "ephemeral"} num_writes += 1 - # Cache control has a maximum of 4 blocks - if num_writes >= 4: + if num_writes >= max_breakpoints: break return completion_messages @@ -1788,10 +1787,13 @@ async def predict_step( ) scale_x, scale_y = scale_factors if use_prompt_caching: + # Cache the tools prefix (1 breakpoint on the last tool) + if anthropic_tools: + anthropic_tools[-1]["cache_control"] = {"type": "ephemeral"} # First combine messages to reduce number of blocks completion_messages = _combine_completion_messages(completion_messages) - # Then add cache control, anthropic requires explicit "cache_control" dicts - completion_messages = _add_cache_control(completion_messages) + # Use remaining breakpoints (max 3) on messages + completion_messages = _add_cache_control(completion_messages, max_breakpoints=3) # Prepare API call kwargs api_kwargs = { diff --git a/libs/python/agent/agent/ui/gradio/app.py b/libs/python/agent/agent/ui/gradio/app.py index a3d02fc34..b3a2e0566 100644 --- a/libs/python/agent/agent/ui/gradio/app.py +++ b/libs/python/agent/agent/ui/gradio/app.py @@ -115,10 +115,10 @@ def save_settings(settings: Dict[str, Any]): "OpenAI: Computer-Use Preview": "openai/computer-use-preview", }, "anthropic": { - "default": 
"anthropic/claude-3-7-sonnet-20250219", - "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514", - "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514", - "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219", + "default": "anthropic/claude-sonnet-4-6", + "Anthropic: Claude Opus 4.6": "anthropic/claude-opus-4-6", + "Anthropic: Claude Sonnet 4.6": "anthropic/claude-sonnet-4-6", + "Anthropic: Claude Sonnet 4.5": "anthropic/claude-sonnet-4-5-20250929", }, "omni": { "default": "omniparser+openai/gpt-4o", diff --git a/libs/python/cua-cli/cua_cli/commands/do.py b/libs/python/cua-cli/cua_cli/commands/do.py index 1fa4fc22e..b9b16c8ce 100644 --- a/libs/python/cua-cli/cua_cli/commands/do.py +++ b/libs/python/cua-cli/cua_cli/commands/do.py @@ -800,17 +800,46 @@ async def _run() -> int: "1. Write a 1-2 sentence summary of what is currently on screen.\n" "2. List every interactive element visible (buttons, links, inputs, " "menus, checkboxes, dropdowns, etc.) with its center coordinates " - "in image pixels (origin = top-left). Be precise.\n\n" - "Respond in this exact JSON format:\n" - '{"summary": "...", "elements": [{"name": "...", "type": "...", "x": N, "y": N}, ...]}\n' + "in image pixels (origin = top-left). Be precise." 
) if extra: prompt += f"\nAdditional instructions: {extra}" + screenshot_analysis_tool = { + "name": "screenshot_analysis", + "description": "Report the analysis of the screenshot, including a summary and all interactive elements.", + "input_schema": { + "type": "object", + "properties": { + "summary": { + "type": "string", + "description": "1-2 sentence summary of what is currently on screen", + }, + "elements": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string", "description": "Element label or text"}, + "type": {"type": "string", "description": "Element type (button, link, input, etc.)"}, + "x": {"type": "integer", "description": "Center X coordinate in pixels"}, + "y": {"type": "integer", "description": "Center Y coordinate in pixels"}, + }, + "required": ["name", "type", "x", "y"], + }, + "description": "All interactive elements visible on screen", + }, + }, + "required": ["summary", "elements"], + }, + } + img_b64 = base64.b64encode(img_bytes).decode() response = client.messages.create( model="claude-haiku-4-5-20251001", max_tokens=1024, + tools=[screenshot_analysis_tool], + tool_choice={"type": "tool", "name": "screenshot_analysis"}, messages=[ { "role": "user", @@ -829,26 +858,21 @@ async def _run() -> int: ], ) - raw = response.content[0].text.strip() - # Try to parse JSON; fall back to raw text - try: - parsed = json.loads(raw) - summary = parsed.get("summary", "") - elements = parsed.get("elements", []) - print(f"✅ snapshot — {save_path}") - print() - print(summary) - if elements: - print() - print("Interactive elements:") - for el in elements: - print( - f" • {el.get('name', '?')} [{el.get('type', '?')}] ({el.get('x', '?')}, {el.get('y', '?')})" - ) - except json.JSONDecodeError: - print(f"✅ snapshot — {save_path}") + # Extract the forced tool_use block — its input is already a parsed dict; schema conformance is best-effort (not strictly enforced, and may be truncated at max_tokens) + tool_block = next(b for b in response.content if b.type == "tool_use") + parsed = tool_block.input + summary = 
parsed.get("summary", "") + elements = parsed.get("elements", []) + print(f"✅ snapshot — {save_path}") + print() + print(summary) + if elements: print() - print(raw) + print("Interactive elements:") + for el in elements: + print( + f" • {el.get('name', '?')} [{el.get('type', '?')}] ({el.get('x', '?')}, {el.get('y', '?')})" + ) except ImportError: await _print_context(state["provider"], state.get("name", ""), state)