From 0e146002d5ff0744b0411a239c9d0a914df7ccad Mon Sep 17 00:00:00 2001
From: Sahar Mor <saharhashai@gmail.com>
Date: Wed, 8 Apr 2026 12:22:17 -0700
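Subject: [PATCH] Claude Optimize: Use structured outputs to guarantee valid
 JSON from Claude, +3 more

Changes in this patch:

* cua-cli `do` snapshot: drop the "Respond in this exact JSON format"
  prompt text and the json.loads fallback; instead pass a
  `screenshot_analysis` tool with a JSON schema, force it via
  tool_choice, and read summary/elements from the tool_use block input.
* anthropic loop: add a `max_breakpoints` parameter (default 4) to
  _add_cache_control; when prompt caching is enabled, put one
  cache_control breakpoint on the last tool and use at most 3 on
  messages.
* cua-bench CuaAgent: change the default model to
  anthropic/claude-sonnet-4-6.
* gradio UI: replace the Anthropic model entries and default with
  claude-opus-4-6, claude-sonnet-4-6 and claude-sonnet-4-5-20250929.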

---
 libs/cua-bench/cua_bench/agents/cua_agent.py |  2 +-
 libs/python/agent/agent/loops/anthropic.py   | 12 ++--
 libs/python/agent/agent/ui/gradio/app.py     |  8 +--
 libs/python/cua-cli/cua_cli/commands/do.py   | 68 +++++++++++++-------
 4 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/libs/cua-bench/cua_bench/agents/cua_agent.py b/libs/cua-bench/cua_bench/agents/cua_agent.py
index e4a783a4a..cb2523e69 100644
--- a/libs/cua-bench/cua_bench/agents/cua_agent.py
+++ b/libs/cua-bench/cua_bench/agents/cua_agent.py
@@ -44,7 +44,7 @@ class CuaAgent(BaseAgent):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.model = kwargs.get("model", "anthropic/claude-sonnet-4-20250514")
+        self.model = kwargs.get("model", "anthropic/claude-sonnet-4-6")
         self.max_steps = kwargs.get("max_steps", 100)
         # Number of times to retry the entire task when a transient API error occurs.
         # Task-level retry restarts agent.run() from scratch but does NOT reset the
diff --git a/libs/python/agent/agent/loops/anthropic.py b/libs/python/agent/agent/loops/anthropic.py
index e0e087008..cdf416ed0 100644
--- a/libs/python/agent/agent/loops/anthropic.py
+++ b/libs/python/agent/agent/loops/anthropic.py
@@ -1667,14 +1667,13 @@ def _convert_completion_to_responses_items(
     return responses_items
 
 
-def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+def _add_cache_control(completion_messages: List[Dict[str, Any]], max_breakpoints: int = 4) -> List[Dict[str, Any]]:
     """Add cache control to completion messages"""
     num_writes = 0
     for message in completion_messages:
         message["cache_control"] = {"type": "ephemeral"}
         num_writes += 1
-        # Cache control has a maximum of 4 blocks
-        if num_writes >= 4:
+        if num_writes >= max_breakpoints:
             break
 
     return completion_messages
@@ -1788,10 +1787,13 @@ async def predict_step(
         )
         scale_x, scale_y = scale_factors
         if use_prompt_caching:
+            # Cache the tools prefix (1 breakpoint on the last tool)
+            if anthropic_tools:
+                anthropic_tools[-1]["cache_control"] = {"type": "ephemeral"}
             # First combine messages to reduce number of blocks
             completion_messages = _combine_completion_messages(completion_messages)
-            # Then add cache control, anthropic requires explicit "cache_control" dicts
-            completion_messages = _add_cache_control(completion_messages)
+            # Use remaining breakpoints (max 3) on messages
+            completion_messages = _add_cache_control(completion_messages, max_breakpoints=3)
 
         # Prepare API call kwargs
         api_kwargs = {
diff --git a/libs/python/agent/agent/ui/gradio/app.py b/libs/python/agent/agent/ui/gradio/app.py
index a3d02fc34..b3a2e0566 100644
--- a/libs/python/agent/agent/ui/gradio/app.py
+++ b/libs/python/agent/agent/ui/gradio/app.py
@@ -115,10 +115,10 @@ def save_settings(settings: Dict[str, Any]):
         "OpenAI: Computer-Use Preview": "openai/computer-use-preview",
     },
     "anthropic": {
-        "default": "anthropic/claude-3-7-sonnet-20250219",
-        "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
-        "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
-        "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
+        "default": "anthropic/claude-sonnet-4-6",
+        "Anthropic: Claude Opus 4.6": "anthropic/claude-opus-4-6",
+        "Anthropic: Claude Sonnet 4.6": "anthropic/claude-sonnet-4-6",
+        "Anthropic: Claude Sonnet 4.5": "anthropic/claude-sonnet-4-5-20250929",
     },
     "omni": {
         "default": "omniparser+openai/gpt-4o",
diff --git a/libs/python/cua-cli/cua_cli/commands/do.py b/libs/python/cua-cli/cua_cli/commands/do.py
index 1fa4fc22e..b9b16c8ce 100644
--- a/libs/python/cua-cli/cua_cli/commands/do.py
+++ b/libs/python/cua-cli/cua_cli/commands/do.py
@@ -800,17 +800,46 @@ async def _run() -> int:
                 "1. Write a 1-2 sentence summary of what is currently on screen.\n"
                 "2. List every interactive element visible (buttons, links, inputs, "
                 "menus, checkboxes, dropdowns, etc.) with its center coordinates "
-                "in image pixels (origin = top-left). Be precise.\n\n"
-                "Respond in this exact JSON format:\n"
-                '{"summary": "...", "elements": [{"name": "...", "type": "...", "x": N, "y": N}, ...]}\n'
+                "in image pixels (origin = top-left). Be precise."
             )
             if extra:
                 prompt += f"\nAdditional instructions: {extra}"
 
+            screenshot_analysis_tool = {
+                "name": "screenshot_analysis",
+                "description": "Report the analysis of the screenshot, including a summary and all interactive elements.",
+                "input_schema": {
+                    "type": "object",
+                    "properties": {
+                        "summary": {
+                            "type": "string",
+                            "description": "1-2 sentence summary of what is currently on screen",
+                        },
+                        "elements": {
+                            "type": "array",
+                            "items": {
+                                "type": "object",
+                                "properties": {
+                                    "name": {"type": "string", "description": "Element label or text"},
+                                    "type": {"type": "string", "description": "Element type (button, link, input, etc.)"},
+                                    "x": {"type": "integer", "description": "Center X coordinate in pixels"},
+                                    "y": {"type": "integer", "description": "Center Y coordinate in pixels"},
+                                },
+                                "required": ["name", "type", "x", "y"],
+                            },
+                            "description": "All interactive elements visible on screen",
+                        },
+                    },
+                    "required": ["summary", "elements"],
+                },
+            }
+
             img_b64 = base64.b64encode(img_bytes).decode()
             response = client.messages.create(
                 model="claude-haiku-4-5-20251001",
                 max_tokens=1024,
+                tools=[screenshot_analysis_tool],
+                tool_choice={"type": "tool", "name": "screenshot_analysis"},
                 messages=[
                     {
                         "role": "user",
@@ -829,26 +858,21 @@ async def _run() -> int:
                 ],
             )
 
-            raw = response.content[0].text.strip()
-            # Try to parse JSON; fall back to raw text
-            try:
-                parsed = json.loads(raw)
-                summary = parsed.get("summary", "")
-                elements = parsed.get("elements", [])
-                print(f"✅ snapshot — {save_path}")
-                print()
-                print(summary)
-                if elements:
-                    print()
-                    print("Interactive elements:")
-                    for el in elements:
-                        print(
-                            f"  • {el.get('name', '?')} [{el.get('type', '?')}]  ({el.get('x', '?')}, {el.get('y', '?')})"
-                        )
-            except json.JSONDecodeError:
-                print(f"✅ snapshot — {save_path}")
+            # Extract the forced tool call's input (already parsed, no json.loads); the tool sets no strict mode, so fields are still read defensively
+            tool_block = next(b for b in response.content if b.type == "tool_use")
+            parsed = tool_block.input
+            summary = parsed.get("summary", "")
+            elements = parsed.get("elements", [])
+            print(f"✅ snapshot — {save_path}")
+            print()
+            print(summary)
+            if elements:
                 print()
-                print(raw)
+                print("Interactive elements:")
+                for el in elements:
+                    print(
+                        f"  • {el.get('name', '?')} [{el.get('type', '?')}]  ({el.get('x', '?')}, {el.get('y', '?')})"
+                    )
 
         except ImportError:
             await _print_context(state["provider"], state.get("name", ""), state)