Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion libs/cua-bench/cua_bench/agents/cua_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class CuaAgent(BaseAgent):

def __init__(self, **kwargs):
super().__init__(**kwargs)
self.model = kwargs.get("model", "anthropic/claude-sonnet-4-20250514")
self.model = kwargs.get("model", "anthropic/claude-sonnet-4-6")
self.max_steps = kwargs.get("max_steps", 100)
# Number of times to retry the entire task when a transient API error occurs.
# Task-level retry restarts agent.run() from scratch but does NOT reset the
Expand Down
12 changes: 7 additions & 5 deletions libs/python/agent/agent/loops/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1667,14 +1667,13 @@ def _convert_completion_to_responses_items(
return responses_items


def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
def _add_cache_control(completion_messages: List[Dict[str, Any]], max_breakpoints: int = 4) -> List[Dict[str, Any]]:
"""Add cache control to completion messages"""
num_writes = 0
for message in completion_messages:
message["cache_control"] = {"type": "ephemeral"}
num_writes += 1
# Cache control has a maximum of 4 blocks
if num_writes >= 4:
if num_writes >= max_breakpoints:
break

return completion_messages
Expand Down Expand Up @@ -1788,10 +1787,13 @@ async def predict_step(
)
scale_x, scale_y = scale_factors
if use_prompt_caching:
# Cache the tools prefix (1 breakpoint on the last tool)
if anthropic_tools:
anthropic_tools[-1]["cache_control"] = {"type": "ephemeral"}
# First combine messages to reduce number of blocks
completion_messages = _combine_completion_messages(completion_messages)
# Then add cache control, anthropic requires explicit "cache_control" dicts
completion_messages = _add_cache_control(completion_messages)
# Use remaining breakpoints (max 3) on messages
completion_messages = _add_cache_control(completion_messages, max_breakpoints=3)

# Prepare API call kwargs
api_kwargs = {
Expand Down
8 changes: 4 additions & 4 deletions libs/python/agent/agent/ui/gradio/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,10 @@ def save_settings(settings: Dict[str, Any]):
"OpenAI: Computer-Use Preview": "openai/computer-use-preview",
},
"anthropic": {
"default": "anthropic/claude-3-7-sonnet-20250219",
"Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
"Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
"Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
"default": "anthropic/claude-sonnet-4-6",
"Anthropic: Claude Opus 4.6": "anthropic/claude-opus-4-6",
"Anthropic: Claude Sonnet 4.6": "anthropic/claude-sonnet-4-6",
"Anthropic: Claude Sonnet 4.5": "anthropic/claude-sonnet-4-5-20250929",
Comment on lines +118 to +121
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Model label mismatch causes silent fallback to the wrong Anthropic model.

At lines 118-121 the old keys were removed, but libs/python/agent/agent/ui/gradio/ui_components.py (lines 41-52) still emits the old labels. Because get_model_string() falls back at line 150 when a label has no matching key, the user's selection can be ignored without warning.

Proposed backward-compatible fix
 "anthropic": {
     "default": "anthropic/claude-sonnet-4-6",
+    # Backward-compatible labels still used by ui_components.py
+    "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-6",
+    "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-6",
+    "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-sonnet-4-5-20250929",
     "Anthropic: Claude Opus 4.6": "anthropic/claude-opus-4-6",
     "Anthropic: Claude Sonnet 4.6": "anthropic/claude-sonnet-4-6",
     "Anthropic: Claude Sonnet 4.5": "anthropic/claude-sonnet-4-5-20250929",
 },
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
"default": "anthropic/claude-sonnet-4-6",
"Anthropic: Claude Opus 4.6": "anthropic/claude-opus-4-6",
"Anthropic: Claude Sonnet 4.6": "anthropic/claude-sonnet-4-6",
"Anthropic: Claude Sonnet 4.5": "anthropic/claude-sonnet-4-5-20250929",
"default": "anthropic/claude-sonnet-4-6",
# Backward-compatible labels still used by ui_components.py
"Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-6",
"Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-6",
"Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-sonnet-4-5-20250929",
"Anthropic: Claude Opus 4.6": "anthropic/claude-opus-4-6",
"Anthropic: Claude Sonnet 4.6": "anthropic/claude-sonnet-4-6",
"Anthropic: Claude Sonnet 4.5": "anthropic/claude-sonnet-4-5-20250929",

},
"omni": {
"default": "omniparser+openai/gpt-4o",
Expand Down
68 changes: 46 additions & 22 deletions libs/python/cua-cli/cua_cli/commands/do.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,17 +800,46 @@ async def _run() -> int:
"1. Write a 1-2 sentence summary of what is currently on screen.\n"
"2. List every interactive element visible (buttons, links, inputs, "
"menus, checkboxes, dropdowns, etc.) with its center coordinates "
"in image pixels (origin = top-left). Be precise.\n\n"
"Respond in this exact JSON format:\n"
'{"summary": "...", "elements": [{"name": "...", "type": "...", "x": N, "y": N}, ...]}\n'
"in image pixels (origin = top-left). Be precise."
)
if extra:
prompt += f"\nAdditional instructions: {extra}"

screenshot_analysis_tool = {
"name": "screenshot_analysis",
"description": "Report the analysis of the screenshot, including a summary and all interactive elements.",
"input_schema": {
"type": "object",
"properties": {
"summary": {
"type": "string",
"description": "1-2 sentence summary of what is currently on screen",
},
"elements": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {"type": "string", "description": "Element label or text"},
"type": {"type": "string", "description": "Element type (button, link, input, etc.)"},
"x": {"type": "integer", "description": "Center X coordinate in pixels"},
"y": {"type": "integer", "description": "Center Y coordinate in pixels"},
},
"required": ["name", "type", "x", "y"],
},
"description": "All interactive elements visible on screen",
},
},
"required": ["summary", "elements"],
},
}

img_b64 = base64.b64encode(img_bytes).decode()
response = client.messages.create(
model="claude-haiku-4-5-20251001",
max_tokens=1024,
tools=[screenshot_analysis_tool],
tool_choice={"type": "tool", "name": "screenshot_analysis"},
messages=[
{
"role": "user",
Expand All @@ -829,26 +858,21 @@ async def _run() -> int:
],
)

raw = response.content[0].text.strip()
# Try to parse JSON; fall back to raw text
try:
parsed = json.loads(raw)
summary = parsed.get("summary", "")
elements = parsed.get("elements", [])
print(f"✅ snapshot — {save_path}")
print()
print(summary)
if elements:
print()
print("Interactive elements:")
for el in elements:
print(
f" • {el.get('name', '?')} [{el.get('type', '?')}] ({el.get('x', '?')}, {el.get('y', '?')})"
)
except json.JSONDecodeError:
print(f"✅ snapshot — {save_path}")
# Extract the tool use result — guaranteed valid JSON matching the schema
tool_block = next(b for b in response.content if b.type == "tool_use")
parsed = tool_block.input
summary = parsed.get("summary", "")
elements = parsed.get("elements", [])
Comment on lines +861 to +865
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Handle missing tool_use block gracefully.

Using next() without a default raises StopIteration if Claude's response unexpectedly lacks a tool_use block. Although the outer except Exception catches it, the resulting error message would be unhelpful.

🛡️ Proposed fix
             # Extract the tool use result — guaranteed valid JSON matching the schema
-            tool_block = next(b for b in response.content if b.type == "tool_use")
+            tool_block = next((b for b in response.content if b.type == "tool_use"), None)
+            if tool_block is None:
+                await _print_context(state["provider"], state.get("name", ""), state)
+                return _fail("AI response missing tool_use block")
             parsed = tool_block.input
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@libs/python/cua-cli/cua_cli/commands/do.py` around lines 861 - 865, The code
uses next(b for b in response.content if b.type == "tool_use") which will raise
StopIteration if no tool_use block exists; change this to use next(..., None)
and then explicitly handle a missing tool_block by logging or raising a clear,
specific error (e.g., "missing tool_use block in Claude response") before
proceeding to access parsed/summary/elements; update the block around response,
tool_block, parsed, summary, and elements in do.py to check for None and bail
with a helpful message instead of relying on the outer except.

print(f"✅ snapshot — {save_path}")
print()
print(summary)
if elements:
print()
print(raw)
print("Interactive elements:")
for el in elements:
print(
f" • {el.get('name', '?')} [{el.get('type', '?')}] ({el.get('x', '?')}, {el.get('y', '?')})"
)

except ImportError:
await _print_context(state["provider"], state.get("name", ""), state)
Expand Down