Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions libs/python/agent/agent/loops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from . import (
anthropic,
composed_grounded,
doubao,
fara,
gelato,
gemini,
Expand All @@ -27,6 +28,7 @@
__all__ = [
"anthropic",
"composed_grounded",
"doubao",
"gelato",
"gemini",
"generic_vlm",
Expand Down
275 changes: 275 additions & 0 deletions libs/python/agent/agent/loops/doubao.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
"""
Doubao (火山引擎 Ark) Responses API agent loop implementation.
Based on OpenAI adapter but customized for Doubao's specific API requirements.
"""

import asyncio
import base64
import json
import logging
import time
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union

import litellm
from PIL import Image

from ..decorators import register_agent
from ..types import AgentCapability, AgentResponse, Messages, Tools
from .openai import _map_computer_tool_to_openai, _prepare_tools_for_openai

logger = logging.getLogger(__name__)


def _normalize_xy(x: int, y: int, width: int, height: int) -> Tuple[int, int]:
"""将物理坐标归一化到 1000x1000 空间"""
width = max(1, int(width))
height = max(1, int(height))
nx = max(0, min(1000, int(round((x / width) * 1000))))
ny = max(0, min(1000, int(round((y / height) * 1000))))
return nx, ny


def _denormalize_xy(
nx: float, ny: float, target_w: int = 1024, target_h: int = 768
) -> Tuple[int, int]:
"""
将 1000x1000 空间的归一化坐标还原为 Computer Server 的物理坐标系。
"""
x = int(round((nx / 1000.0) * target_w))
y = int(round((ny / 1000.0) * target_h))
Comment on lines +33 to +40
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Clamp denormalized coordinates to the last valid pixel.

nx=1000 currently maps to x=target_w and ny=1000 to y=target_h. On a 1024x768 screen that yields (1024, 768), which is one past the right/bottom edge for zero-based coordinates and can miss edge targets in both step and click flows.

🐛 Proposed fix
 def _denormalize_xy(
     nx: float, ny: float, target_w: int = 1024, target_h: int = 768
 ) -> Tuple[int, int]:
     """
     将 1000x1000 空间的归一化坐标还原为 Computer Server 的物理坐标系。
     """
-    x = int(round((nx / 1000.0) * target_w))
-    y = int(round((ny / 1000.0) * target_h))
+    target_w = max(1, int(target_w))
+    target_h = max(1, int(target_h))
+    max_x = target_w - 1
+    max_y = target_h - 1
+    x = max(0, min(max_x, int(round((nx / 1000.0) * max_x))))
+    y = max(0, min(max_y, int(round((ny / 1000.0) * max_y))))
     return x, y
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def _denormalize_xy(
nx: float, ny: float, target_w: int = 1024, target_h: int = 768
) -> Tuple[int, int]:
"""
1000x1000 空间的归一化坐标还原为 Computer Server 的物理坐标系
"""
x = int(round((nx / 1000.0) * target_w))
y = int(round((ny / 1000.0) * target_h))
def _denormalize_xy(
nx: float, ny: float, target_w: int = 1024, target_h: int = 768
) -> Tuple[int, int]:
"""
1000x1000 空间的归一化坐标还原为 Computer Server 的物理坐标系
"""
target_w = max(1, int(target_w))
target_h = max(1, int(target_h))
max_x = target_w - 1
max_y = target_h - 1
x = max(0, min(max_x, int(round((nx / 1000.0) * max_x))))
y = max(0, min(max_y, int(round((ny / 1000.0) * max_y))))
return x, y
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@libs/python/agent/agent/loops/doubao.py` around lines 33 - 40,
_denormalize_xy currently maps normalized 1000-based coordinates so that nx=1000
-> x=target_w and ny=1000 -> y=target_h, producing out-of-bounds pixel indices;
update the function (_denormalize_xy) to compute x and y as before then clamp
them into the valid zero-based pixel range [0, target_w-1] and [0, target_h-1]
(e.g., use min/max or equivalent) so results never exceed the last pixel and
never go negative.

return x, y


# Restrict registration to the openai-adapter Doubao path: a bare ".*doubao.*"
# pattern would claim any model string containing "doubao" and could shadow
# other provider configs during regex-priority dispatch.
@register_agent(models=r"^openai/.*doubao.*$", priority=10)
class DoubaoComputerAgentConfig:
    """
    Doubao (Volcengine Ark) agent configuration using litellm responses.

    Specially handles Doubao's 'reasoning' and 'input' field requirements.
    Uses 1000x1000 normalized coordinates when talking to the model and
    converts them back to the physical screen coordinates expected by the
    Computer Server before returning tool calls.
    """

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        use_prompt_caching: Optional[bool] = False,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Run one agent step through litellm's Responses API.

        Args:
            messages: Responses-API input items.
            model: Model identifier (openai/ Doubao variant).
            tools: Agent tool schemas ("computer" and "function" types).
            max_retries: Forwarded to litellm as num_retries.
            stream: Must be False; streaming is not supported (see Raises).
            computer_handler: Optional handler exposing get_dimensions() for
                the physical screen size used to denormalize coordinates.
            use_prompt_caching: Accepted for interface compatibility; unused.
            _on_api_start/_on_api_end/_on_usage/_on_screenshot: lifecycle hooks.

        Returns:
            The response dict with "output" tool calls rewritten to physical
            coordinates and a "usage" entry attached.

        Raises:
            NotImplementedError: If stream=True is requested. litellm.aresponses
                returns a streaming iterator in that mode, which this code
                (model_dump / output iteration / usage access) cannot consume.
        """
        if stream:
            raise NotImplementedError(
                "DoubaoComputerAgentConfig does not support stream=True; "
                "call predict_step with stream=False."
            )
        tools = tools or []

        # 1. Get the physical screen size, used to map the model's 1000x1000
        #    output back onto real pixels.
        physical_width, physical_height = 1024, 768
        if computer_handler and hasattr(computer_handler, "get_dimensions"):
            try:
                physical_width, physical_height = await computer_handler.get_dimensions()
                logger.info(
                    f"📏 [物理尺寸] 从 computer_handler 获取到物理分辨率: {physical_width}x{physical_height}"
                )
            except Exception as e:
                logger.warning(f"⚠️ [物理尺寸] 无法获取物理分辨率: {e}")

        # 2. Prepare tools for the OpenAI-compatible API. The computer tool
        #    deliberately advertises a 1000x1000 (normalized) screen to the model.
        openai_tools = []
        for schema in tools:
            if schema["type"] == "computer":
                computer_tool = {
                    "type": "function",
                    "name": "computer",
                    "description": (
                        "Use a mouse and keyboard to interact with a computer, and take screenshots.\n"
                        "Screen resolution: 1000x1000 units.\n"
                        "Environment: windows."
                    ),
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "action": {
                                "type": "string",
                                "enum": [
                                    "click",
                                    "double_click",
                                    "right_click",
                                    "type",
                                    "keypress",
                                    "scroll",
                                    "move",
                                    "drag",
                                    "screenshot",
                                    "wait",
                                    "terminate",
                                ],
                            },
                            "x": {"type": "integer", "description": "X coordinate (0-1000)"},
                            "y": {"type": "integer", "description": "Y coordinate (0-1000)"},
                            "text": {"type": "string"},
                            "keys": {"type": "array", "items": {"type": "string"}},
                        },
                        "required": ["action"],
                    },
                }
                openai_tools.append(computer_tool)
            elif schema["type"] == "function":
                func = schema["function"]
                openai_tools.append(
                    {
                        "type": "function",
                        "name": func["name"],
                        "description": func.get("description", ""),
                        "parameters": func.get("parameters", {}),
                    }
                )

        # 3. Call the API. stream is pinned to False: a streaming iterator
        #    cannot be model_dump()'d or usage-inspected below.
        api_kwargs = {
            "model": model,
            "input": messages,
            "tools": openai_tools if openai_tools else None,
            "stream": False,
            "reasoning": {},
            "num_retries": max_retries,
            **kwargs,
        }

        if _on_api_start:
            await _on_api_start(api_kwargs)
        response = await litellm.aresponses(**api_kwargs)
        if _on_api_end:
            await _on_api_end(api_kwargs, response)

        # 4. Core conversion: map the model's 1000x1000 coordinates back to
        #    physical screen coordinates.
        output_dict = response if isinstance(response, dict) else response.model_dump()
        for item in output_dict.get("output", []):
            if item.get("type") == "function_call" and item.get("name") == "computer":
                args = item.get("arguments", "{}")
                if isinstance(args, str):
                    try:
                        args = json.loads(args)
                    except json.JSONDecodeError as e:
                        # Do NOT log the raw argument blob: it may carry typed
                        # text/keys (potentially credentials). Log metadata only.
                        logger.warning(
                            "⚠️ [JSON解析失败] 无法解析 computer 工具调用参数: error=%s raw_length=%d",
                            e,
                            len(args),
                        )
                        # Best-effort cleanup: strip possible markdown code fences.
                        cleaned_args = args.strip()
                        if cleaned_args.startswith("```json"):
                            cleaned_args = cleaned_args[7:]
                        if cleaned_args.endswith("```"):
                            cleaned_args = cleaned_args[:-3]
                        cleaned_args = cleaned_args.strip()

                        try:
                            args = json.loads(cleaned_args)
                        except json.JSONDecodeError:
                            # Still unparseable: skip this item and let the
                            # downstream agent logic deal with it.
                            continue

                if "x" in args and "y" in args:
                    nx, ny = float(args["x"]), float(args["y"])
                    target_x, target_y = _denormalize_xy(nx, ny, physical_width, physical_height)
                    logger.info(
                        f"🎯 [坐标还原] 模型预测({nx}, {ny}) -> 实际物理点击({target_x}, {target_y}) (基于屏幕: {physical_width}x{physical_height})"
                    )
                    args["x"], args["y"] = target_x, target_y
                item["arguments"] = json.dumps(args)

        # Extract usage and return. Guard against a missing usage object so a
        # provider omitting it does not crash the step.
        if isinstance(response, dict):
            usage = response.get("usage", {}) or {}
        else:
            usage = response.usage.model_dump() if response.usage else {}
        if hasattr(response, "_hidden_params"):
            usage["response_cost"] = response._hidden_params.get("response_cost", 0.0)
        if _on_usage:
            await _on_usage(usage)
        output_dict["usage"] = usage
        return output_dict

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, computer_handler=None, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """Predict click coordinates specifically for Doubao with 1000x1000 scaling.

        Returns (x, y) in physical screen coordinates, or None if the model did
        not produce a usable click action.

        NOTE(review): computer_handler is only populated if the public call
        path forwards it; otherwise we fall back to the screenshot's own size.
        """
        # Determine the real physical size used to denormalize the result.
        physical_width, physical_height = 1024, 768
        if computer_handler and hasattr(computer_handler, "get_dimensions"):
            try:
                physical_width, physical_height = await computer_handler.get_dimensions()
                logger.info(
                    f"📏 [物理尺寸] predict_click 识别到物理分辨率: {physical_width}x{physical_height}"
                )
            except Exception as e:
                logger.warning(f"⚠️ [物理尺寸] predict_click 无法获取物理分辨率: {e}")
        else:
            # Fallback: use the screenshot's own resolution as the target space.
            try:
                image_data = base64.b64decode(image_b64)
                image = Image.open(BytesIO(image_data))
                physical_width, physical_height = image.size
                logger.info(
                    f"📏 [物理尺寸] predict_click 回退使用图像分辨率: {physical_width}x{physical_height}"
                )
            except Exception:
                pass

        input_items = [
            {
                "role": "user",
                "content": f"Task: Click {instruction}. Output ONLY a click action on the target element using 1000x1000 coordinate system.",
            },
            {
                "role": "user",
                "content": [
                    {"type": "input_image", "image_url": f"data:image/png;base64,{image_b64}"}
                ],
            },
        ]

        computer_tool = {
            "type": "function",
            "name": "computer",
            "description": "Screen resolution: 1000x1000 units.",
            "parameters": {
                "type": "object",
                "properties": {
                    "action": {"type": "string", "enum": ["click"]},
                    "x": {"type": "integer"},
                    "y": {"type": "integer"},
                },
                "required": ["action", "x", "y"],
            },
        }

        api_kwargs = {
            "model": model,
            "input": input_items,
            "tools": [computer_tool],
            "stream": False,
            "reasoning": {},
            "max_tokens": 200,
            **kwargs,
        }

        response = await litellm.aresponses(**api_kwargs)
        output_dict = response if isinstance(response, dict) else response.model_dump()

        for item in output_dict.get("output", []):
            if item.get("type") == "function_call" and item.get("name") == "computer":
                args = item.get("arguments", "{}")
                if isinstance(args, str):
                    try:
                        args = json.loads(args)
                    except json.JSONDecodeError as e:
                        # Malformed tool arguments: log metadata only (never the
                        # raw blob) and keep scanning remaining output items.
                        logger.warning(
                            "⚠️ [JSON解析失败] predict_click 无法解析工具调用参数: error=%s raw_length=%d",
                            e,
                            len(args),
                        )
                        continue
                if args.get("x") is not None and args.get("y") is not None:
                    nx, ny = float(args["x"]), float(args["y"])
                    target_x, target_y = _denormalize_xy(nx, ny, physical_width, physical_height)
                    logger.info(
                        f"🎯 [点击还原] 模型({nx}, {ny}) -> 实际物理({target_x}, {target_y})"
                    )
                    return (target_x, target_y)
        return None

    def get_capabilities(self) -> List[AgentCapability]:
        """This loop supports single-click prediction and full agent steps."""
        return ["click", "step"]