From da536dbeb2fc067721dd7e2866e871b7533c36b6 Mon Sep 17 00:00:00 2001
From: hillday <524081959@qq.com>
Date: Thu, 2 Apr 2026 20:34:11 +0800
Subject: [PATCH 1/2] feat(agent): add Doubao (Volcengine Ark) agent loop

---
 libs/python/agent/agent/loops/__init__.py |   2 +
 libs/python/agent/agent/loops/doubao.py   | 314 ++++++++++++++++++++++
 2 files changed, 316 insertions(+)
 create mode 100644 libs/python/agent/agent/loops/doubao.py

diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py
index 2b528cad6..3dbad0be2 100644
--- a/libs/python/agent/agent/loops/__init__.py
+++ b/libs/python/agent/agent/loops/__init__.py
@@ -6,6 +6,7 @@
 from . import (
     anthropic,
     composed_grounded,
+    doubao,
     fara,
     gelato,
     gemini,
@@ -27,6 +28,7 @@
 __all__ = [
     "anthropic",
     "composed_grounded",
+    "doubao",
     "gelato",
     "gemini",
     "generic_vlm",
diff --git a/libs/python/agent/agent/loops/doubao.py b/libs/python/agent/agent/loops/doubao.py
new file mode 100644
index 000000000..ff4f6c4fb
--- /dev/null
+++ b/libs/python/agent/agent/loops/doubao.py
@@ -0,0 +1,314 @@
+"""
+Doubao (Volcengine Ark) Responses API agent loop implementation.
+Based on OpenAI adapter but customized for Doubao's specific API requirements.
+"""
+
+import asyncio
+import base64
+import json
+import logging
+import os
+import time
+from datetime import datetime
+from io import BytesIO
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import litellm
+from PIL import Image
+
+from ..decorators import register_agent
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+from .openai import _map_computer_tool_to_openai, _prepare_tools_for_openai
+
+logger = logging.getLogger(__name__)
+
+
+def _normalize_xy(x: int, y: int, width: int, height: int) -> Tuple[int, int]:
+    """Normalize physical coordinates into the 1000x1000 space."""
+    width = max(1, int(width))  # keep the divisor at >= 1
+    height = max(1, int(height))
+    nx = max(0, min(1000, int(round((x / width) * 1000))))  # clamped to [0, 1000]
+    ny = max(0, min(1000, int(round((y / height) * 1000))))
+    return nx, ny
+
+
+def _denormalize_xy(
+    nx: float, ny: float, target_w: int = 1024, target_h: int = 768
+) -> Tuple[int, int]:
+    """
+    Map 1000x1000 normalized coordinates back to the Computer Server's physical coordinate system.
+    """
+    x = int(round((nx / 1000.0) * target_w))  # rounded, not clamped: nx=1000 yields target_w
+    y = int(round((ny / 1000.0) * target_h))
+    return x, y
+
+
+@register_agent(models=r".*doubao.*", priority=10)
+class DoubaoComputerAgentConfig:
+    """
+    Doubao (Volcengine) agent configuration using litellm responses.
+    Specially handles Doubao's 'reasoning' and 'input' field requirements.
+    Uses 1000x1000 normalized coordinates for model communication.
+    Converts back to 1024x768 target coordinates for Computer Server.
+    """
+
+    def __init__(self):
+        # 创建调试截图保存目录
+        self.debug_dir = Path(os.getcwd()) / "debug_runs"
+        self.debug_dir.mkdir(exist_ok=True)
+
+    async def _save_debug_screenshot(self, image_b64: str, step_name: str) -> str:
+        """保存 Base64 截图到本地用于调试"""
+        try:
+            # 使用更精确的时间戳（包含毫秒），防止同一秒内的多张截图相互覆盖
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+            filename = f"step_{timestamp}_{step_name}.png"
+            file_path = self.debug_dir / filename
+
+            image_data = base64.b64decode(image_b64)
+            with open(file_path, "wb") as f:
+                f.write(image_data)
+            return str(file_path)
+        except Exception as e:
+            logger.warning(f"Failed to save debug screenshot: {e}")
+            return ""
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        use_prompt_caching: Optional[bool] = False,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:  # response as a dict: computer x/y in physical pixels, plus "usage"
+        tools = tools or []
+
+        # 1. Get the physical screen size (used to map the model's 1000x1000 output to pixels)
+        physical_width, physical_height = 1024, 768  # default if no handler size is available
+        if computer_handler and hasattr(computer_handler, "get_dimensions"):
+            try:
+                physical_width, physical_height = await computer_handler.get_dimensions()
+                logger.info(
+                    f"📏 [物理尺寸] 从 computer_handler 获取到物理分辨率: {physical_width}x{physical_height}"
+                )
+            except Exception as e:
+                logger.warning(f"⚠️ [物理尺寸] 无法获取物理分辨率: {e}")
+
+        # 调试保存最新截图
+        found_latest = False
+        for msg in reversed(messages):
+            if found_latest:
+                break
+            if msg.get("role") == "user" and isinstance(msg.get("content"), list):
+                for part in msg["content"]:
+                    if part.get("type") == "input_image":
+                        image_b64 = part["image_url"].split(",")[-1]
+                        await self._save_debug_screenshot(image_b64, "predict_step")
+                        found_latest = True
+                        break
+
+        # 2. Prepare tools for OpenAI-compatible API
+        # 强制告诉模型屏幕是 1000x1000 (归一化空间)
+        openai_tools = []
+        for schema in tools:
+            if schema["type"] == "computer":
+                computer_tool = {
+                    "type": "function",
+                    "name": "computer",
+                    "description": (
+                        f"Use a mouse and keyboard to interact with a computer, and take screenshots.\n"
+                        f"Screen resolution: 1000x1000 units.\n"
+                        f"Environment: windows."
+                    ),
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "action": {
+                                "type": "string",
+                                "enum": [
+                                    "click",
+                                    "double_click",
+                                    "right_click",
+                                    "type",
+                                    "keypress",
+                                    "scroll",
+                                    "move",
+                                    "drag",
+                                    "screenshot",
+                                    "wait",
+                                    "terminate",
+                                ],
+                            },
+                            "x": {"type": "integer", "description": "X coordinate (0-1000)"},
+                            "y": {"type": "integer", "description": "Y coordinate (0-1000)"},
+                            "text": {"type": "string"},
+                            "keys": {"type": "array", "items": {"type": "string"}},
+                        },
+                        "required": ["action"],
+                    },
+                }
+                openai_tools.append(computer_tool)
+            elif schema["type"] == "function":
+                func = schema["function"]
+                openai_tools.append(
+                    {
+                        "type": "function",
+                        "name": func["name"],
+                        "description": func.get("description", ""),
+                        "parameters": func.get("parameters", {}),
+                    }
+                )
+
+        # 3. Call API
+        api_kwargs = {
+            "model": model,
+            "input": messages,
+            "tools": openai_tools if openai_tools else None,
+            "stream": stream,
+            "reasoning": {},  # NOTE(review): empty config, presumably a Doubao requirement; confirm
+            "num_retries": max_retries,
+            **kwargs,  # caller kwargs come last, so they override the keys above
+        }
+
+        if _on_api_start:
+            await _on_api_start(api_kwargs)
+        response = await litellm.aresponses(**api_kwargs)
+        if _on_api_end:
+            await _on_api_end(api_kwargs, response)
+
+        # 4. Core conversion: map the model's 1000x1000 coordinates back to physical ones
+        output_dict = response if isinstance(response, dict) else response.model_dump()
+        for item in output_dict.get("output", []):
+            if item.get("type") == "function_call" and item.get("name") == "computer":
+                args = item.get("arguments", "{}")
+                if isinstance(args, str):
+                    try:
+                        args = json.loads(args)
+                    except json.JSONDecodeError as e:
+                        logger.warning(f"⚠️ [JSON解析失败] 无法解析工具调用参数: {args}. 错误: {e}")
+                        # Try a simple cleanup: strip any markdown code-fence markers
+                        cleaned_args = args.strip()
+                        if cleaned_args.startswith("```json"):
+                            cleaned_args = cleaned_args[7:]
+                        if cleaned_args.endswith("```"):
+                            cleaned_args = cleaned_args[:-3]
+                        cleaned_args = cleaned_args.strip()
+
+                        try:
+                            args = json.loads(cleaned_args)
+                        except json.JSONDecodeError:
+                            # Still failing: skip this item and leave it to agent.py downstream
+                            continue
+
+                if "x" in args and "y" in args:  # only top-level x/y are rescaled
+                    nx, ny = float(args["x"]), float(args["y"])
+                    target_x, target_y = _denormalize_xy(nx, ny, physical_width, physical_height)
+                    logger.info(
+                        f"🎯 [坐标还原] 模型预测({nx}, {ny}) -> 实际物理点击({target_x}, {target_y}) (基于屏幕: {physical_width}x{physical_height})"
+                    )
+                    args["x"], args["y"] = target_x, target_y
+                    item["arguments"] = json.dumps(args)  # write back as a JSON string
+
+        # Extract usage and return
+        usage = (
+            response.get("usage", {}) if isinstance(response, dict) else response.usage.model_dump()
+        )
+        if hasattr(response, "_hidden_params"):
+            usage["response_cost"] = response._hidden_params.get("response_cost", 0.0)
+        if _on_usage:
+            await _on_usage(usage)
+        output_dict["usage"] = usage
+        return output_dict
+
+    async def predict_click(
+        self, model: str, image_b64: str, instruction: str, computer_handler=None, **kwargs
+    ) -> Optional[Tuple[int, int]]:
+        """Predict click coordinates specifically for Doubao with 1000x1000 scaling."""
+        await self._save_debug_screenshot(image_b64, "predict_click")
+
+        # 获取真实物理尺寸用于还原
+        physical_width, physical_height = 1024, 768
+        if computer_handler and hasattr(computer_handler, "get_dimensions"):
+            try:
+                physical_width, physical_height = await computer_handler.get_dimensions()
+                logger.info(
+                    f"📏 [物理尺寸] predict_click 识别到物理分辨率: {physical_width}x{physical_height}"
+                )
+            except Exception as e:
+                logger.warning(f"⚠️ [物理尺寸] predict_click 无法获取物理分辨率: {e}")
+        else:  # no handler exposing get_dimensions: use the screenshot's own size
+            try:
+                image_data = base64.b64decode(image_b64)
+                image = Image.open(BytesIO(image_data))
+                physical_width, physical_height = image.size
+                logger.info(
+                    f"📏 [物理尺寸] predict_click 回退使用图像分辨率: {physical_width}x{physical_height}"
+                )
+            except Exception:
+                pass  # keep the 1024x768 default if the image cannot be decoded
+
+        input_items = [
+            {
+                "role": "user",
+                "content": f"Task: Click {instruction}. Output ONLY a click action on the target element using 1000x1000 coordinate system.",
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "input_image", "image_url": f"data:image/png;base64,{image_b64}"}
+                ],
+            },
+        ]
+
+        computer_tool = {  # click-only tool schema with x and y required
+            "type": "function",
+            "name": "computer",
+            "description": "Screen resolution: 1000x1000 units.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "action": {"type": "string", "enum": ["click"]},
+                    "x": {"type": "integer"},
+                    "y": {"type": "integer"},
+                },
+                "required": ["action", "x", "y"],
+            },
+        }
+
+        api_kwargs = {
+            "model": model,
+            "input": input_items,
+            "tools": [computer_tool],
+            "stream": False,
+            "reasoning": {},
+            "max_tokens": 200,
+            **kwargs,  # caller kwargs come last, so they override the keys above
+        }
+
+        response = await litellm.aresponses(**api_kwargs)
+        output_dict = response if isinstance(response, dict) else response.model_dump()
+
+        for item in output_dict.get("output", []):
+            if item.get("type") == "function_call" and item.get("name") == "computer":
+                args = item.get("arguments", "{}")
+                if isinstance(args, str):
+                    args = json.loads(args)  # NOTE: no try/except here; malformed JSON raises
+                if args.get("x") is not None and args.get("y") is not None:
+                    nx, ny = float(args["x"]), float(args["y"])
+                    target_x, target_y = _denormalize_xy(nx, ny, physical_width, physical_height)
+                    logger.info(
+                        f"🎯 [点击还原] 模型({nx}, {ny}) -> 实际物理({target_x}, {target_y})"
+                    )
+                    return (target_x, target_y)  # first usable computer call wins
+        return None  # no computer call carrying both x and y was found
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        return ["click", "step"]  # presumably predict_click / predict_step; verify with caller

From 692c1cea0db9f7d8a871d60f5726f8c4aec77eea Mon Sep 17 00:00:00 2001
From: hillday <524081959@qq.com>
Date: Tue, 7 Apr 2026 11:31:40 +0800
Subject: [PATCH 2/2] chore(agent): remove doubao debug screenshot saving

---
 libs/python/agent/agent/loops/doubao.py | 45 ++-----------------------
 1 file changed, 3 insertions(+), 42 deletions(-)

diff --git a/libs/python/agent/agent/loops/doubao.py b/libs/python/agent/agent/loops/doubao.py
index ff4f6c4fb..cbfa57eb1 100644
--- a/libs/python/agent/agent/loops/doubao.py
+++ b/libs/python/agent/agent/loops/doubao.py
@@ -7,11 +7,8 @@
 import base64
 import json
 import logging
-import os
 import time
-from datetime import datetime
 from io import BytesIO
-from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import litellm
@@ -53,27 +50,6 @@ class DoubaoComputerAgentConfig:
     Converts back to 1024x768 target coordinates for Computer Server.
     """
 
-    def __init__(self):
-        # 创建调试截图保存目录
-        self.debug_dir = Path(os.getcwd()) / "debug_runs"
-        self.debug_dir.mkdir(exist_ok=True)
-
-    async def _save_debug_screenshot(self, image_b64: str, step_name: str) -> str:
-        """保存 Base64 截图到本地用于调试"""
-        try:
-            # 使用更精确的时间戳（包含毫秒），防止同一秒内的多张截图相互覆盖
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-            filename = f"step_{timestamp}_{step_name}.png"
-            file_path = self.debug_dir / filename
-
-            image_data = base64.b64decode(image_b64)
-            with open(file_path, "wb") as f:
-                f.write(image_data)
-            return str(file_path)
-        except Exception as e:
-            logger.warning(f"Failed to save debug screenshot: {e}")
-            return ""
-
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -102,19 +78,6 @@ async def predict_step(
             except Exception as e:
                 logger.warning(f"⚠️ [物理尺寸] 无法获取物理分辨率: {e}")
 
-        # 调试保存最新截图
-        found_latest = False
-        for msg in reversed(messages):
-            if found_latest:
-                break
-            if msg.get("role") == "user" and isinstance(msg.get("content"), list):
-                for part in msg["content"]:
-                    if part.get("type") == "input_image":
-                        image_b64 = part["image_url"].split(",")[-1]
-                        await self._save_debug_screenshot(image_b64, "predict_step")
-                        found_latest = True
-                        break
-
         # 2. Prepare tools for OpenAI-compatible API
         # 强制告诉模型屏幕是 1000x1000 (归一化空间)
         openai_tools = []
@@ -124,9 +87,9 @@ async def predict_step(
                     "type": "function",
                     "name": "computer",
                     "description": (
-                        f"Use a mouse and keyboard to interact with a computer, and take screenshots.\n"
-                        f"Screen resolution: 1000x1000 units.\n"
-                        f"Environment: windows."
+                        "Use a mouse and keyboard to interact with a computer, and take screenshots.\n"
+                        "Screen resolution: 1000x1000 units.\n"
+                        "Environment: windows."
                     ),
                     "parameters": {
                         "type": "object",
@@ -232,8 +195,6 @@ async def predict_click(
         self, model: str, image_b64: str, instruction: str, computer_handler=None, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """Predict click coordinates specifically for Doubao with 1000x1000 scaling."""
-        await self._save_debug_screenshot(image_b64, "predict_click")
-
         # 获取真实物理尺寸用于还原
         physical_width, physical_height = 1024, 768
         if computer_handler and hasattr(computer_handler, "get_dimensions"):