def _pydantic_model_to_response_format(output_type: Type[BaseModel]) -> Dict[str, Any]:
    """Convert a Pydantic BaseModel class to a response_format dict for LLM APIs.

    Args:
        output_type: Pydantic model class; its ``__name__`` and
            ``model_json_schema()`` are read to build the result.

    Returns:
        A dict shaped as
        ``{"type": "json_schema", "json_schema": {"name": ..., "schema": ...}}``.
    """
    return {
        "type": "json_schema",
        "json_schema": {
            "name": output_type.__name__,
            "schema": output_type.model_json_schema(),
        },
    }


def _extract_text_from_output_item(item: Dict[str, Any]) -> Optional[str]:
    """Extract text content from a ResponseOutputMessageParam item.

    Content blocks may be plain dicts or objects exposing ``type`` / ``text``
    attributes; both forms are accepted.

    Args:
        item: Output item whose ``"content"`` entry is expected to be a
            sequence of content blocks.

    Returns:
        The text of the first block whose type is ``"output_text"`` (an empty
        string when that block carries no ``text`` entry), or ``None`` when
        the item has no such block or its content is missing, ``None`` or not
        a list/tuple.
    """
    content = item.get("content")
    # `.get("content", [])` still yields None when the key is present but
    # null, and a bare string would be walked character by character; treat
    # anything that is not a real sequence of blocks as "no text".
    if not isinstance(content, (list, tuple)):
        return None

    for block in content:
        if isinstance(block, dict):
            if block.get("type") == "output_text":
                return block.get("text", "")
        elif getattr(block, "type", None) == "output_text":
            return getattr(block, "text", "")
    return None
When set, the LLM is instructed to return JSON conforming to the schema, and the final text response is parsed into an instance of this model. **additional_generation_kwargs: Additional arguments passed to the model provider """ # If the loop is "human/human", we need to prefix a grounding model fallback @@ -322,6 +348,7 @@ def __init__( self.trust_remote_code = trust_remote_code self.api_key = api_key self.api_base = api_base + self.output_type = output_type # == Add built-in callbacks == @@ -905,6 +932,7 @@ async def run( stream: bool = False, api_key: Optional[str] = None, api_base: Optional[str] = None, + output_type: Optional[Type[BaseModel]] = None, **additional_generation_kwargs, ) -> AsyncGenerator[Dict[str, Any], None]: """ @@ -915,6 +943,7 @@ async def run( stream: Whether to stream the response api_key: Optional API key override for the model provider api_base: Optional API base URL override for the model provider + output_type: Optional Pydantic BaseModel class for structured outputs. Overrides the constructor-level output_type if provided. 
**additional_generation_kwargs: Additional arguments passed to the model provider Returns: @@ -938,6 +967,13 @@ async def run( if (api_base is not None) or (self.api_base is not None): merged_kwargs["api_base"] = api_base if api_base is not None else self.api_base + # Resolve structured output type (run-level overrides constructor-level) + effective_output_type = output_type if output_type is not None else self.output_type + if effective_output_type is not None: + merged_kwargs["response_format"] = _pydantic_model_to_response_format( + effective_output_type + ) + old_items = self._process_input(messages) new_items = [] @@ -1063,6 +1099,24 @@ def contains_image_content(msgs): await self._on_run_end(loop_kwargs, old_items, new_items) + # Parse structured output from the final assistant message + if effective_output_type is not None and new_items: + for item in reversed(new_items): + if item.get("role") == "assistant": + text_content = _extract_text_from_output_item(item) + if text_content: + parsed = effective_output_type.model_validate_json(text_content) + yield { + "output_parsed": parsed, + "output": [], + "usage": Usage( + prompt_tokens=0, + completion_tokens=0, + total_tokens=0, + ), + } + break + async def predict_click( self, instruction: str, image_b64: Optional[str] = None ) -> Optional[Tuple[int, int]]: diff --git a/libs/python/agent/agent/loops/gemini.py b/libs/python/agent/agent/loops/gemini.py index 37c45dcad..d1c2886fb 100644 --- a/libs/python/agent/agent/loops/gemini.py +++ b/libs/python/agent/agent/loops/gemini.py @@ -709,6 +709,9 @@ async def predict_step( # Create client with CUA routing support (detects cua/ prefix automatically) client, model = _create_gemini_client(model, genai, kwargs) + # Pop response_format — structured outputs not yet supported for Gemini + kwargs.pop("response_format", None) + # Extract Gemini 3-specific parameters # thinking_level: Use types.ThinkingLevel enum values (e.g., "LOW", "HIGH", "MEDIUM", "MINIMAL") # 
class TestComputerAgentStructuredOutputs:
    """Structured-output support: constructor wiring and module-level helpers."""

    @patch("agent.agent.litellm")
    def test_agent_initialization_with_output_type(self, mock_litellm, disable_telemetry):
        """An ``output_type`` given to the constructor is kept on the agent."""
        from pydantic import BaseModel

        from agent import ComputerAgent

        class MyOutput(BaseModel):
            title: str
            score: int

        init_kwargs = {
            "model": "anthropic/claude-sonnet-4-5-20250929",
            "output_type": MyOutput,
        }
        agent = ComputerAgent(**init_kwargs)

        assert agent is not None
        assert agent.output_type is MyOutput

    @patch("agent.agent.litellm")
    def test_agent_initialization_without_output_type(self, mock_litellm, disable_telemetry):
        """Omitting ``output_type`` leaves the attribute as ``None``."""
        from agent import ComputerAgent

        default_agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
        assert default_agent.output_type is None

    def test_pydantic_model_to_response_format(self):
        """A model class becomes a ``json_schema`` response_format dict."""
        from pydantic import BaseModel

        from agent.agent import _pydantic_model_to_response_format

        class TestModel(BaseModel):
            name: str
            value: int

        response_format = _pydantic_model_to_response_format(TestModel)

        assert response_format["type"] == "json_schema"
        assert response_format["json_schema"]["name"] == "TestModel"
        assert "properties" in response_format["json_schema"]["schema"]
        # Only bind the properties mapping once its presence has been asserted.
        properties = response_format["json_schema"]["schema"]["properties"]
        assert "name" in properties
        assert "value" in properties

    def test_extract_text_from_output_item_dict(self):
        """Text is pulled from a dict-style ``output_text`` block."""
        from agent.agent import _extract_text_from_output_item

        payload = '{"title": "test", "score": 42}'
        message = {
            "role": "assistant",
            "type": "message",
            "content": [{"type": "output_text", "text": payload}],
        }

        assert _extract_text_from_output_item(message) == '{"title": "test", "score": 42}'

    def test_extract_text_from_output_item_no_text(self):
        """An item with empty content produces ``None``."""
        from agent.agent import _extract_text_from_output_item

        empty_message = {"role": "assistant", "type": "message", "content": []}
        assert _extract_text_from_output_item(empty_message) is None