diff --git a/src/agentevals/converter.py b/src/agentevals/converter.py index 4e4b3b8..7543369 100644 --- a/src/agentevals/converter.py +++ b/src/agentevals/converter.py @@ -23,6 +23,7 @@ extract_tool_call_from_span, extract_tool_result_from_span, extract_user_text_from_attrs, + find_adk_llm_spans_in, get_extractor, has_adk_descendant, is_adk_scope, @@ -127,15 +128,18 @@ def _find_adk_spans(trace: Trace, operation: str) -> list[Span]: def _convert_invoke_span(invoke_span: Span) -> Invocation: - call_llm_spans = _find_children_by_op(invoke_span, "call_llm") - if not call_llm_spans: - raise ValueError(f"invoke_agent span {invoke_span.span_id} has no child call_llm spans") + llm_spans = find_adk_llm_spans_in(invoke_span) + if not llm_spans: + raise ValueError( + f"invoke_agent span {invoke_span.span_id} has no converter-compatible ADK LLM descendants; " + "expected call_llm or ADK generate_content spans" + ) tool_spans = _find_children_by_op(invoke_span, "execute_tool") - user_content = _extract_user_content(call_llm_spans[0]) - final_response = _extract_final_response(call_llm_spans[-1]) - tool_uses, tool_responses = _extract_tool_trajectory(call_llm_spans, tool_spans) + user_content = _extract_user_content(llm_spans[0]) + final_response = _extract_final_response(llm_spans[-1]) + tool_uses, tool_responses = _extract_tool_trajectory(llm_spans, tool_spans) intermediate_data = IntermediateData( tool_uses=tool_uses, @@ -177,7 +181,7 @@ def _extract_user_content(first_call_llm: Span) -> genai_types.Content: ) llm_request_raw = first_call_llm.get_tag(ADK_LLM_REQUEST, "{}") llm_request = parse_json(llm_request_raw) - for content_dict in llm_request.get("contents", []): + for content_dict in llm_request.get("contents", llm_request.get("Contents", [])): if content_dict.get("role") == "user": return _content_from_dict(content_dict) raise ValueError(f"call_llm span {first_call_llm.span_id}: no user content found in llm_request") @@ -193,7 +197,7 @@ def _extract_final_response(last_call_llm: Span) -> genai_types.Content: ) llm_response_raw = last_call_llm.get_tag(ADK_LLM_RESPONSE, "{}") llm_response = parse_json(llm_response_raw) - content_dict = llm_response.get("content", {}) + content_dict = llm_response.get("content", llm_response.get("Content", {})) if not content_dict: raise ValueError(f"call_llm span {last_call_llm.span_id}: no content in llm_response") logger.warning( @@ -263,12 +267,12 @@ def _extract_function_calls_from_llm_response( llm_response_raw = call_llm.get_tag(ADK_LLM_RESPONSE, "{}") llm_response = parse_json(llm_response_raw) - content_dict = llm_response.get("content", {}) + content_dict = llm_response.get("content", llm_response.get("Content", {})) parts = content_dict.get("parts", []) calls = [] for part in parts: - fc_dict = part.get("function_call") + fc_dict = part.get("function_call", part.get("functionCall")) if fc_dict: calls.append( genai_types.FunctionCall( @@ -288,9 +292,9 @@ def _content_from_dict(content_dict: dict[str, Any]) -> genai_types.Content: parts: list[genai_types.Part] = [] for p in parts_dicts: if "text" in p: - parts.append(genai_types.Part(text=p["text"])) - elif "function_call" in p: - fc = p["function_call"] + parts.append(genai_types.Part(text=p.get("text"))) + elif "function_call" in p or "functionCall" in p: + fc = p.get("function_call", p.get("functionCall")) parts.append( genai_types.Part( function_call=genai_types.FunctionCall( @@ -300,8 +304,8 @@ def _content_from_dict(content_dict: dict[str, Any]) -> genai_types.Content: ) ) ) - elif "function_response" in p: - fr = p["function_response"] + elif "function_response" in p or "functionResponse" in p: + fr = p.get("function_response", p.get("functionResponse")) parts.append( genai_types.Part( function_response=genai_types.FunctionResponse( diff --git a/src/agentevals/extraction.py b/src/agentevals/extraction.py index e4536da..51b0348 100644 --- a/src/agentevals/extraction.py +++ b/src/agentevals/extraction.py @@ -69,14 +69,15 @@ def extract_user_text_from_attrs(attrs: dict[str, Any]) -> str | None: if llm_request_raw: llm_request = parse_json(llm_request_raw) if isinstance(llm_request, dict): - for content_dict in reversed(llm_request.get("contents", [])): + contents = llm_request.get("contents", llm_request.get("Contents", [])) + for content_dict in reversed(contents): if content_dict.get("role") != "user": continue parts = content_dict.get("parts", []) text_parts = [p for p in parts if "text" in p] if text_parts: return " ".join(p["text"] for p in text_parts) - for content_dict in llm_request.get("contents", []): + for content_dict in contents: if content_dict.get("role") == "user": parts = content_dict.get("parts", []) if parts: @@ -101,7 +102,7 @@ def extract_agent_response_from_attrs(attrs: dict[str, Any]) -> str | None: if llm_response_raw: llm_response = parse_json(llm_response_raw) if isinstance(llm_response, dict): - content_dict = llm_response.get("content", {}) + content_dict = llm_response.get("content", llm_response.get("Content", {})) if content_dict: parts_dicts = content_dict.get("parts", []) text_parts = [p for p in parts_dicts if "text" in p] @@ -392,6 +393,38 @@ def is_adk_scope(span: Span) -> bool: return False +def is_adk_generate_content_llm_span(span: Span) -> bool: + if not (span.operation_name.startswith("generate_content") or span.get_tag(OTEL_GENAI_OP) == "generate_content"): + return False + return bool(span.get_tag(ADK_LLM_REQUEST) or span.get_tag(ADK_LLM_RESPONSE)) + + +def is_adk_llm_span(span: Span) -> bool: + return span.operation_name.startswith("call_llm") or is_adk_generate_content_llm_span(span) + + +def find_adk_llm_spans_in(root: Span) -> list[Span]: + call_llm_spans: list[Span] = [] + generate_content_spans: list[Span] = [] + + def collect(span: Span) -> None: + if span.operation_name.startswith("call_llm"): + call_llm_spans.append(span) + elif is_adk_generate_content_llm_span(span): + generate_content_spans.append(span) + + _walk_descendants(root, collect) + call_llm_spans.sort(key=lambda s: s.start_time) + generate_content_spans.sort(key=lambda s: s.start_time) + return call_llm_spans or generate_content_spans + + +def _walk_descendants(span: Span, visit) -> None: + for child in span.children: + visit(child) + _walk_descendants(child, visit) + + def is_llm_span(span: Span) -> bool: return span.get_tag(OTEL_GENAI_REQUEST_MODEL) is not None @@ -477,10 +510,7 @@ def find_invocation_spans(self, trace: Trace) -> list[Span]: return matches def find_llm_spans_in(self, root: Span) -> list[Span]: - results: list[Span] = [] - self._walk(root, lambda s: s.operation_name.startswith("call_llm"), results) - results.sort(key=lambda s: s.start_time) - return results + return find_adk_llm_spans_in(root) def find_tool_spans_in(self, root: Span) -> list[Span]: results: list[Span] = [] @@ -493,7 +523,7 @@ def classify_span(self, span: Span) -> str | None: return None if span.operation_name.startswith("invoke_agent"): return "invocation" - if span.operation_name.startswith("call_llm"): + if is_adk_llm_span(span): return "llm" if span.operation_name.startswith("execute_tool"): return "tool" diff --git a/tests/test_converter.py b/tests/test_converter.py index 5e6a4e4..fdbb518 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -186,6 +186,108 @@ def test_convert_traces_multiple(self): assert len(results) == 2 assert all(r.trace_id == "t1" for r in results) + def test_convert_adk_generate_content_llm_spans(self): + invoke = Span( + trace_id="t-gc", + span_id="invoke1", + parent_span_id=None, + operation_name="invoke_agent query_agent", + start_time=1000, + duration=10000, + tags={"gen_ai.operation.name": "invoke_agent"}, + ) + llm_1 = Span( + trace_id="t-gc", + span_id="llm1", + parent_span_id="invoke1", + operation_name="generate_content mockllm-deterministic", + start_time=2000, + duration=1000, + tags={ + "gen_ai.operation.name": "generate_content", + "gcp.vertex.agent.llm_request": json.dumps( + {"Contents": [{"role": "user", "parts": [{"text": "inspect pods"}]}]} + ), + "gcp.vertex.agent.llm_response": json.dumps( + {"Content": {"role": "model", "parts": [{"text": "Calling tools."}]}} + ), + }, + ) + tool_1 = Span( + trace_id="t-gc", + span_id="tool1", + parent_span_id="invoke1", + operation_name="execute_tool list_pods", + start_time=3000, + duration=500, + tags={ + "gen_ai.tool.name": "list_pods", + "gen_ai.tool.call.id": "call_1", + "gcp.vertex.agent.tool_call_args": json.dumps({"namespace": "default"}), + "gcp.vertex.agent.tool_response": json.dumps({"pods": []}), + }, + ) + llm_2 = Span( + trace_id="t-gc", + span_id="llm2", + parent_span_id="invoke1", + operation_name="generate_content mockllm-deterministic", + start_time=4000, + duration=1000, + tags={ + "gen_ai.operation.name": "generate_content", + "gcp.vertex.agent.llm_request": json.dumps({"contents": []}), + "gcp.vertex.agent.llm_response": json.dumps( + { + "Content": { + "role": "model", + "parts": [ + { + "functionCall": { + "name": "summarize_pods", + "args": {"namespace": "default"}, + "id": "call_final", + } + } + ], + } + } + ), + }, + ) + tool_2 = Span( + trace_id="t-gc", + span_id="tool2", + parent_span_id="invoke1", + operation_name="execute_tool get_events", + start_time=5000, + duration=500, + tags={ + "gen_ai.tool.name": "get_events", + "gen_ai.tool.call.id": "call_2", + "gcp.vertex.agent.tool_call_args": json.dumps({"namespace": "default"}), + "gcp.vertex.agent.tool_response": json.dumps({"events": []}), + }, + ) + invoke.children.extend([llm_1, tool_1, llm_2, tool_2]) + trace = Trace( + trace_id="t-gc", + root_spans=[invoke], + all_spans=[invoke, llm_1, tool_1, llm_2, tool_2], + ) + + result = convert_trace(trace) + + assert result.warnings == [] + assert len(result.invocations) == 1 + inv = result.invocations[0] + assert inv.user_content.parts[0].text == "inspect pods" + final_call = inv.final_response.parts[0].function_call + assert final_call.name == "summarize_pods" + assert final_call.args == {"namespace": "default"} + assert final_call.id == "call_final" + assert [t.name for t in inv.intermediate_data.tool_uses] == ["list_pods", "get_events"] + def test_no_invoke_agent_warns(self): trace = Trace( trace_id="empty", @@ -207,6 +309,35 @@ def test_no_invoke_agent_warns(self): assert len(result.warnings) == 1 assert "no invoke_agent" in result.warnings[0] + def test_no_llm_descendants_warns_with_compatible_shapes(self): + invoke = Span( + trace_id="no-llm", + span_id="invoke-no-llm", + parent_span_id=None, + operation_name="invoke_agent test_agent", + start_time=1000, + duration=1000, + tags={ + "otel.scope.name": "gcp.vertex.agent", + "gen_ai.operation.name": "invoke_agent", + }, + ) + trace = Trace( + trace_id="no-llm", + root_spans=[invoke], + all_spans=[invoke], + ) + + result = convert_trace(trace) + + assert result.invocations == [] + assert len(result.warnings) == 1 + warning = result.warnings[0] + assert "invoke-no-llm" in warning + assert "no converter-compatible ADK LLM descendants" in warning + assert "call_llm" in warning + assert "ADK generate_content" in warning + def test_no_tool_spans_fallback_to_llm_response(self): """When no execute_tool spans exist, function_calls should be extracted from call_llm responses instead.""" diff --git a/tests/test_extraction.py b/tests/test_extraction.py index 8686e55..52ec812 100644 --- a/tests/test_extraction.py +++ b/tests/test_extraction.py @@ -107,6 +107,18 @@ def test_adk_llm_request_prefers_last_user(self): } assert extract_user_text_from_attrs(attrs) == "Second" + def test_adk_llm_request_outer_contents_pascalcase(self): + attrs = { + ADK_LLM_REQUEST: json.dumps( + { + "Contents": [ + {"role": "user", "parts": [{"text": "Outer PascalCase only"}]}, + ] + } + ) + } + assert extract_user_text_from_attrs(attrs) == "Outer PascalCase only" + def test_genai_content_based(self): attrs = { OTEL_GENAI_INPUT_MESSAGES: json.dumps( @@ -170,6 +182,10 @@ def test_adk_llm_response(self): attrs = {ADK_LLM_RESPONSE: json.dumps({"content": {"parts": [{"text": "ADK response"}]}})} assert extract_agent_response_from_attrs(attrs) == "ADK response" + def test_adk_llm_response_outer_content_pascalcase(self): + attrs = {ADK_LLM_RESPONSE: json.dumps({"Content": {"parts": [{"text": "Outer Content only"}]}})} + assert extract_agent_response_from_attrs(attrs) == "Outer Content only" + def test_genai_content_based(self): attrs = { OTEL_GENAI_OUTPUT_MESSAGES: json.dumps( @@ -519,6 +535,39 @@ def test_find_llm_spans_in(self): ext = AdkExtractor() assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"] + def test_find_llm_spans_in_falls_back_to_adk_generate_content(self): + child_llm = _span( + op="generate_content mockllm-deterministic", + tags={ADK_LLM_REQUEST: "{}"}, + span_id="llm1", + ) + child_tool = _span(op="execute_tool search", span_id="tool1") + root = _span(op="invoke_agent a", children=[child_llm, child_tool]) + ext = AdkExtractor() + assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"] + + def test_find_llm_spans_in_ignores_provider_generate_content_without_adk_payload(self): + child_llm = _span( + op="generate_content gpt-4", + tags={OTEL_GENAI_REQUEST_MODEL: "gpt-4"}, + span_id="llm1", + ) + root = _span(op="invoke_agent a", children=[child_llm]) + ext = AdkExtractor() + assert ext.find_llm_spans_in(root) == [] + + def test_find_llm_spans_in_prefers_call_llm_over_generate_content(self): + call_llm = _span(op="call_llm gemini", span_id="llm1", start_time=20) + generate_content = _span( + op="generate_content gemini", + tags={ADK_LLM_REQUEST: "{}"}, + span_id="llm2", + start_time=10, + ) + root = _span(op="invoke_agent a", children=[generate_content, call_llm]) + ext = AdkExtractor() + assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"] + def test_find_tool_spans_in(self): child_llm = _span(op="call_llm gemini", span_id="llm1") child_tool = _span(op="execute_tool search", span_id="tool1") @@ -530,6 +579,7 @@ def test_classify_span(self): ext = AdkExtractor() assert ext.classify_span(_span(op="invoke_agent a", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "invocation" assert ext.classify_span(_span(op="call_llm", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "llm" + assert ext.classify_span(_span(op="generate_content", tags={ADK_LLM_REQUEST: "{}"})) == "llm" assert ext.classify_span(_span(op="execute_tool x", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "tool" assert ext.classify_span(_span(op="random")) is None