From 1397087b474e78597a45c22cfdd510cf85cc0acf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Reme=C5=A1?= Date: Mon, 20 Apr 2026 14:01:37 +0200 Subject: [PATCH 1/3] feat: add observability toolset evals --- evals/claude-code/eval.yaml | 10 ++++++ evals/gemini-agent/eval.yaml | 10 ++++++ evals/openai-agent/eval.yaml | 10 ++++++ .../alerts/alert-investigation.yaml | 29 +++++++++++++++++ .../observability/alerts/filtered-alerts.yaml | 27 ++++++++++++++++ .../observability/alerts/get-alerts.yaml | 22 +++++++++++++ .../observability/alerts/get-silences.yaml | 22 +++++++++++++ .../observability/labels/get-series.yaml | 26 ++++++++++++++++ .../observability/labels/label-names.yaml | 26 ++++++++++++++++ .../observability/labels/label-values.yaml | 22 +++++++++++++ .../labels/series-by-namespace.yaml | 28 +++++++++++++++++ .../observability/metrics/list-metrics.yaml | 22 +++++++++++++ .../metrics/list-node-metrics.yaml | 22 +++++++++++++ .../queries/backend-reachability.yaml | 23 ++++++++++++++ .../observability/queries/cpu-usage.yaml | 25 +++++++++++++++ .../queries/crashlooping-pods.yaml | 22 +++++++++++++ .../queries/diagnose-cluster-health.yaml | 28 +++++++++++++++++ .../queries/high-cardinality-rejection.yaml | 24 ++++++++++++++ .../observability/queries/memory-usage.yaml | 25 +++++++++++++++ .../queries/namespace-pod-count.yaml | 27 ++++++++++++++++ .../queries/namespace-resource-usage.yaml | 31 +++++++++++++++++++ .../queries/network-traffic.yaml | 25 +++++++++++++++ .../queries/nonexistent-metric.yaml | 23 ++++++++++++++ .../queries/nonexistent-namespace.yaml | 23 ++++++++++++++ .../observability/queries/pending-pods.yaml | 23 ++++++++++++++ .../observability/queries/pods-created.yaml | 22 +++++++++++++ .../queries/prometheus-head-series.yaml | 22 +++++++++++++ .../queries/prometheus-requests.yaml | 22 +++++++++++++ .../queries/prometheus-wal-size.yaml | 22 +++++++++++++ .../queries/time-range-query.yaml | 26 ++++++++++++++++ .../queries/visualize-cpu-usage.yaml | 23 ++++++++++++++ 31 files changed, 712 insertions(+) create mode 100644 evals/tasks/observability/alerts/alert-investigation.yaml create mode 100644 evals/tasks/observability/alerts/filtered-alerts.yaml create mode 100644 evals/tasks/observability/alerts/get-alerts.yaml create mode 100644 evals/tasks/observability/alerts/get-silences.yaml create mode 100644 evals/tasks/observability/labels/get-series.yaml create mode 100644 evals/tasks/observability/labels/label-names.yaml create mode 100644 evals/tasks/observability/labels/label-values.yaml create mode 100644 evals/tasks/observability/labels/series-by-namespace.yaml create mode 100644 evals/tasks/observability/metrics/list-metrics.yaml create mode 100644 evals/tasks/observability/metrics/list-node-metrics.yaml create mode 100644 evals/tasks/observability/queries/backend-reachability.yaml create mode 100644 evals/tasks/observability/queries/cpu-usage.yaml create mode 100644 evals/tasks/observability/queries/crashlooping-pods.yaml create mode 100644 evals/tasks/observability/queries/diagnose-cluster-health.yaml create mode 100644 evals/tasks/observability/queries/high-cardinality-rejection.yaml create mode 100644 evals/tasks/observability/queries/memory-usage.yaml create mode 100644 evals/tasks/observability/queries/namespace-pod-count.yaml create mode 100644 evals/tasks/observability/queries/namespace-resource-usage.yaml create mode 100644 evals/tasks/observability/queries/network-traffic.yaml create mode 100644 
evals/tasks/observability/queries/nonexistent-metric.yaml create mode 100644 evals/tasks/observability/queries/nonexistent-namespace.yaml create mode 100644 evals/tasks/observability/queries/pending-pods.yaml create mode 100644 evals/tasks/observability/queries/pods-created.yaml create mode 100644 evals/tasks/observability/queries/prometheus-head-series.yaml create mode 100644 evals/tasks/observability/queries/prometheus-requests.yaml create mode 100644 evals/tasks/observability/queries/prometheus-wal-size.yaml create mode 100644 evals/tasks/observability/queries/time-range-query.yaml create mode 100644 evals/tasks/observability/queries/visualize-cpu-usage.yaml diff --git a/evals/claude-code/eval.yaml b/evals/claude-code/eval.yaml index 4a46b8131..bbf5c9e0d 100644 --- a/evals/claude-code/eval.yaml +++ b/evals/claude-code/eval.yaml @@ -72,3 +72,13 @@ config: toolPattern: ".*" minToolCalls: 1 maxToolCalls: 20 + # Observability tasks + - glob: ../tasks/observability/*/*.yaml + labelSelector: + suite: observability + assertions: + toolsUsed: + - server: kubernetes + toolPattern: ".*" + minToolCalls: 1 + maxToolCalls: 20 diff --git a/evals/gemini-agent/eval.yaml b/evals/gemini-agent/eval.yaml index d7ee6eb82..3a1a66132 100644 --- a/evals/gemini-agent/eval.yaml +++ b/evals/gemini-agent/eval.yaml @@ -17,3 +17,13 @@ config: toolsUsed: - server: kubernetes toolPattern: ".*" + # Observability tasks + - glob: ../tasks/observability/*/*.yaml + labelSelector: + suite: observability + assertions: + toolsUsed: + - server: kubernetes + toolPattern: ".*" + minToolCalls: 1 + maxToolCalls: 20 diff --git a/evals/openai-agent/eval.yaml b/evals/openai-agent/eval.yaml index d3a71773f..1dedd4787 100644 --- a/evals/openai-agent/eval.yaml +++ b/evals/openai-agent/eval.yaml @@ -72,3 +72,13 @@ config: toolPattern: ".*" minToolCalls: 1 maxToolCalls: 20 + # Observability tasks + - glob: ../tasks/observability/*/*.yaml + labelSelector: + suite: observability + assertions: + toolsUsed: + - server: kubernetes + toolPattern: ".*" + minToolCalls: 1 + maxToolCalls: 20 diff --git a/evals/tasks/observability/alerts/alert-investigation.yaml b/evals/tasks/observability/alerts/alert-investigation.yaml new file mode 100644 index 000000000..0e6b37819 --- /dev/null +++ b/evals/tasks/observability/alerts/alert-investigation.yaml @@ -0,0 +1,29 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "alert-investigation" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: alerts + toolType: multi-step + description: | + Tests if the agent can perform multi-step alert triage: first retrieving + alerts from Alertmanager, then investigating related metrics for the most + critical alert using Prometheus queries. +spec: + verify: + - llmJudge: + contains: "alert" + reason: "Verify the agent retrieved alerts from Alertmanager" + - llmJudge: + contains: "metric" + reason: "Verify the agent investigated a related Prometheus metric" + prompt: + inline: | + Check if there are any firing alerts. If there are, investigate + the related metrics for the most critical alert and summarize + what's happening. If there are no firing alerts, check cluster + health metrics instead. 
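For reference, one plausible investigation path for the triage task above is Prometheus's built-in ALERTS metric. This is a sketch only, not the prescribed tool flow; the follow-up query depends on whichever alert is actually firing:

    # currently firing alerts, grouped by name and severity
    sum by (alertname, severity) (ALERTS{alertstate="firing"})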
diff --git a/evals/tasks/observability/alerts/filtered-alerts.yaml b/evals/tasks/observability/alerts/filtered-alerts.yaml new file mode 100644 index 000000000..548c0363b --- /dev/null +++ b/evals/tasks/observability/alerts/filtered-alerts.yaml @@ -0,0 +1,27 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "filtered-alerts" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: alerts + toolType: alertmanager + description: | + Tests if the agent uses the get_alerts tool with filter parameters + to retrieve only active alerts matching a specific severity. The agent + should pass appropriate filter arguments rather than fetching all alerts + and filtering client-side. +spec: + verify: + - llmJudge: + contains: "alert" + reason: "Verify the agent retrieved and reported on alerts" + - llmJudge: + contains: "critical" + reason: "Verify the agent filtered or addressed the critical severity" + prompt: + inline: | + Are there any firing alerts with severity=critical? Show only active alerts. diff --git a/evals/tasks/observability/alerts/get-alerts.yaml b/evals/tasks/observability/alerts/get-alerts.yaml new file mode 100644 index 000000000..fa758d473 --- /dev/null +++ b/evals/tasks/observability/alerts/get-alerts.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "get-alerts" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: alerts + toolType: alertmanager + description: | + Tests if the agent can discover and use the get_alerts tool to retrieve + currently firing alerts from Alertmanager. +spec: + verify: + - llmJudge: + contains: "alerts" + reason: "Verify the output reports on the alert state, whether alerts are firing or not" + prompt: + inline: | + Check the cluster for any firing alerts and report what you find. diff --git a/evals/tasks/observability/alerts/get-silences.yaml b/evals/tasks/observability/alerts/get-silences.yaml new file mode 100644 index 000000000..726e60c31 --- /dev/null +++ b/evals/tasks/observability/alerts/get-silences.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "get-silences" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: alerts + toolType: alertmanager + description: | + Tests if the agent can discover and use the get_silences tool to retrieve + active silences from Alertmanager. +spec: + verify: + - llmJudge: + contains: "silence" + reason: "Verify the output discusses alert silences, including when no active silences exist" + prompt: + inline: | + Are there any active silences in Alertmanager? diff --git a/evals/tasks/observability/labels/get-series.yaml b/evals/tasks/observability/labels/get-series.yaml new file mode 100644 index 000000000..317e85ae6 --- /dev/null +++ b/evals/tasks/observability/labels/get-series.yaml @@ -0,0 +1,26 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "get-series-cardinality" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: labels + toolType: exploration + description: | + Tests if the agent can use the get_series tool to check cardinality for a metric. + The agent should first verify the metric exists via list_metrics, then use + get_series to retrieve matching time series and report the count. 
+spec: + verify: + - llmJudge: + contains: "series" + reason: "Verify the output reports time series information" + - llmJudge: + contains: "kube_pod_info" + reason: "Verify the agent queried the correct metric" + prompt: + inline: | + How many time series exist for the kube_pod_info metric? Show the cardinality. diff --git a/evals/tasks/observability/labels/label-names.yaml b/evals/tasks/observability/labels/label-names.yaml new file mode 100644 index 000000000..8d306d528 --- /dev/null +++ b/evals/tasks/observability/labels/label-names.yaml @@ -0,0 +1,26 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "label-names" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: labels + toolType: exploration + description: | + Tests if the agent follows the correct workflow: first calling list_metrics to + verify kube_pod_info exists, then calling get_label_names to discover available + labels for that metric. +spec: + verify: + - llmJudge: + contains: "namespace" + reason: "Verify the output includes the namespace label which is a standard Kubernetes label" + - llmJudge: + contains: "pod" + reason: "Verify the output includes the pod label" + prompt: + inline: | + What labels are available for the kube_pod_info metric? diff --git a/evals/tasks/observability/labels/label-values.yaml b/evals/tasks/observability/labels/label-values.yaml new file mode 100644 index 000000000..2376f5a86 --- /dev/null +++ b/evals/tasks/observability/labels/label-values.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "label-values" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: labels + toolType: exploration + description: | + Tests the full discovery workflow: list_metrics to verify the metric, then + get_label_values to retrieve unique namespace values for kube_pod_info. +spec: + verify: + - llmJudge: + contains: "namespace" + reason: "Verify the output lists actual namespace values from the cluster" + prompt: + inline: | + What are the unique namespace values for the kube_pod_info metric? diff --git a/evals/tasks/observability/labels/series-by-namespace.yaml b/evals/tasks/observability/labels/series-by-namespace.yaml new file mode 100644 index 000000000..c7984440c --- /dev/null +++ b/evals/tasks/observability/labels/series-by-namespace.yaml @@ -0,0 +1,28 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "series-by-namespace" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: labels + toolType: exploration + description: | + Tests if the agent can use the get_series tool with a label selector + to find time series scoped to a specific namespace. The agent should + first verify the metric exists, then use get_series with a namespace + matcher to report the cardinality within that scope. +spec: + verify: + - llmJudge: + contains: "series" + reason: "Verify the agent reported series information" + - llmJudge: + contains: "monitoring" + reason: "Verify the agent scoped to the monitoring namespace" + prompt: + inline: | + How many time series exist for container_cpu_usage_seconds_total + in the monitoring namespace? 
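The two get_series tasks above expect cardinality answers. Equivalent PromQL sketches, assuming kube-state-metrics and cAdvisor metrics are being scraped:

    # cluster-wide cardinality of kube_pod_info
    count(kube_pod_info)

    # series count for one metric scoped to a single namespace
    count(container_cpu_usage_seconds_total{namespace="monitoring"})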
diff --git a/evals/tasks/observability/metrics/list-metrics.yaml b/evals/tasks/observability/metrics/list-metrics.yaml new file mode 100644 index 000000000..6a8fe405a --- /dev/null +++ b/evals/tasks/observability/metrics/list-metrics.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "list-kube-metrics" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: metrics + toolType: discovery + description: | + Tests if the agent can discover the list_metrics tool to find Kubernetes-related + metrics. The agent should use the name_regex parameter to filter for kube metrics. +spec: + verify: + - llmJudge: + contains: "kube" + reason: "Verify the output lists Kubernetes metrics matching the kube prefix" + prompt: + inline: | + List all available Prometheus metrics that contain 'kube' in the name. diff --git a/evals/tasks/observability/metrics/list-node-metrics.yaml b/evals/tasks/observability/metrics/list-node-metrics.yaml new file mode 100644 index 000000000..9c87f7b43 --- /dev/null +++ b/evals/tasks/observability/metrics/list-node-metrics.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "list-node-metrics" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: metrics + toolType: discovery + description: | + Tests if the agent can discover node-related metrics using the list_metrics tool + with a regex filter for node metrics. +spec: + verify: + - llmJudge: + contains: "node" + reason: "Verify the output lists node-related metrics" + prompt: + inline: | + What node-related metrics are available in Prometheus? diff --git a/evals/tasks/observability/queries/backend-reachability.yaml b/evals/tasks/observability/queries/backend-reachability.yaml new file mode 100644 index 000000000..33b78f190 --- /dev/null +++ b/evals/tasks/observability/queries/backend-reachability.yaml @@ -0,0 +1,23 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "backend-reachability" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: smoke-test + description: | + Smoke test to verify the agent can reach the metrics backend and get a + valid response. The agent should query the up metric and confirm the + backend is reachable before any complex eval tasks run. +spec: + verify: + - llmJudge: + contains: "up" + reason: "Verify the agent confirmed the metrics backend is reachable and targets are up" + prompt: + inline: | + Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status. diff --git a/evals/tasks/observability/queries/cpu-usage.yaml b/evals/tasks/observability/queries/cpu-usage.yaml new file mode 100644 index 000000000..7d8c545ac --- /dev/null +++ b/evals/tasks/observability/queries/cpu-usage.yaml @@ -0,0 +1,25 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "cpu-usage" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can find the appropriate CPU usage metric and query it to + determine which pods are consuming the most CPU resources. +spec: + verify: + - llmJudge: + contains: "container_cpu_usage_seconds_total" + reason: "Verify the agent queried the correct CPU metric" + - llmJudge: + contains: "pod" + reason: "Verify the response identifies specific pods" + prompt: + inline: | + Which pods are using the most CPU? 
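A query that would satisfy both cpu-usage judges, sketched for a standard cAdvisor setup; the container!="" matcher, which drops the pod-level cgroup aggregate series, is an assumption about the environment rather than a task requirement:

    topk(5, sum by (pod) (rate(container_cpu_usage_seconds_total{container!=""}[5m])))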
diff --git a/evals/tasks/observability/queries/crashlooping-pods.yaml b/evals/tasks/observability/queries/crashlooping-pods.yaml new file mode 100644 index 000000000..fcad5ef03 --- /dev/null +++ b/evals/tasks/observability/queries/crashlooping-pods.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "crashlooping-pods" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: range-query + description: | + Tests if the agent can identify crashlooping pods by discovering and querying + the kube_pod_container_status_restarts_total metric over a time range. +spec: + verify: + - llmJudge: + contains: "kube_pod_container_status_restarts_total" + reason: "Verify the agent used the correct metric for container restart counts" + prompt: + inline: | + Which pods were crashlooping in the last 5 minutes? diff --git a/evals/tasks/observability/queries/diagnose-cluster-health.yaml b/evals/tasks/observability/queries/diagnose-cluster-health.yaml new file mode 100644 index 000000000..19ea059e1 --- /dev/null +++ b/evals/tasks/observability/queries/diagnose-cluster-health.yaml @@ -0,0 +1,28 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "diagnose-cluster-health" + difficulty: hard + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: diagnostic + description: | + Tests the agent's ability to handle an ambiguous diagnostic prompt. + The agent must autonomously decide which tools and metrics to check, + ideally covering alerts, node status, and pod health without being + told exactly what to look for. +spec: + verify: + - llmJudge: + contains: "cluster" + reason: "Verify the response provides an overall cluster health assessment" + - llmJudge: + contains: "node" + reason: "Verify the agent checked node-level health indicators" + prompt: + inline: | + Is the cluster healthy? Give me an overview of any issues. + Check alerts, node status, and pod health. diff --git a/evals/tasks/observability/queries/high-cardinality-rejection.yaml b/evals/tasks/observability/queries/high-cardinality-rejection.yaml new file mode 100644 index 000000000..f8e6d4cd5 --- /dev/null +++ b/evals/tasks/observability/queries/high-cardinality-rejection.yaml @@ -0,0 +1,24 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "high-cardinality-rejection" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: error-handling + description: | + Tests agent behavior when a query is rejected by obs-mcp guardrails + due to high cardinality. The agent should explain the guardrail + rejection and suggest a scoped alternative rather than giving up. +spec: + verify: + - llmJudge: + contains: "namespace" + reason: "Verify the agent suggests scoping the query by namespace or another dimension" + prompt: + inline: | + Show me the CPU, memory, network, and disk usage for every single container + across all namespaces over the last 24 hours with 1-second resolution. 
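PromQL sketches for the crashlooping-pods and high-cardinality-rejection tasks; the scoped query is one example of the alternative the guardrail-rejection judge looks for, not the only acceptable answer:

    # crashlooping pods: any restarts in the last 5 minutes
    increase(kube_pod_container_status_restarts_total[5m]) > 0

    # a namespace-scoped alternative to offer after a high-cardinality rejection
    sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))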
diff --git a/evals/tasks/observability/queries/memory-usage.yaml b/evals/tasks/observability/queries/memory-usage.yaml new file mode 100644 index 000000000..ffd1d113e --- /dev/null +++ b/evals/tasks/observability/queries/memory-usage.yaml @@ -0,0 +1,25 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "memory-usage" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can find the appropriate memory usage metric and query it + to determine which pods are consuming the most memory resources. +spec: + verify: + - llmJudge: + contains: "container_memory_working_set_bytes" + reason: "Verify the agent queried the correct memory metric" + - llmJudge: + contains: "pod" + reason: "Verify the response identifies specific pods" + prompt: + inline: | + Which pods are using the most memory? Show me the top 5. diff --git a/evals/tasks/observability/queries/namespace-pod-count.yaml b/evals/tasks/observability/queries/namespace-pod-count.yaml new file mode 100644 index 000000000..642fd905d --- /dev/null +++ b/evals/tasks/observability/queries/namespace-pod-count.yaml @@ -0,0 +1,27 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "namespace-pod-count" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: multi-step + description: | + Tests multi-step reasoning: the agent must discover a suitable metric + via list_metrics, explore label values to find namespaces, then query + to determine which namespaces have the most pods. Requires chaining + discovery, label exploration, and a query. +spec: + verify: + - llmJudge: + contains: "namespace" + reason: "Verify the response lists namespaces" + - llmJudge: + contains: "pod" + reason: "Verify the response includes pod counts" + prompt: + inline: | + Which namespaces have the most running pods? Show me the top 5. diff --git a/evals/tasks/observability/queries/namespace-resource-usage.yaml b/evals/tasks/observability/queries/namespace-resource-usage.yaml new file mode 100644 index 000000000..ad776a282 --- /dev/null +++ b/evals/tasks/observability/queries/namespace-resource-usage.yaml @@ -0,0 +1,31 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "namespace-resource-usage" + difficulty: hard + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: multi-step + description: | + Tests multi-step reasoning: the agent must discover CPU and memory metrics, + then query them with namespace-level aggregation to identify the top + resource-consuming namespaces. Requires chaining list_metrics, label + exploration, and multiple instant queries. +spec: + verify: + - llmJudge: + contains: "namespace" + reason: "Verify the response breaks down resource usage by namespace" + - llmJudge: + contains: "cpu" + reason: "Verify the response includes CPU usage data" + - llmJudge: + contains: "memory" + reason: "Verify the response includes memory usage data" + prompt: + inline: | + Which namespace is consuming the most CPU and memory? + Show me the top namespace for each. 
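Aggregation sketches for the three tasks above; the metric names match the judges, while the exact PromQL shape is left to the agent:

    # top 5 memory consumers by pod
    topk(5, sum by (pod) (container_memory_working_set_bytes{container!=""}))

    # running pods per namespace, top 5
    topk(5, sum by (namespace) (kube_pod_status_phase{phase="Running"}))

    # heaviest namespace by CPU
    topk(1, sum by (namespace) (rate(container_cpu_usage_seconds_total{container!=""}[5m])))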
diff --git a/evals/tasks/observability/queries/network-traffic.yaml b/evals/tasks/observability/queries/network-traffic.yaml new file mode 100644 index 000000000..23f147da0 --- /dev/null +++ b/evals/tasks/observability/queries/network-traffic.yaml @@ -0,0 +1,25 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "network-traffic" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can discover network-related metrics and query them to find + which pods are receiving the most network traffic. +spec: + verify: + - llmJudge: + contains: "container_network_receive_bytes_total" + reason: "Verify the agent queried the correct network metric" + - llmJudge: + contains: "pod" + reason: "Verify the response identifies specific pods" + prompt: + inline: | + Which pods are receiving the most network traffic? diff --git a/evals/tasks/observability/queries/nonexistent-metric.yaml b/evals/tasks/observability/queries/nonexistent-metric.yaml new file mode 100644 index 000000000..b3890eeb3 --- /dev/null +++ b/evals/tasks/observability/queries/nonexistent-metric.yaml @@ -0,0 +1,23 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "nonexistent-metric" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: error-handling + description: | + Tests agent recovery when querying a metric that does not exist. + The agent should discover that the metric is missing via list_metrics + and inform the user rather than fabricating results. +spec: + verify: + - llmJudge: + contains: "not found" + reason: "Verify the agent communicates that the metric does not exist or was not found" + prompt: + inline: | + What is the current value of the metric fake_nonexistent_metric_total? diff --git a/evals/tasks/observability/queries/nonexistent-namespace.yaml b/evals/tasks/observability/queries/nonexistent-namespace.yaml new file mode 100644 index 000000000..b9e440814 --- /dev/null +++ b/evals/tasks/observability/queries/nonexistent-namespace.yaml @@ -0,0 +1,23 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "nonexistent-namespace" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: error-handling + description: | + Tests agent behavior when querying for resources in a namespace that + does not exist. The agent should query and report empty results + gracefully rather than hallucinating data. +spec: + verify: + - llmJudge: + contains: "no data" + reason: "Verify the agent reports no data, no results, or no pods found in the nonexistent namespace" + prompt: + inline: | + Show me the memory usage for all pods in the namespace called totally-fake-namespace-12345. diff --git a/evals/tasks/observability/queries/pending-pods.yaml b/evals/tasks/observability/queries/pending-pods.yaml new file mode 100644 index 000000000..7f06f5d9e --- /dev/null +++ b/evals/tasks/observability/queries/pending-pods.yaml @@ -0,0 +1,23 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "pending-pods" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can identify pods stuck in pending state by first discovering + the kube_pod_status_phase metric and then running an instant query to find + pods with phase=Pending. 
+spec: + verify: + - llmJudge: + contains: "kube_pod_status_phase" + reason: "Verify the agent used the correct metric for pod phase status" + prompt: + inline: | + Which pods are stuck in pending state? diff --git a/evals/tasks/observability/queries/pods-created.yaml b/evals/tasks/observability/queries/pods-created.yaml new file mode 100644 index 000000000..a8cee3114 --- /dev/null +++ b/evals/tasks/observability/queries/pods-created.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "pods-created" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: range-query + description: | + Tests if the agent can use a range query to find recently created pods by + discovering the kube_pod_created metric and querying it over a 5-minute window. +spec: + verify: + - llmJudge: + contains: "kube_pod_created" + reason: "Verify the agent used the correct metric for pod creation timestamps" + prompt: + inline: | + How many pods were created in the last 5 minutes? diff --git a/evals/tasks/observability/queries/prometheus-head-series.yaml b/evals/tasks/observability/queries/prometheus-head-series.yaml new file mode 100644 index 000000000..ee89d5953 --- /dev/null +++ b/evals/tasks/observability/queries/prometheus-head-series.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "prometheus-head-series" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can query Prometheus internal metrics to report the current + number of head series using prometheus_tsdb_head_series. +spec: + verify: + - llmJudge: + contains: "prometheus_tsdb_head_series" + reason: "Verify the agent used the correct Prometheus TSDB metric" + prompt: + inline: | + How many head series does Prometheus have? diff --git a/evals/tasks/observability/queries/prometheus-requests.yaml b/evals/tasks/observability/queries/prometheus-requests.yaml new file mode 100644 index 000000000..9560170ae --- /dev/null +++ b/evals/tasks/observability/queries/prometheus-requests.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "prometheus-requests" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can calculate the request rate to Prometheus by discovering + and querying the prometheus_http_requests_total metric. +spec: + verify: + - llmJudge: + contains: "prometheus_http_requests_total" + reason: "Verify the agent used the correct HTTP requests metric" + prompt: + inline: | + How many requests per second are being made to Prometheus? diff --git a/evals/tasks/observability/queries/prometheus-wal-size.yaml b/evals/tasks/observability/queries/prometheus-wal-size.yaml new file mode 100644 index 000000000..af6d18293 --- /dev/null +++ b/evals/tasks/observability/queries/prometheus-wal-size.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "prometheus-wal-size" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can query the current Prometheus WAL storage size using + the prometheus_tsdb_wal_storage_size_bytes metric. 
+spec: + verify: + - llmJudge: + contains: "prometheus_tsdb_wal_storage_size_bytes" + reason: "Verify the agent used the correct WAL storage metric" + prompt: + inline: | + What is the current storage size of the Prometheus WAL? diff --git a/evals/tasks/observability/queries/time-range-query.yaml b/evals/tasks/observability/queries/time-range-query.yaml new file mode 100644 index 000000000..521423c8c --- /dev/null +++ b/evals/tasks/observability/queries/time-range-query.yaml @@ -0,0 +1,26 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "time-range-query" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: multi-step + description: | + Tests whether the agent correctly uses execute_range_query with + appropriate start/end/step parameters when asked for data over + a specific time window. +spec: + verify: + - llmJudge: + contains: "30 minutes" + reason: "Verify the agent honoured the requested 30-minute time window" + - llmJudge: + contains: "cpu" + reason: "Verify the response includes CPU usage data" + prompt: + inline: | + Show me the CPU usage trend for pods in the default namespace over the last 30 minutes. diff --git a/evals/tasks/observability/queries/visualize-cpu-usage.yaml b/evals/tasks/observability/queries/visualize-cpu-usage.yaml new file mode 100644 index 000000000..0f6af1eb6 --- /dev/null +++ b/evals/tasks/observability/queries/visualize-cpu-usage.yaml @@ -0,0 +1,23 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "visualize-cpu-usage" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: visualization + description: | + Tests if the agent uses the show_timeseries tool to visualize CPU usage + as a chart. The agent should discover the metric, then use show_timeseries + to render a time-series visualization. +spec: + verify: + - llmJudge: + contains: "cpu" + reason: "Verify the agent queried and visualized CPU usage data" + prompt: + inline: | + Visualize the CPU usage for pods in the default namespace over the last 30 minutes. 
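For the two range-query tasks closing this patch, the expression itself is ordinary PromQL evaluated repeatedly over the window; the start/end/step parameter names are an assumption inferred from the execute_range_query tool name, not confirmed by the tasks:

    # evaluated with, e.g., start=now-30m, end=now, step=60s
    sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))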
From 44ce079fe786b0d6a0dabe4b024297e435066993 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Reme=C5=A1?= Date: Wed, 22 Apr 2026 12:25:10 +0200 Subject: [PATCH 2/3] attempt to improve the observability evals --- .../observability/alerts/alert-investigation.yaml | 8 ++++---- evals/tasks/observability/alerts/filtered-alerts.yaml | 8 ++++---- evals/tasks/observability/alerts/get-alerts.yaml | 4 ++-- evals/tasks/observability/labels/get-series.yaml | 8 ++++---- evals/tasks/observability/labels/label-names.yaml | 4 ++-- evals/tasks/observability/labels/label-values.yaml | 4 ++-- .../observability/labels/series-by-namespace.yaml | 8 ++++---- evals/tasks/observability/metrics/list-metrics.yaml | 4 ++-- .../observability/metrics/list-node-metrics.yaml | 4 ++-- .../observability/queries/backend-reachability.yaml | 4 ++-- evals/tasks/observability/queries/cpu-usage.yaml | 4 ++-- .../observability/queries/crashlooping-pods.yaml | 2 +- .../queries/diagnose-cluster-health.yaml | 8 ++++---- .../queries/high-cardinality-rejection.yaml | 4 ++-- evals/tasks/observability/queries/memory-usage.yaml | 4 ++-- .../observability/queries/namespace-pod-count.yaml | 8 ++++---- .../queries/namespace-resource-usage.yaml | 11 ++++------- .../tasks/observability/queries/network-traffic.yaml | 4 ++-- .../observability/queries/nonexistent-metric.yaml | 2 +- .../observability/queries/nonexistent-namespace.yaml | 2 +- evals/tasks/observability/queries/pending-pods.yaml | 2 +- evals/tasks/observability/queries/pods-created.yaml | 2 +- .../observability/queries/prometheus-head-series.yaml | 2 +- .../observability/queries/prometheus-requests.yaml | 2 +- .../observability/queries/prometheus-wal-size.yaml | 2 +- .../tasks/observability/queries/time-range-query.yaml | 8 ++++---- .../observability/queries/visualize-cpu-usage.yaml | 4 ++-- 27 files changed, 62 insertions(+), 65 deletions(-) diff --git a/evals/tasks/observability/alerts/alert-investigation.yaml b/evals/tasks/observability/alerts/alert-investigation.yaml index 0e6b37819..da5369366 100644 --- a/evals/tasks/observability/alerts/alert-investigation.yaml +++ b/evals/tasks/observability/alerts/alert-investigation.yaml @@ -16,11 +16,11 @@ metadata: spec: verify: - llmJudge: - contains: "alert" - reason: "Verify the agent retrieved alerts from Alertmanager" + contains: "alertname" + reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names from Alertmanager, or explicitly confirming that no alerts are currently firing" - llmJudge: - contains: "metric" - reason: "Verify the agent investigated a related Prometheus metric" + contains: "query" + reason: "Verify the agent investigated related metrics by running a Prometheus query" prompt: inline: | Check if there are any firing alerts. 
If there are, investigate diff --git a/evals/tasks/observability/alerts/filtered-alerts.yaml b/evals/tasks/observability/alerts/filtered-alerts.yaml index 548c0363b..2726598b0 100644 --- a/evals/tasks/observability/alerts/filtered-alerts.yaml +++ b/evals/tasks/observability/alerts/filtered-alerts.yaml @@ -17,11 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "alert" - reason: "Verify the agent retrieved and reported on alerts" + contains: "alertname" + reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names with their status, or explicitly confirming that no critical alerts exist" - llmJudge: - contains: "critical" - reason: "Verify the agent filtered or addressed the critical severity" + contains: "severity" + reason: "Verify the agent addressed the severity filter in its response" prompt: inline: | Are there any firing alerts with severity=critical? Show only active alerts. diff --git a/evals/tasks/observability/alerts/get-alerts.yaml b/evals/tasks/observability/alerts/get-alerts.yaml index fa758d473..3b167faa7 100644 --- a/evals/tasks/observability/alerts/get-alerts.yaml +++ b/evals/tasks/observability/alerts/get-alerts.yaml @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "alerts" - reason: "Verify the output reports on the alert state, whether alerts are firing or not" + contains: "alertname" + reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names with their status, or explicitly confirming that no alerts are currently firing" prompt: inline: | Check the cluster for any firing alerts and report what you find. diff --git a/evals/tasks/observability/labels/get-series.yaml b/evals/tasks/observability/labels/get-series.yaml index 317e85ae6..6cd93a4da 100644 --- a/evals/tasks/observability/labels/get-series.yaml +++ b/evals/tasks/observability/labels/get-series.yaml @@ -16,11 +16,11 @@ metadata: spec: verify: - llmJudge: - contains: "series" - reason: "Verify the output reports time series information" + contains: "namespace" + reason: "Verify the agent called get_series and reported the results: either listing actual series data containing label dimensions like namespace, or confirming no series were found" - llmJudge: - contains: "kube_pod_info" - reason: "Verify the agent queried the correct metric" + contains: "node" + reason: "Verify the agent called get_series and reported series data with node label values, or explicitly confirmed the metric has no series with node labels" prompt: inline: | How many time series exist for the kube_pod_info metric? Show the cardinality. diff --git a/evals/tasks/observability/labels/label-names.yaml b/evals/tasks/observability/labels/label-names.yaml index 8d306d528..084c923a9 100644 --- a/evals/tasks/observability/labels/label-names.yaml +++ b/evals/tasks/observability/labels/label-names.yaml @@ -17,10 +17,10 @@ spec: verify: - llmJudge: contains: "namespace" - reason: "Verify the output includes the namespace label which is a standard Kubernetes label" + reason: "Verify the agent called get_label_names and reported the results: the output should include the namespace label which is a standard Kubernetes label" - llmJudge: contains: "pod" - reason: "Verify the output includes the pod label" + reason: "Verify the agent called get_label_names and reported the results: the output should include the pod label" prompt: inline: | What labels are available for the kube_pod_info metric? 
diff --git a/evals/tasks/observability/labels/label-values.yaml b/evals/tasks/observability/labels/label-values.yaml index 2376f5a86..e012ebca3 100644 --- a/evals/tasks/observability/labels/label-values.yaml +++ b/evals/tasks/observability/labels/label-values.yaml @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the output lists actual namespace values from the cluster" + contains: "kube-system" + reason: "Verify the agent called get_label_values and reported the results: the output should list actual namespace values from the cluster such as kube-system" prompt: inline: | What are the unique namespace values for the kube_pod_info metric? diff --git a/evals/tasks/observability/labels/series-by-namespace.yaml b/evals/tasks/observability/labels/series-by-namespace.yaml index c7984440c..f782a5a4d 100644 --- a/evals/tasks/observability/labels/series-by-namespace.yaml +++ b/evals/tasks/observability/labels/series-by-namespace.yaml @@ -17,11 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "series" - reason: "Verify the agent reported series information" + contains: "pod" + reason: "Verify the agent called get_series and reported the results: either listing actual series data containing label dimensions like pod, or confirming no series were found for the given namespace" - llmJudge: - contains: "monitoring" - reason: "Verify the agent scoped to the monitoring namespace" + contains: "container" + reason: "Verify the agent called get_series and reported series with container label values from the monitoring namespace, or explicitly confirmed no matching series exist" prompt: inline: | How many time series exist for container_cpu_usage_seconds_total diff --git a/evals/tasks/observability/metrics/list-metrics.yaml b/evals/tasks/observability/metrics/list-metrics.yaml index 6a8fe405a..ee3283059 100644 --- a/evals/tasks/observability/metrics/list-metrics.yaml +++ b/evals/tasks/observability/metrics/list-metrics.yaml @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "kube" - reason: "Verify the output lists Kubernetes metrics matching the kube prefix" + contains: "kube_pod_info" + reason: "Verify the agent called list_metrics and reported the results: the output should list specific kube metrics discovered from Prometheus" prompt: inline: | List all available Prometheus metrics that contain 'kube' in the name. diff --git a/evals/tasks/observability/metrics/list-node-metrics.yaml b/evals/tasks/observability/metrics/list-node-metrics.yaml index 9c87f7b43..d6f4184a4 100644 --- a/evals/tasks/observability/metrics/list-node-metrics.yaml +++ b/evals/tasks/observability/metrics/list-node-metrics.yaml @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "node" - reason: "Verify the output lists node-related metrics" + contains: "node_" + reason: "Verify the agent called list_metrics and reported the results: the output should list specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)" prompt: inline: | What node-related metrics are available in Prometheus? 
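The sharpened judge strings above (kube-system, kube_pod_info, node_) are what a stock kube-prometheus install returns through the standard Prometheus HTTP API, which these discovery tools presumably wrap; the endpoints are standard Prometheus, the wrapping is an assumption:

    GET /api/v1/label/__name__/values                          # list_metrics
    GET /api/v1/label/namespace/values?match[]=kube_pod_info   # get_label_values
    GET /api/v1/series?match[]=kube_pod_info                   # get_series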
diff --git a/evals/tasks/observability/queries/backend-reachability.yaml b/evals/tasks/observability/queries/backend-reachability.yaml index 33b78f190..5b5009583 100644 --- a/evals/tasks/observability/queries/backend-reachability.yaml +++ b/evals/tasks/observability/queries/backend-reachability.yaml @@ -16,8 +16,8 @@ metadata: spec: verify: - llmJudge: - contains: "up" - reason: "Verify the agent confirmed the metrics backend is reachable and targets are up" + contains: "prometheus-k8s" + reason: "Verify the agent called execute_instant_query with the up metric and reported the results: the response should confirm the prometheus-k8s target status" prompt: inline: | Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status. diff --git a/evals/tasks/observability/queries/cpu-usage.yaml b/evals/tasks/observability/queries/cpu-usage.yaml index 7d8c545ac..f4c6a0f67 100644 --- a/evals/tasks/observability/queries/cpu-usage.yaml +++ b/evals/tasks/observability/queries/cpu-usage.yaml @@ -16,10 +16,10 @@ spec: verify: - llmJudge: contains: "container_cpu_usage_seconds_total" - reason: "Verify the agent queried the correct CPU metric" + reason: "Verify the agent called execute_instant_query with the correct CPU metric container_cpu_usage_seconds_total and reported the results" - llmJudge: contains: "pod" - reason: "Verify the response identifies specific pods" + reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no CPU data is available" prompt: inline: | Which pods are using the most CPU? diff --git a/evals/tasks/observability/queries/crashlooping-pods.yaml b/evals/tasks/observability/queries/crashlooping-pods.yaml index fcad5ef03..24119f551 100644 --- a/evals/tasks/observability/queries/crashlooping-pods.yaml +++ b/evals/tasks/observability/queries/crashlooping-pods.yaml @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube_pod_container_status_restarts_total" - reason: "Verify the agent used the correct metric for container restart counts" + reason: "Verify the agent called execute_range_query with the correct metric kube_pod_container_status_restarts_total and reported the results: either listing crashlooping pods, or explicitly confirming no pods have restarted" prompt: inline: | Which pods were crashlooping in the last 5 minutes? diff --git a/evals/tasks/observability/queries/diagnose-cluster-health.yaml b/evals/tasks/observability/queries/diagnose-cluster-health.yaml index 19ea059e1..a2e45e1a9 100644 --- a/evals/tasks/observability/queries/diagnose-cluster-health.yaml +++ b/evals/tasks/observability/queries/diagnose-cluster-health.yaml @@ -17,11 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "cluster" - reason: "Verify the response provides an overall cluster health assessment" + contains: "alertname" + reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names, or explicitly confirming that no alerts are currently firing" - llmJudge: - contains: "node" - reason: "Verify the agent checked node-level health indicators" + contains: "kube_" + reason: "Verify the agent called execute_instant_query with Kubernetes metrics (e.g. kube_node_status_condition) and reported the results for health assessment" prompt: inline: | Is the cluster healthy? Give me an overview of any issues. 
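One instant query that would satisfy the kube_ judge in diagnose-cluster-health, assuming kube-state-metrics is present:

    # nodes currently reporting Ready
    kube_node_status_condition{condition="Ready",status="true"}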
diff --git a/evals/tasks/observability/queries/high-cardinality-rejection.yaml b/evals/tasks/observability/queries/high-cardinality-rejection.yaml index f8e6d4cd5..9a5cf78c9 100644 --- a/evals/tasks/observability/queries/high-cardinality-rejection.yaml +++ b/evals/tasks/observability/queries/high-cardinality-rejection.yaml @@ -16,8 +16,8 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the agent suggests scoping the query by namespace or another dimension" + contains: "guardrail" + reason: "Verify the agent attempted a query that was rejected by a guardrail and reported the result: the response should explain the guardrail rejection and suggest a scoped alternative" prompt: inline: | Show me the CPU, memory, network, and disk usage for every single container diff --git a/evals/tasks/observability/queries/memory-usage.yaml b/evals/tasks/observability/queries/memory-usage.yaml index ffd1d113e..2a41e2c83 100644 --- a/evals/tasks/observability/queries/memory-usage.yaml +++ b/evals/tasks/observability/queries/memory-usage.yaml @@ -16,10 +16,10 @@ spec: verify: - llmJudge: contains: "container_memory_working_set_bytes" - reason: "Verify the agent queried the correct memory metric" + reason: "Verify the agent called execute_instant_query with the correct memory metric container_memory_working_set_bytes and reported the results" - llmJudge: contains: "pod" - reason: "Verify the response identifies specific pods" + reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no memory data is available" prompt: inline: | Which pods are using the most memory? Show me the top 5. diff --git a/evals/tasks/observability/queries/namespace-pod-count.yaml b/evals/tasks/observability/queries/namespace-pod-count.yaml index 642fd905d..11ef3d0f8 100644 --- a/evals/tasks/observability/queries/namespace-pod-count.yaml +++ b/evals/tasks/observability/queries/namespace-pod-count.yaml @@ -17,11 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the response lists namespaces" + contains: "kube-system" + reason: "Verify the agent called execute_instant_query and reported the results: the response should list actual namespace names from the cluster such as kube-system" - llmJudge: - contains: "pod" - reason: "Verify the response includes pod counts" + contains: "kube_pod" + reason: "Verify the agent called execute_instant_query with a kube_pod metric to count pods and reported the results" prompt: inline: | Which namespaces have the most running pods? Show me the top 5. 
diff --git a/evals/tasks/observability/queries/namespace-resource-usage.yaml b/evals/tasks/observability/queries/namespace-resource-usage.yaml index ad776a282..26ee2d73f 100644 --- a/evals/tasks/observability/queries/namespace-resource-usage.yaml +++ b/evals/tasks/observability/queries/namespace-resource-usage.yaml @@ -17,14 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the response breaks down resource usage by namespace" + contains: "container_cpu_usage_seconds_total" + reason: "Verify the agent called list_metrics to discover and then executed a query with the correct CPU metric container_cpu_usage_seconds_total, reporting the results" - llmJudge: - contains: "cpu" - reason: "Verify the response includes CPU usage data" - - llmJudge: - contains: "memory" - reason: "Verify the response includes memory usage data" + contains: "container_memory_working_set_bytes" + reason: "Verify the agent called list_metrics to discover and then executed a query with the correct memory metric container_memory_working_set_bytes, reporting the results" prompt: inline: | Which namespace is consuming the most CPU and memory? diff --git a/evals/tasks/observability/queries/network-traffic.yaml b/evals/tasks/observability/queries/network-traffic.yaml index 23f147da0..879b7a943 100644 --- a/evals/tasks/observability/queries/network-traffic.yaml +++ b/evals/tasks/observability/queries/network-traffic.yaml @@ -16,10 +16,10 @@ spec: verify: - llmJudge: contains: "container_network_receive_bytes_total" - reason: "Verify the agent queried the correct network metric" + reason: "Verify the agent called execute_instant_query with the correct network metric container_network_receive_bytes_total and reported the results" - llmJudge: contains: "pod" - reason: "Verify the response identifies specific pods" + reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no network data is available" prompt: inline: | Which pods are receiving the most network traffic? diff --git a/evals/tasks/observability/queries/nonexistent-metric.yaml b/evals/tasks/observability/queries/nonexistent-metric.yaml index b3890eeb3..25f0dbd22 100644 --- a/evals/tasks/observability/queries/nonexistent-metric.yaml +++ b/evals/tasks/observability/queries/nonexistent-metric.yaml @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "not found" - reason: "Verify the agent communicates that the metric does not exist or was not found" + reason: "Verify the agent called list_metrics and reported the result: explicitly communicating that the metric does not exist or was not found" prompt: inline: | What is the current value of the metric fake_nonexistent_metric_total? diff --git a/evals/tasks/observability/queries/nonexistent-namespace.yaml b/evals/tasks/observability/queries/nonexistent-namespace.yaml index b9e440814..3384f43e3 100644 --- a/evals/tasks/observability/queries/nonexistent-namespace.yaml +++ b/evals/tasks/observability/queries/nonexistent-namespace.yaml @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "no data" - reason: "Verify the agent reports no data, no results, or no pods found in the nonexistent namespace" + reason: "Verify the agent called execute_instant_query and reported the result: explicitly confirming no data, no results, or no pods found in the nonexistent namespace" prompt: inline: | Show me the memory usage for all pods in the namespace called totally-fake-namespace-12345. 
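For the nonexistent-metric task above, an agent can also confirm absence positively instead of inferring it from an empty instant-query result; absent() is standard PromQL, though the task does not require it:

    absent(fake_nonexistent_metric_total)  # returns 1 when the metric has no series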
diff --git a/evals/tasks/observability/queries/pending-pods.yaml b/evals/tasks/observability/queries/pending-pods.yaml index 7f06f5d9e..5ca55f27e 100644 --- a/evals/tasks/observability/queries/pending-pods.yaml +++ b/evals/tasks/observability/queries/pending-pods.yaml @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "kube_pod_status_phase" - reason: "Verify the agent used the correct metric for pod phase status" + reason: "Verify the agent called execute_instant_query with the correct metric kube_pod_status_phase and reported the results: either listing pending pods, or explicitly confirming no pods are in pending state" prompt: inline: | Which pods are stuck in pending state? diff --git a/evals/tasks/observability/queries/pods-created.yaml b/evals/tasks/observability/queries/pods-created.yaml index a8cee3114..cacf0e76b 100644 --- a/evals/tasks/observability/queries/pods-created.yaml +++ b/evals/tasks/observability/queries/pods-created.yaml @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube_pod_created" - reason: "Verify the agent used the correct metric for pod creation timestamps" + reason: "Verify the agent called execute_range_query with the kube_pod_created metric and reported the results: either listing recently created pods, or explicitly confirming no pods were created in the time window" prompt: inline: | How many pods were created in the last 5 minutes? diff --git a/evals/tasks/observability/queries/prometheus-head-series.yaml b/evals/tasks/observability/queries/prometheus-head-series.yaml index ee89d5953..bcf2c77c8 100644 --- a/evals/tasks/observability/queries/prometheus-head-series.yaml +++ b/evals/tasks/observability/queries/prometheus-head-series.yaml @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_tsdb_head_series" - reason: "Verify the agent used the correct Prometheus TSDB metric" + reason: "Verify the agent called execute_instant_query with the correct Prometheus TSDB metric prometheus_tsdb_head_series and reported the current head series count" prompt: inline: | How many head series does Prometheus have? diff --git a/evals/tasks/observability/queries/prometheus-requests.yaml b/evals/tasks/observability/queries/prometheus-requests.yaml index 9560170ae..4445d033a 100644 --- a/evals/tasks/observability/queries/prometheus-requests.yaml +++ b/evals/tasks/observability/queries/prometheus-requests.yaml @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_http_requests_total" - reason: "Verify the agent used the correct HTTP requests metric" + reason: "Verify the agent called execute_instant_query with the correct HTTP requests metric prometheus_http_requests_total and reported the request rate" prompt: inline: | How many requests per second are being made to Prometheus? diff --git a/evals/tasks/observability/queries/prometheus-wal-size.yaml b/evals/tasks/observability/queries/prometheus-wal-size.yaml index af6d18293..14cb8a944 100644 --- a/evals/tasks/observability/queries/prometheus-wal-size.yaml +++ b/evals/tasks/observability/queries/prometheus-wal-size.yaml @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_tsdb_wal_storage_size_bytes" - reason: "Verify the agent used the correct WAL storage metric" + reason: "Verify the agent called execute_instant_query with the correct WAL storage metric prometheus_tsdb_wal_storage_size_bytes and reported the current WAL size" prompt: inline: | What is the current storage size of the Prometheus WAL? 
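Sketches for the three Prometheus self-monitoring tasks above; all three metrics are standard Prometheus internals:

    sum(rate(prometheus_http_requests_total[5m]))  # requests per second
    prometheus_tsdb_head_series                    # current head series count
    prometheus_tsdb_wal_storage_size_bytes         # WAL size on disk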
diff --git a/evals/tasks/observability/queries/time-range-query.yaml b/evals/tasks/observability/queries/time-range-query.yaml index 521423c8c..2ac2c0d60 100644 --- a/evals/tasks/observability/queries/time-range-query.yaml +++ b/evals/tasks/observability/queries/time-range-query.yaml @@ -16,11 +16,11 @@ metadata: spec: verify: - llmJudge: - contains: "30 minutes" - reason: "Verify the agent honoured the requested 30-minute time window" + contains: "container_cpu_usage_seconds_total" + reason: "Verify the agent called execute_range_query with the correct CPU metric container_cpu_usage_seconds_total and reported the results" - llmJudge: - contains: "cpu" - reason: "Verify the response includes CPU usage data" + contains: "pod" + reason: "Verify the agent reported range query results including specific pod names, or explicitly confirmed no CPU data is available for the given namespace" prompt: inline: | Show me the CPU usage trend for pods in the default namespace over the last 30 minutes. diff --git a/evals/tasks/observability/queries/visualize-cpu-usage.yaml b/evals/tasks/observability/queries/visualize-cpu-usage.yaml index 0f6af1eb6..8894dabd5 100644 --- a/evals/tasks/observability/queries/visualize-cpu-usage.yaml +++ b/evals/tasks/observability/queries/visualize-cpu-usage.yaml @@ -16,8 +16,8 @@ metadata: spec: verify: - llmJudge: - contains: "cpu" - reason: "Verify the agent queried and visualized CPU usage data" + contains: "container_cpu_usage_seconds_total" + reason: "Verify the agent called show_timeseries with the correct CPU metric container_cpu_usage_seconds_total and reported the visualization results" prompt: inline: | Visualize the CPU usage for pods in the default namespace over the last 30 minutes. From f0113dd4bc60e67a05e0d185b828476fcf19391d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Reme=C5=A1?= Date: Thu, 23 Apr 2026 15:58:29 +0200 Subject: [PATCH 3/3] update to the latest version --- .../alerts/alert-investigation.yaml | 17 ++++++++--------- .../observability/alerts/filtered-alerts.yaml | 11 ++++------- .../tasks/observability/alerts/get-alerts.yaml | 6 +++--- .../observability/alerts/get-silences.yaml | 6 +++--- .../tasks/observability/labels/get-series.yaml | 10 +++++----- .../tasks/observability/labels/label-names.yaml | 6 +++--- .../observability/labels/label-values.yaml | 4 ++-- .../labels/series-by-namespace.yaml | 8 ++++---- .../observability/metrics/list-metrics.yaml | 4 ++-- .../metrics/list-node-metrics.yaml | 4 ++-- .../queries/backend-reachability.yaml | 4 ++-- .../tasks/observability/queries/cpu-usage.yaml | 6 +++--- .../queries/crashlooping-pods.yaml | 4 ++-- .../queries/diagnose-cluster-health.yaml | 8 ++++---- .../queries/high-cardinality-rejection.yaml | 4 ++-- .../observability/queries/memory-usage.yaml | 8 ++++---- .../queries/namespace-pod-count.yaml | 6 +++--- .../queries/namespace-resource-usage.yaml | 8 ++++---- .../observability/queries/network-traffic.yaml | 6 +++--- .../queries/nonexistent-metric.yaml | 4 ++-- .../queries/nonexistent-namespace.yaml | 4 ++-- .../observability/queries/pending-pods.yaml | 4 ++-- .../observability/queries/pods-created.yaml | 4 ++-- .../queries/prometheus-head-series.yaml | 4 ++-- .../queries/prometheus-requests.yaml | 4 ++-- .../queries/prometheus-wal-size.yaml | 4 ++-- .../observability/queries/time-range-query.yaml | 6 +++--- .../queries/visualize-cpu-usage.yaml | 4 ++-- 28 files changed, 82 insertions(+), 86 deletions(-) diff --git a/evals/tasks/observability/alerts/alert-investigation.yaml 
b/evals/tasks/observability/alerts/alert-investigation.yaml index da5369366..f0f35ce2d 100644 --- a/evals/tasks/observability/alerts/alert-investigation.yaml +++ b/evals/tasks/observability/alerts/alert-investigation.yaml @@ -6,24 +6,23 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: alerts + suite: observability toolType: multi-step description: | Tests if the agent can perform multi-step alert triage: first retrieving - alerts from Alertmanager, then investigating related metrics for the most - critical alert using Prometheus queries. + alerts from Alertmanager, then investigating related metrics for a + firing alert using Prometheus queries. spec: verify: - llmJudge: - contains: "alertname" - reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names from Alertmanager, or explicitly confirming that no alerts are currently firing" + contains: "AlertmanagerReceiversNotConfigured" + reason: "Verify the agent retrieved firing alerts and identified AlertmanagerReceiversNotConfigured" - llmJudge: - contains: "query" - reason: "Verify the agent investigated related metrics by running a Prometheus query" + contains: "alertmanager" + reason: "Verify the agent investigated Alertmanager-related metrics for the alert" prompt: inline: | Check if there are any firing alerts. If there are, investigate the related metrics for the most critical alert and summarize - what's happening. If there are no firing alerts, check cluster - health metrics instead. + what's happening. diff --git a/evals/tasks/observability/alerts/filtered-alerts.yaml b/evals/tasks/observability/alerts/filtered-alerts.yaml index 2726598b0..6f9aaf91a 100644 --- a/evals/tasks/observability/alerts/filtered-alerts.yaml +++ b/evals/tasks/observability/alerts/filtered-alerts.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: alerts + suite: observability toolType: alertmanager description: | Tests if the agent uses the get_alerts tool with filter parameters @@ -17,11 +17,8 @@ metadata: spec: verify: - llmJudge: - contains: "alertname" - reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names with their status, or explicitly confirming that no critical alerts exist" - - llmJudge: - contains: "severity" - reason: "Verify the agent addressed the severity filter in its response" + contains: "AlertmanagerReceiversNotConfigured" + reason: "Verify the agent filtered for warning-severity alerts and found AlertmanagerReceiversNotConfigured" prompt: inline: | - Are there any firing alerts with severity=critical? Show only active alerts. + Show me only the active alerts with severity=warning. 
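The firing alerts these judges look for can also be cross-checked from the Prometheus side via the synthetic ALERTS series (a sketch; the tasks themselves exercise the get_alerts tool rather than mandating this query):

    # All firing alerts, then narrowed to the warning severity used above
    ALERTS{alertstate="firing"}
    ALERTS{alertstate="firing", severity="warning"}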
diff --git a/evals/tasks/observability/alerts/get-alerts.yaml b/evals/tasks/observability/alerts/get-alerts.yaml index 3b167faa7..49dbd5f92 100644 --- a/evals/tasks/observability/alerts/get-alerts.yaml +++ b/evals/tasks/observability/alerts/get-alerts.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: alerts + suite: observability toolType: alertmanager description: | Tests if the agent can discover and use the get_alerts tool to retrieve @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "alertname" - reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names with their status, or explicitly confirming that no alerts are currently firing" + contains: "Watchdog" + reason: "Verify the agent retrieved firing alerts and reported the Watchdog alert" prompt: inline: | Check the cluster for any firing alerts and report what you find. diff --git a/evals/tasks/observability/alerts/get-silences.yaml b/evals/tasks/observability/alerts/get-silences.yaml index 726e60c31..f986c7851 100644 --- a/evals/tasks/observability/alerts/get-silences.yaml +++ b/evals/tasks/observability/alerts/get-silences.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: alerts + suite: observability toolType: alertmanager description: | Tests if the agent can discover and use the get_silences tool to retrieve @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "silence" - reason: "Verify the output discusses alert silences, including when no active silences exist" + contains: "silences" + reason: "Verify the response mentions silences. The agent should report either active silences with their matchers or that no active silences exist — both are valid outcomes" prompt: inline: | Are there any active silences in Alertmanager? diff --git a/evals/tasks/observability/labels/get-series.yaml b/evals/tasks/observability/labels/get-series.yaml index 6cd93a4da..95645079a 100644 --- a/evals/tasks/observability/labels/get-series.yaml +++ b/evals/tasks/observability/labels/get-series.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: labels + suite: observability toolType: exploration description: | Tests if the agent can use the get_series tool to check cardinality for a metric. @@ -17,10 +17,10 @@ spec: verify: - llmJudge: contains: "namespace" - reason: "Verify the agent called get_series and reported the results: either listing actual series data containing label dimensions like namespace, or confirming no series were found" + reason: "Verify the agent retrieved actual series data containing label dimensions like namespace" - llmJudge: - contains: "node" - reason: "Verify the agent called get_series and reported series data with node label values, or explicitly confirmed the metric has no series with node labels" + contains: "kube_pod_info" + reason: "Verify the agent queried the kube_pod_info metric and reported its cardinality" prompt: inline: | - How many time series exist for the kube_pod_info metric? Show the cardinality. + How many time series exist for the kube_pod_info metric? Show the count and list the label names present. 
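A cardinality check along these lines would back the get-series assertions (sketch; get_series returns the raw series list, from which both the count and the label names can be read off):

    # Number of time series behind kube_pod_info
    count(kube_pod_info)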
diff --git a/evals/tasks/observability/labels/label-names.yaml b/evals/tasks/observability/labels/label-names.yaml index 084c923a9..adf413963 100644 --- a/evals/tasks/observability/labels/label-names.yaml +++ b/evals/tasks/observability/labels/label-names.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: labels + suite: observability toolType: exploration description: | Tests if the agent follows the correct workflow: first calling list_metrics to @@ -17,10 +17,10 @@ spec: verify: - llmJudge: contains: "namespace" - reason: "Verify the agent called get_label_names and reported the results: the output should include the namespace label which is a standard Kubernetes label" + reason: "Verify the output includes the namespace label which is a standard Kubernetes label" - llmJudge: contains: "pod" - reason: "Verify the agent called get_label_names and reported the results: the output should include the pod label" + reason: "Verify the output includes the pod label" prompt: inline: | What labels are available for the kube_pod_info metric? diff --git a/evals/tasks/observability/labels/label-values.yaml b/evals/tasks/observability/labels/label-values.yaml index e012ebca3..d78e5b39f 100644 --- a/evals/tasks/observability/labels/label-values.yaml +++ b/evals/tasks/observability/labels/label-values.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: labels + suite: observability toolType: exploration description: | Tests the full discovery workflow: list_metrics to verify the metric, then @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube-system" - reason: "Verify the agent called get_label_values and reported the results: the output should list actual namespace values from the cluster such as kube-system" + reason: "Verify the output lists actual namespace values from the cluster such as kube-system" prompt: inline: | What are the unique namespace values for the kube_pod_info metric? diff --git a/evals/tasks/observability/labels/series-by-namespace.yaml b/evals/tasks/observability/labels/series-by-namespace.yaml index f782a5a4d..23d78b58e 100644 --- a/evals/tasks/observability/labels/series-by-namespace.yaml +++ b/evals/tasks/observability/labels/series-by-namespace.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: labels + suite: observability toolType: exploration description: | Tests if the agent can use the get_series tool with a label selector @@ -18,11 +18,11 @@ spec: verify: - llmJudge: contains: "pod" - reason: "Verify the agent called get_series and reported the results: either listing actual series data containing label dimensions like pod, or confirming no series were found for the given namespace" + reason: "Verify the agent retrieved actual series data containing label dimensions like pod" - llmJudge: contains: "container" - reason: "Verify the agent called get_series and reported series with container label values from the monitoring namespace, or explicitly confirmed no matching series exist" + reason: "Verify the agent reported series with container label values from the namespace" prompt: inline: | How many time series exist for container_cpu_usage_seconds_total - in the monitoring namespace? + in the openshift-monitoring namespace? 
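Illustrative queries for the label-exploration assertions (sketches; the dedicated get_label_names/get_label_values tools are the expected path):

    # Distinct namespace values carried by kube_pod_info
    count(group by (namespace) (kube_pod_info))
    # Series count scoped to the namespace named in the prompt
    count(container_cpu_usage_seconds_total{namespace="openshift-monitoring"})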
diff --git a/evals/tasks/observability/metrics/list-metrics.yaml b/evals/tasks/observability/metrics/list-metrics.yaml index ee3283059..1f5adc71a 100644 --- a/evals/tasks/observability/metrics/list-metrics.yaml +++ b/evals/tasks/observability/metrics/list-metrics.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: metrics + suite: observability toolType: discovery description: | Tests if the agent can discover the list_metrics tool to find Kubernetes-related @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube_pod_info" - reason: "Verify the agent called list_metrics and reported the results: the output should list specific kube metrics discovered from Prometheus" + reason: "Verify the output lists specific kube metrics discovered from Prometheus" prompt: inline: | List all available Prometheus metrics that contain 'kube' in the name. diff --git a/evals/tasks/observability/metrics/list-node-metrics.yaml b/evals/tasks/observability/metrics/list-node-metrics.yaml index d6f4184a4..f496460aa 100644 --- a/evals/tasks/observability/metrics/list-node-metrics.yaml +++ b/evals/tasks/observability/metrics/list-node-metrics.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: metrics + suite: observability toolType: discovery description: | Tests if the agent can discover node-related metrics using the list_metrics tool @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "node_" - reason: "Verify the agent called list_metrics and reported the results: the output should list specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)" + reason: "Verify the output lists specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)" prompt: inline: | What node-related metrics are available in Prometheus? diff --git a/evals/tasks/observability/queries/backend-reachability.yaml b/evals/tasks/observability/queries/backend-reachability.yaml index 5b5009583..5a027a5a9 100644 --- a/evals/tasks/observability/queries/backend-reachability.yaml +++ b/evals/tasks/observability/queries/backend-reachability.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: smoke-test description: | Smoke test to verify the agent can reach the metrics backend and get a @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "prometheus-k8s" - reason: "Verify the agent called execute_instant_query with the up metric and reported the results: the response should confirm the prometheus-k8s target status" + reason: "Verify the agent queried the up metric and reported on the prometheus-k8s target" prompt: inline: | Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status. 
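The smoke test's query is already spelled out in its prompt; for reference, its result reads as follows (standard Prometheus semantics, not specific to this suite):

    # One sample per prometheus-k8s target: 1 = scrape succeeding, 0 = target down
    up{job="prometheus-k8s"}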
diff --git a/evals/tasks/observability/queries/cpu-usage.yaml b/evals/tasks/observability/queries/cpu-usage.yaml index f4c6a0f67..3baddf6cf 100644 --- a/evals/tasks/observability/queries/cpu-usage.yaml +++ b/evals/tasks/observability/queries/cpu-usage.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can find the appropriate CPU usage metric and query it to @@ -16,10 +16,10 @@ spec: verify: - llmJudge: contains: "container_cpu_usage_seconds_total" - reason: "Verify the agent called execute_instant_query with the correct CPU metric container_cpu_usage_seconds_total and reported the results" + reason: "Verify the agent queried the correct CPU metric" - llmJudge: contains: "pod" - reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no CPU data is available" + reason: "Verify the response identifies specific pods" prompt: inline: | Which pods are using the most CPU? diff --git a/evals/tasks/observability/queries/crashlooping-pods.yaml b/evals/tasks/observability/queries/crashlooping-pods.yaml index 24119f551..4e975964e 100644 --- a/evals/tasks/observability/queries/crashlooping-pods.yaml +++ b/evals/tasks/observability/queries/crashlooping-pods.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: range-query description: | Tests if the agent can identify crashlooping pods by discovering and querying @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube_pod_container_status_restarts_total" - reason: "Verify the agent called execute_range_query with the correct metric kube_pod_container_status_restarts_total and reported the results: either listing crashlooping pods, or explicitly confirming no pods have restarted" + reason: "Verify the agent used the correct metric for container restart counts" prompt: inline: | Which pods were crashlooping in the last 5 minutes? diff --git a/evals/tasks/observability/queries/diagnose-cluster-health.yaml b/evals/tasks/observability/queries/diagnose-cluster-health.yaml index a2e45e1a9..7542b61e5 100644 --- a/evals/tasks/observability/queries/diagnose-cluster-health.yaml +++ b/evals/tasks/observability/queries/diagnose-cluster-health.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: diagnostic description: | Tests the agent's ability to handle an ambiguous diagnostic prompt. @@ -17,11 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "alertname" - reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names, or explicitly confirming that no alerts are currently firing" + contains: "Watchdog" + reason: "Verify the agent checked alerts and reported specific alert names such as Watchdog" - llmJudge: contains: "kube_" - reason: "Verify the agent called execute_instant_query with Kubernetes metrics (e.g. kube_node_status_condition) and reported the results for health assessment" + reason: "Verify the agent queried Kubernetes metrics (e.g. kube_node_status_condition) for health assessment" prompt: inline: | Is the cluster healthy? Give me an overview of any issues. 
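Queries of roughly this shape would satisfy the CPU, crashloop, and health-check assertions above (sketches; the metric names come from the tasks, while the windows and aggregations are one reasonable choice):

    # Top CPU-consuming pods
    topk(5, sum by (pod) (rate(container_cpu_usage_seconds_total[5m])))
    # Containers that restarted within the 5-minute window
    increase(kube_pod_container_status_restarts_total[5m]) > 0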
diff --git a/evals/tasks/observability/queries/high-cardinality-rejection.yaml b/evals/tasks/observability/queries/high-cardinality-rejection.yaml index 9a5cf78c9..dbb5b3bba 100644 --- a/evals/tasks/observability/queries/high-cardinality-rejection.yaml +++ b/evals/tasks/observability/queries/high-cardinality-rejection.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: error-handling description: | Tests agent behavior when a query is rejected by obs-mcp guardrails @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "guardrail" - reason: "Verify the agent attempted a query that was rejected by a guardrail and reported the result: the response should explain the guardrail rejection and suggest a scoped alternative" + reason: "Verify the agent explains the query was rejected by a guardrail and suggests a scoped alternative" prompt: inline: | Show me the CPU, memory, network, and disk usage for every single container diff --git a/evals/tasks/observability/queries/memory-usage.yaml b/evals/tasks/observability/queries/memory-usage.yaml index 2a41e2c83..58da3fce8 100644 --- a/evals/tasks/observability/queries/memory-usage.yaml +++ b/evals/tasks/observability/queries/memory-usage.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can find the appropriate memory usage metric and query it @@ -15,11 +15,11 @@ metadata: spec: verify: - llmJudge: - contains: "container_memory_working_set_bytes" - reason: "Verify the agent called execute_instant_query with the correct memory metric container_memory_working_set_bytes and reported the results" + contains: "container_memory" + reason: "Verify the agent queried a container memory metric (working_set_bytes or usage_bytes)" - llmJudge: contains: "pod" - reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no memory data is available" + reason: "Verify the response identifies specific pods" prompt: inline: | Which pods are using the most memory? Show me the top 5. diff --git a/evals/tasks/observability/queries/namespace-pod-count.yaml b/evals/tasks/observability/queries/namespace-pod-count.yaml index 11ef3d0f8..b180a917d 100644 --- a/evals/tasks/observability/queries/namespace-pod-count.yaml +++ b/evals/tasks/observability/queries/namespace-pod-count.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: multi-step description: | Tests multi-step reasoning: the agent must discover a suitable metric @@ -18,10 +18,10 @@ spec: verify: - llmJudge: contains: "kube-system" - reason: "Verify the agent called execute_instant_query and reported the results: the response should list actual namespace names from the cluster such as kube-system" + reason: "Verify the response lists actual namespace names from the cluster such as kube-system" - llmJudge: contains: "kube_pod" - reason: "Verify the agent called execute_instant_query with a kube_pod metric to count pods and reported the results" + reason: "Verify the agent used a kube_pod metric to count pods" prompt: inline: | Which namespaces have the most running pods? Show me the top 5. 
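For the memory and pod-count tasks, plausible queries look like the following (sketches only; the tasks accept either memory metric variant):

    # Top 5 pods by working-set memory
    topk(5, sum by (pod) (container_memory_working_set_bytes))
    # Top 5 namespaces by running pods
    topk(5, count by (namespace) (kube_pod_status_phase{phase="Running"} == 1))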
diff --git a/evals/tasks/observability/queries/namespace-resource-usage.yaml b/evals/tasks/observability/queries/namespace-resource-usage.yaml index 26ee2d73f..4434ca52c 100644 --- a/evals/tasks/observability/queries/namespace-resource-usage.yaml +++ b/evals/tasks/observability/queries/namespace-resource-usage.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: multi-step description: | Tests multi-step reasoning: the agent must discover CPU and memory metrics, @@ -18,10 +18,10 @@ spec: verify: - llmJudge: contains: "container_cpu_usage_seconds_total" - reason: "Verify the agent called list_metrics to discover and then executed a query with the correct CPU metric container_cpu_usage_seconds_total, reporting the results" + reason: "Verify the agent discovered and used the correct CPU metric" - llmJudge: - contains: "container_memory_working_set_bytes" - reason: "Verify the agent called list_metrics to discover and then executed a query with the correct memory metric container_memory_working_set_bytes, reporting the results" + contains: "container_memory" + reason: "Verify the agent discovered and used a container memory metric (working_set_bytes or usage_bytes)" prompt: inline: | Which namespace is consuming the most CPU and memory? diff --git a/evals/tasks/observability/queries/network-traffic.yaml b/evals/tasks/observability/queries/network-traffic.yaml index 879b7a943..9b1e06fd3 100644 --- a/evals/tasks/observability/queries/network-traffic.yaml +++ b/evals/tasks/observability/queries/network-traffic.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can discover network-related metrics and query them to find @@ -16,10 +16,10 @@ spec: verify: - llmJudge: contains: "container_network_receive_bytes_total" - reason: "Verify the agent called execute_instant_query with the correct network metric container_network_receive_bytes_total and reported the results" + reason: "Verify the agent queried the correct network metric" - llmJudge: contains: "pod" - reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no network data is available" + reason: "Verify the response identifies specific pods" prompt: inline: | Which pods are receiving the most network traffic? diff --git a/evals/tasks/observability/queries/nonexistent-metric.yaml b/evals/tasks/observability/queries/nonexistent-metric.yaml index 25f0dbd22..ed76ee25a 100644 --- a/evals/tasks/observability/queries/nonexistent-metric.yaml +++ b/evals/tasks/observability/queries/nonexistent-metric.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: error-handling description: | Tests agent recovery when querying a metric that does not exist. @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "not found" - reason: "Verify the agent called list_metrics and reported the result: explicitly communicating that the metric does not exist or was not found" + reason: "Verify the agent communicates that the metric does not exist or was not found" prompt: inline: | What is the current value of the metric fake_nonexistent_metric_total? 
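Sketches matching the namespace-usage and network assertions (the exact rate window and aggregation are assumptions, not requirements of the tasks):

    # Per-namespace CPU and memory totals
    sum by (namespace) (rate(container_cpu_usage_seconds_total[5m]))
    sum by (namespace) (container_memory_working_set_bytes)
    # Top receivers of network traffic
    topk(5, sum by (pod) (rate(container_network_receive_bytes_total[5m])))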
diff --git a/evals/tasks/observability/queries/nonexistent-namespace.yaml b/evals/tasks/observability/queries/nonexistent-namespace.yaml index 3384f43e3..78090c0c1 100644 --- a/evals/tasks/observability/queries/nonexistent-namespace.yaml +++ b/evals/tasks/observability/queries/nonexistent-namespace.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: error-handling description: | Tests agent behavior when querying for resources in a namespace that @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "no data" - reason: "Verify the agent called execute_instant_query and reported the result: explicitly confirming no data, no results, or no pods found in the nonexistent namespace" + reason: "Verify the agent reports no data, no results, or no pods found in the nonexistent namespace" prompt: inline: | Show me the memory usage for all pods in the namespace called totally-fake-namespace-12345. diff --git a/evals/tasks/observability/queries/pending-pods.yaml b/evals/tasks/observability/queries/pending-pods.yaml index 5ca55f27e..f5b47e387 100644 --- a/evals/tasks/observability/queries/pending-pods.yaml +++ b/evals/tasks/observability/queries/pending-pods.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can identify pods stuck in pending state by first discovering @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "kube_pod_status_phase" - reason: "Verify the agent called execute_instant_query with the correct metric kube_pod_status_phase and reported the results: either listing pending pods, or explicitly confirming no pods are in pending state" + reason: "Verify the agent used the correct metric for pod phase status" prompt: inline: | Which pods are stuck in pending state? diff --git a/evals/tasks/observability/queries/pods-created.yaml b/evals/tasks/observability/queries/pods-created.yaml index cacf0e76b..097a57e24 100644 --- a/evals/tasks/observability/queries/pods-created.yaml +++ b/evals/tasks/observability/queries/pods-created.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: range-query description: | Tests if the agent can use a range query to find recently created pods by @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube_pod_created" - reason: "Verify the agent called execute_range_query with the kube_pod_created metric and reported the results: either listing recently created pods, or explicitly confirming no pods were created in the time window" + reason: "Verify the agent discovered and used the kube_pod_created metric" prompt: inline: | How many pods were created in the last 5 minutes? 
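The nonexistent-namespace case hinges on an empty result set, e.g. (sketch):

    # Matches no series, so the agent should report no data rather than an error
    container_memory_working_set_bytes{namespace="totally-fake-namespace-12345"}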
diff --git a/evals/tasks/observability/queries/prometheus-head-series.yaml b/evals/tasks/observability/queries/prometheus-head-series.yaml index bcf2c77c8..9f62ffdf3 100644 --- a/evals/tasks/observability/queries/prometheus-head-series.yaml +++ b/evals/tasks/observability/queries/prometheus-head-series.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can query Prometheus internal metrics to report the current @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_tsdb_head_series" - reason: "Verify the agent called execute_instant_query with the correct Prometheus TSDB metric prometheus_tsdb_head_series and reported the current head series count" + reason: "Verify the agent used the correct Prometheus TSDB metric" prompt: inline: | How many head series does Prometheus have? diff --git a/evals/tasks/observability/queries/prometheus-requests.yaml b/evals/tasks/observability/queries/prometheus-requests.yaml index 4445d033a..f436130b4 100644 --- a/evals/tasks/observability/queries/prometheus-requests.yaml +++ b/evals/tasks/observability/queries/prometheus-requests.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can calculate the request rate to Prometheus by discovering @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_http_requests_total" - reason: "Verify the agent called execute_instant_query with the correct HTTP requests metric prometheus_http_requests_total and reported the request rate" + reason: "Verify the agent used the correct HTTP requests metric" prompt: inline: | How many requests per second are being made to Prometheus? diff --git a/evals/tasks/observability/queries/prometheus-wal-size.yaml b/evals/tasks/observability/queries/prometheus-wal-size.yaml index 14cb8a944..aa19ea451 100644 --- a/evals/tasks/observability/queries/prometheus-wal-size.yaml +++ b/evals/tasks/observability/queries/prometheus-wal-size.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can query the current Prometheus WAL storage size using @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_tsdb_wal_storage_size_bytes" - reason: "Verify the agent called execute_instant_query with the correct WAL storage metric prometheus_tsdb_wal_storage_size_bytes and reported the current WAL size" + reason: "Verify the agent used the correct WAL storage metric" prompt: inline: | What is the current storage size of the Prometheus WAL? 
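Instant queries like these would satisfy the Prometheus-internals assertions (sketches; the request-rate window is an assumption):

    prometheus_tsdb_head_series
    rate(prometheus_http_requests_total[5m])
    prometheus_tsdb_wal_storage_size_bytes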
diff --git a/evals/tasks/observability/queries/time-range-query.yaml b/evals/tasks/observability/queries/time-range-query.yaml index 2ac2c0d60..80a144be2 100644 --- a/evals/tasks/observability/queries/time-range-query.yaml +++ b/evals/tasks/observability/queries/time-range-query.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: multi-step description: | Tests whether the agent correctly uses execute_range_query with @@ -17,10 +17,10 @@ spec: verify: - llmJudge: contains: "container_cpu_usage_seconds_total" - reason: "Verify the agent called execute_range_query with the correct CPU metric container_cpu_usage_seconds_total and reported the results" + reason: "Verify the agent discovered and used the correct CPU metric" - llmJudge: contains: "pod" - reason: "Verify the agent reported range query results including specific pod names, or explicitly confirmed no CPU data is available for the given namespace" + reason: "Verify the response includes specific pod names from the query results" prompt: inline: | Show me the CPU usage trend for pods in the default namespace over the last 30 minutes. diff --git a/evals/tasks/observability/queries/visualize-cpu-usage.yaml b/evals/tasks/observability/queries/visualize-cpu-usage.yaml index 8894dabd5..74d88ac85 100644 --- a/evals/tasks/observability/queries/visualize-cpu-usage.yaml +++ b/evals/tasks/observability/queries/visualize-cpu-usage.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: visualization description: | Tests if the agent uses the show_timeseries tool to visualize CPU usage @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "container_cpu_usage_seconds_total" - reason: "Verify the agent called show_timeseries with the correct CPU metric container_cpu_usage_seconds_total and reported the visualization results" + reason: "Verify the agent discovered and visualized the correct CPU metric" prompt: inline: | Visualize the CPU usage for pods in the default namespace over the last 30 minutes.
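Both closing tasks revolve around the same range expression, executed via execute_range_query and show_timeseries respectively (sketch; the 60s step is an assumption):

    # Evaluated over start=now-30m, end=now, step=60s
    sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))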