From 1397087b474e78597a45c22cfdd510cf85cc0acf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Reme=C5=A1?= Date: Mon, 20 Apr 2026 14:01:37 +0200 Subject: [PATCH 1/3] feat: add observability toolset evals --- evals/claude-code/eval.yaml | 10 ++++++ evals/gemini-agent/eval.yaml | 10 ++++++ evals/openai-agent/eval.yaml | 10 ++++++ .../alerts/alert-investigation.yaml | 29 +++++++++++++++++ .../observability/alerts/filtered-alerts.yaml | 27 ++++++++++++++++ .../observability/alerts/get-alerts.yaml | 22 +++++++++++++ .../observability/alerts/get-silences.yaml | 22 +++++++++++++ .../observability/labels/get-series.yaml | 26 ++++++++++++++++ .../observability/labels/label-names.yaml | 26 ++++++++++++++++ .../observability/labels/label-values.yaml | 22 +++++++++++++ .../labels/series-by-namespace.yaml | 28 +++++++++++++++++ .../observability/metrics/list-metrics.yaml | 22 +++++++++++++ .../metrics/list-node-metrics.yaml | 22 +++++++++++++ .../queries/backend-reachability.yaml | 23 ++++++++++++++ .../observability/queries/cpu-usage.yaml | 25 +++++++++++++++ .../queries/crashlooping-pods.yaml | 22 +++++++++++++ .../queries/diagnose-cluster-health.yaml | 28 +++++++++++++++++ .../queries/high-cardinality-rejection.yaml | 24 ++++++++++++++ .../observability/queries/memory-usage.yaml | 25 +++++++++++++++ .../queries/namespace-pod-count.yaml | 27 ++++++++++++++++ .../queries/namespace-resource-usage.yaml | 31 +++++++++++++++++++ .../queries/network-traffic.yaml | 25 +++++++++++++++ .../queries/nonexistent-metric.yaml | 23 ++++++++++++++ .../queries/nonexistent-namespace.yaml | 23 ++++++++++++++ .../observability/queries/pending-pods.yaml | 23 ++++++++++++++ .../observability/queries/pods-created.yaml | 22 +++++++++++++ .../queries/prometheus-head-series.yaml | 22 +++++++++++++ .../queries/prometheus-requests.yaml | 22 +++++++++++++ .../queries/prometheus-wal-size.yaml | 22 +++++++++++++ .../queries/time-range-query.yaml | 26 ++++++++++++++++ .../queries/visualize-cpu-usage.yaml | 23 ++++++++++++++ 31 files changed, 712 insertions(+) create mode 100644 evals/tasks/observability/alerts/alert-investigation.yaml create mode 100644 evals/tasks/observability/alerts/filtered-alerts.yaml create mode 100644 evals/tasks/observability/alerts/get-alerts.yaml create mode 100644 evals/tasks/observability/alerts/get-silences.yaml create mode 100644 evals/tasks/observability/labels/get-series.yaml create mode 100644 evals/tasks/observability/labels/label-names.yaml create mode 100644 evals/tasks/observability/labels/label-values.yaml create mode 100644 evals/tasks/observability/labels/series-by-namespace.yaml create mode 100644 evals/tasks/observability/metrics/list-metrics.yaml create mode 100644 evals/tasks/observability/metrics/list-node-metrics.yaml create mode 100644 evals/tasks/observability/queries/backend-reachability.yaml create mode 100644 evals/tasks/observability/queries/cpu-usage.yaml create mode 100644 evals/tasks/observability/queries/crashlooping-pods.yaml create mode 100644 evals/tasks/observability/queries/diagnose-cluster-health.yaml create mode 100644 evals/tasks/observability/queries/high-cardinality-rejection.yaml create mode 100644 evals/tasks/observability/queries/memory-usage.yaml create mode 100644 evals/tasks/observability/queries/namespace-pod-count.yaml create mode 100644 evals/tasks/observability/queries/namespace-resource-usage.yaml create mode 100644 evals/tasks/observability/queries/network-traffic.yaml create mode 100644 
evals/tasks/observability/queries/nonexistent-metric.yaml create mode 100644 evals/tasks/observability/queries/nonexistent-namespace.yaml create mode 100644 evals/tasks/observability/queries/pending-pods.yaml create mode 100644 evals/tasks/observability/queries/pods-created.yaml create mode 100644 evals/tasks/observability/queries/prometheus-head-series.yaml create mode 100644 evals/tasks/observability/queries/prometheus-requests.yaml create mode 100644 evals/tasks/observability/queries/prometheus-wal-size.yaml create mode 100644 evals/tasks/observability/queries/time-range-query.yaml create mode 100644 evals/tasks/observability/queries/visualize-cpu-usage.yaml diff --git a/evals/claude-code/eval.yaml b/evals/claude-code/eval.yaml index 4a46b8131..bbf5c9e0d 100644 --- a/evals/claude-code/eval.yaml +++ b/evals/claude-code/eval.yaml @@ -72,3 +72,13 @@ config: toolPattern: ".*" minToolCalls: 1 maxToolCalls: 20 + # Observability tasks + - glob: ../tasks/observability/*/*.yaml + labelSelector: + suite: observability + assertions: + toolsUsed: + - server: kubernetes + toolPattern: ".*" + minToolCalls: 1 + maxToolCalls: 20 diff --git a/evals/gemini-agent/eval.yaml b/evals/gemini-agent/eval.yaml index d7ee6eb82..3a1a66132 100644 --- a/evals/gemini-agent/eval.yaml +++ b/evals/gemini-agent/eval.yaml @@ -17,3 +17,13 @@ config: toolsUsed: - server: kubernetes toolPattern: ".*" + # Observability tasks + - glob: ../tasks/observability/*/*.yaml + labelSelector: + suite: observability + assertions: + toolsUsed: + - server: kubernetes + toolPattern: ".*" + minToolCalls: 1 + maxToolCalls: 20 diff --git a/evals/openai-agent/eval.yaml b/evals/openai-agent/eval.yaml index d3a71773f..1dedd4787 100644 --- a/evals/openai-agent/eval.yaml +++ b/evals/openai-agent/eval.yaml @@ -72,3 +72,13 @@ config: toolPattern: ".*" minToolCalls: 1 maxToolCalls: 20 + # Observability tasks + - glob: ../tasks/observability/*/*.yaml + labelSelector: + suite: observability + assertions: + toolsUsed: + - server: kubernetes + toolPattern: ".*" + minToolCalls: 1 + maxToolCalls: 20 diff --git a/evals/tasks/observability/alerts/alert-investigation.yaml b/evals/tasks/observability/alerts/alert-investigation.yaml new file mode 100644 index 000000000..0e6b37819 --- /dev/null +++ b/evals/tasks/observability/alerts/alert-investigation.yaml @@ -0,0 +1,29 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "alert-investigation" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: alerts + toolType: multi-step + description: | + Tests if the agent can perform multi-step alert triage: first retrieving + alerts from Alertmanager, then investigating related metrics for the most + critical alert using Prometheus queries. +spec: + verify: + - llmJudge: + contains: "alert" + reason: "Verify the agent retrieved alerts from Alertmanager" + - llmJudge: + contains: "metric" + reason: "Verify the agent investigated a related Prometheus metric" + prompt: + inline: | + Check if there are any firing alerts. If there are, investigate + the related metrics for the most critical alert and summarize + what's happening. If there are no firing alerts, check cluster + health metrics instead. 
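For reference, one plausible investigation path for the triage task above is Prometheus's built-in ALERTS metric. This is a sketch only, not the prescribed tool flow; the follow-up query depends on whichever alert is actually firing:

    # currently firing alerts, grouped by name and severity
    sum by (alertname, severity) (ALERTS{alertstate="firing"})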
diff --git a/evals/tasks/observability/alerts/filtered-alerts.yaml b/evals/tasks/observability/alerts/filtered-alerts.yaml new file mode 100644 index 000000000..548c0363b --- /dev/null +++ b/evals/tasks/observability/alerts/filtered-alerts.yaml @@ -0,0 +1,27 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "filtered-alerts" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: alerts + toolType: alertmanager + description: | + Tests if the agent uses the get_alerts tool with filter parameters + to retrieve only active alerts matching a specific severity. The agent + should pass appropriate filter arguments rather than fetching all alerts + and filtering client-side. +spec: + verify: + - llmJudge: + contains: "alert" + reason: "Verify the agent retrieved and reported on alerts" + - llmJudge: + contains: "critical" + reason: "Verify the agent filtered or addressed the critical severity" + prompt: + inline: | + Are there any firing alerts with severity=critical? Show only active alerts. diff --git a/evals/tasks/observability/alerts/get-alerts.yaml b/evals/tasks/observability/alerts/get-alerts.yaml new file mode 100644 index 000000000..fa758d473 --- /dev/null +++ b/evals/tasks/observability/alerts/get-alerts.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "get-alerts" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: alerts + toolType: alertmanager + description: | + Tests if the agent can discover and use the get_alerts tool to retrieve + currently firing alerts from Alertmanager. +spec: + verify: + - llmJudge: + contains: "alerts" + reason: "Verify the output reports on the alert state, whether alerts are firing or not" + prompt: + inline: | + Check the cluster for any firing alerts and report what you find. diff --git a/evals/tasks/observability/alerts/get-silences.yaml b/evals/tasks/observability/alerts/get-silences.yaml new file mode 100644 index 000000000..726e60c31 --- /dev/null +++ b/evals/tasks/observability/alerts/get-silences.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "get-silences" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: alerts + toolType: alertmanager + description: | + Tests if the agent can discover and use the get_silences tool to retrieve + active silences from Alertmanager. +spec: + verify: + - llmJudge: + contains: "silence" + reason: "Verify the output discusses alert silences, including when no active silences exist" + prompt: + inline: | + Are there any active silences in Alertmanager? diff --git a/evals/tasks/observability/labels/get-series.yaml b/evals/tasks/observability/labels/get-series.yaml new file mode 100644 index 000000000..317e85ae6 --- /dev/null +++ b/evals/tasks/observability/labels/get-series.yaml @@ -0,0 +1,26 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "get-series-cardinality" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: labels + toolType: exploration + description: | + Tests if the agent can use the get_series tool to check cardinality for a metric. + The agent should first verify the metric exists via list_metrics, then use + get_series to retrieve matching time series and report the count. 
+spec: + verify: + - llmJudge: + contains: "series" + reason: "Verify the output reports time series information" + - llmJudge: + contains: "kube_pod_info" + reason: "Verify the agent queried the correct metric" + prompt: + inline: | + How many time series exist for the kube_pod_info metric? Show the cardinality. diff --git a/evals/tasks/observability/labels/label-names.yaml b/evals/tasks/observability/labels/label-names.yaml new file mode 100644 index 000000000..8d306d528 --- /dev/null +++ b/evals/tasks/observability/labels/label-names.yaml @@ -0,0 +1,26 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "label-names" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: labels + toolType: exploration + description: | + Tests if the agent follows the correct workflow: first calling list_metrics to + verify kube_pod_info exists, then calling get_label_names to discover available + labels for that metric. +spec: + verify: + - llmJudge: + contains: "namespace" + reason: "Verify the output includes the namespace label which is a standard Kubernetes label" + - llmJudge: + contains: "pod" + reason: "Verify the output includes the pod label" + prompt: + inline: | + What labels are available for the kube_pod_info metric? diff --git a/evals/tasks/observability/labels/label-values.yaml b/evals/tasks/observability/labels/label-values.yaml new file mode 100644 index 000000000..2376f5a86 --- /dev/null +++ b/evals/tasks/observability/labels/label-values.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "label-values" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: labels + toolType: exploration + description: | + Tests the full discovery workflow: list_metrics to verify the metric, then + get_label_values to retrieve unique namespace values for kube_pod_info. +spec: + verify: + - llmJudge: + contains: "namespace" + reason: "Verify the output lists actual namespace values from the cluster" + prompt: + inline: | + What are the unique namespace values for the kube_pod_info metric? diff --git a/evals/tasks/observability/labels/series-by-namespace.yaml b/evals/tasks/observability/labels/series-by-namespace.yaml new file mode 100644 index 000000000..c7984440c --- /dev/null +++ b/evals/tasks/observability/labels/series-by-namespace.yaml @@ -0,0 +1,28 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "series-by-namespace" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: labels + toolType: exploration + description: | + Tests if the agent can use the get_series tool with a label selector + to find time series scoped to a specific namespace. The agent should + first verify the metric exists, then use get_series with a namespace + matcher to report the cardinality within that scope. +spec: + verify: + - llmJudge: + contains: "series" + reason: "Verify the agent reported series information" + - llmJudge: + contains: "monitoring" + reason: "Verify the agent scoped to the monitoring namespace" + prompt: + inline: | + How many time series exist for container_cpu_usage_seconds_total + in the monitoring namespace? 
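The two get_series tasks above expect cardinality answers. Equivalent PromQL sketches, assuming kube-state-metrics and cAdvisor metrics are being scraped:

    # cluster-wide cardinality of kube_pod_info
    count(kube_pod_info)

    # series count for one metric scoped to a single namespace
    count(container_cpu_usage_seconds_total{namespace="monitoring"})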
diff --git a/evals/tasks/observability/metrics/list-metrics.yaml b/evals/tasks/observability/metrics/list-metrics.yaml new file mode 100644 index 000000000..6a8fe405a --- /dev/null +++ b/evals/tasks/observability/metrics/list-metrics.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "list-kube-metrics" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: metrics + toolType: discovery + description: | + Tests if the agent can discover the list_metrics tool to find Kubernetes-related + metrics. The agent should use the name_regex parameter to filter for kube metrics. +spec: + verify: + - llmJudge: + contains: "kube" + reason: "Verify the output lists Kubernetes metrics matching the kube prefix" + prompt: + inline: | + List all available Prometheus metrics that contain 'kube' in the name. diff --git a/evals/tasks/observability/metrics/list-node-metrics.yaml b/evals/tasks/observability/metrics/list-node-metrics.yaml new file mode 100644 index 000000000..9c87f7b43 --- /dev/null +++ b/evals/tasks/observability/metrics/list-node-metrics.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "list-node-metrics" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: metrics + toolType: discovery + description: | + Tests if the agent can discover node-related metrics using the list_metrics tool + with a regex filter for node metrics. +spec: + verify: + - llmJudge: + contains: "node" + reason: "Verify the output lists node-related metrics" + prompt: + inline: | + What node-related metrics are available in Prometheus? diff --git a/evals/tasks/observability/queries/backend-reachability.yaml b/evals/tasks/observability/queries/backend-reachability.yaml new file mode 100644 index 000000000..33b78f190 --- /dev/null +++ b/evals/tasks/observability/queries/backend-reachability.yaml @@ -0,0 +1,23 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "backend-reachability" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: smoke-test + description: | + Smoke test to verify the agent can reach the metrics backend and get a + valid response. The agent should query the up metric and confirm the + backend is reachable before any complex eval tasks run. +spec: + verify: + - llmJudge: + contains: "up" + reason: "Verify the agent confirmed the metrics backend is reachable and targets are up" + prompt: + inline: | + Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status. diff --git a/evals/tasks/observability/queries/cpu-usage.yaml b/evals/tasks/observability/queries/cpu-usage.yaml new file mode 100644 index 000000000..7d8c545ac --- /dev/null +++ b/evals/tasks/observability/queries/cpu-usage.yaml @@ -0,0 +1,25 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "cpu-usage" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can find the appropriate CPU usage metric and query it to + determine which pods are consuming the most CPU resources. +spec: + verify: + - llmJudge: + contains: "container_cpu_usage_seconds_total" + reason: "Verify the agent queried the correct CPU metric" + - llmJudge: + contains: "pod" + reason: "Verify the response identifies specific pods" + prompt: + inline: | + Which pods are using the most CPU? 
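A query that would satisfy both cpu-usage judges, sketched for a standard cAdvisor setup; the container!="" matcher, which drops the pod-level cgroup aggregate series, is an assumption about the environment rather than a task requirement:

    topk(5, sum by (pod) (rate(container_cpu_usage_seconds_total{container!=""}[5m])))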
diff --git a/evals/tasks/observability/queries/crashlooping-pods.yaml b/evals/tasks/observability/queries/crashlooping-pods.yaml new file mode 100644 index 000000000..fcad5ef03 --- /dev/null +++ b/evals/tasks/observability/queries/crashlooping-pods.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "crashlooping-pods" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: range-query + description: | + Tests if the agent can identify crashlooping pods by discovering and querying + the kube_pod_container_status_restarts_total metric over a time range. +spec: + verify: + - llmJudge: + contains: "kube_pod_container_status_restarts_total" + reason: "Verify the agent used the correct metric for container restart counts" + prompt: + inline: | + Which pods were crashlooping in the last 5 minutes? diff --git a/evals/tasks/observability/queries/diagnose-cluster-health.yaml b/evals/tasks/observability/queries/diagnose-cluster-health.yaml new file mode 100644 index 000000000..19ea059e1 --- /dev/null +++ b/evals/tasks/observability/queries/diagnose-cluster-health.yaml @@ -0,0 +1,28 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "diagnose-cluster-health" + difficulty: hard + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: diagnostic + description: | + Tests the agent's ability to handle an ambiguous diagnostic prompt. + The agent must autonomously decide which tools and metrics to check, + ideally covering alerts, node status, and pod health without being + told exactly what to look for. +spec: + verify: + - llmJudge: + contains: "cluster" + reason: "Verify the response provides an overall cluster health assessment" + - llmJudge: + contains: "node" + reason: "Verify the agent checked node-level health indicators" + prompt: + inline: | + Is the cluster healthy? Give me an overview of any issues. + Check alerts, node status, and pod health. diff --git a/evals/tasks/observability/queries/high-cardinality-rejection.yaml b/evals/tasks/observability/queries/high-cardinality-rejection.yaml new file mode 100644 index 000000000..f8e6d4cd5 --- /dev/null +++ b/evals/tasks/observability/queries/high-cardinality-rejection.yaml @@ -0,0 +1,24 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "high-cardinality-rejection" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: error-handling + description: | + Tests agent behavior when a query is rejected by obs-mcp guardrails + due to high cardinality. The agent should explain the guardrail + rejection and suggest a scoped alternative rather than giving up. +spec: + verify: + - llmJudge: + contains: "namespace" + reason: "Verify the agent suggests scoping the query by namespace or another dimension" + prompt: + inline: | + Show me the CPU, memory, network, and disk usage for every single container + across all namespaces over the last 24 hours with 1-second resolution. 
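PromQL sketches for the crashlooping-pods and high-cardinality-rejection tasks; the scoped query is one example of the alternative the guardrail-rejection judge looks for, not the only acceptable answer:

    # crashlooping pods: any restarts in the last 5 minutes
    increase(kube_pod_container_status_restarts_total[5m]) > 0

    # a namespace-scoped alternative to offer after a high-cardinality rejection
    sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))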
diff --git a/evals/tasks/observability/queries/memory-usage.yaml b/evals/tasks/observability/queries/memory-usage.yaml new file mode 100644 index 000000000..ffd1d113e --- /dev/null +++ b/evals/tasks/observability/queries/memory-usage.yaml @@ -0,0 +1,25 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "memory-usage" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can find the appropriate memory usage metric and query it + to determine which pods are consuming the most memory resources. +spec: + verify: + - llmJudge: + contains: "container_memory_working_set_bytes" + reason: "Verify the agent queried the correct memory metric" + - llmJudge: + contains: "pod" + reason: "Verify the response identifies specific pods" + prompt: + inline: | + Which pods are using the most memory? Show me the top 5. diff --git a/evals/tasks/observability/queries/namespace-pod-count.yaml b/evals/tasks/observability/queries/namespace-pod-count.yaml new file mode 100644 index 000000000..642fd905d --- /dev/null +++ b/evals/tasks/observability/queries/namespace-pod-count.yaml @@ -0,0 +1,27 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "namespace-pod-count" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: multi-step + description: | + Tests multi-step reasoning: the agent must discover a suitable metric + via list_metrics, explore label values to find namespaces, then query + to determine which namespaces have the most pods. Requires chaining + discovery, label exploration, and a query. +spec: + verify: + - llmJudge: + contains: "namespace" + reason: "Verify the response lists namespaces" + - llmJudge: + contains: "pod" + reason: "Verify the response includes pod counts" + prompt: + inline: | + Which namespaces have the most running pods? Show me the top 5. diff --git a/evals/tasks/observability/queries/namespace-resource-usage.yaml b/evals/tasks/observability/queries/namespace-resource-usage.yaml new file mode 100644 index 000000000..ad776a282 --- /dev/null +++ b/evals/tasks/observability/queries/namespace-resource-usage.yaml @@ -0,0 +1,31 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "namespace-resource-usage" + difficulty: hard + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: multi-step + description: | + Tests multi-step reasoning: the agent must discover CPU and memory metrics, + then query them with namespace-level aggregation to identify the top + resource-consuming namespaces. Requires chaining list_metrics, label + exploration, and multiple instant queries. +spec: + verify: + - llmJudge: + contains: "namespace" + reason: "Verify the response breaks down resource usage by namespace" + - llmJudge: + contains: "cpu" + reason: "Verify the response includes CPU usage data" + - llmJudge: + contains: "memory" + reason: "Verify the response includes memory usage data" + prompt: + inline: | + Which namespace is consuming the most CPU and memory? + Show me the top namespace for each. 
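Aggregation sketches for the three tasks above; the metric names match the judges, while the exact PromQL shape is left to the agent:

    # top 5 memory consumers by pod
    topk(5, sum by (pod) (container_memory_working_set_bytes{container!=""}))

    # running pods per namespace, top 5
    topk(5, sum by (namespace) (kube_pod_status_phase{phase="Running"}))

    # heaviest namespace by CPU
    topk(1, sum by (namespace) (rate(container_cpu_usage_seconds_total{container!=""}[5m])))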
diff --git a/evals/tasks/observability/queries/network-traffic.yaml b/evals/tasks/observability/queries/network-traffic.yaml new file mode 100644 index 000000000..23f147da0 --- /dev/null +++ b/evals/tasks/observability/queries/network-traffic.yaml @@ -0,0 +1,25 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "network-traffic" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can discover network-related metrics and query them to find + which pods are receiving the most network traffic. +spec: + verify: + - llmJudge: + contains: "container_network_receive_bytes_total" + reason: "Verify the agent queried the correct network metric" + - llmJudge: + contains: "pod" + reason: "Verify the response identifies specific pods" + prompt: + inline: | + Which pods are receiving the most network traffic? diff --git a/evals/tasks/observability/queries/nonexistent-metric.yaml b/evals/tasks/observability/queries/nonexistent-metric.yaml new file mode 100644 index 000000000..b3890eeb3 --- /dev/null +++ b/evals/tasks/observability/queries/nonexistent-metric.yaml @@ -0,0 +1,23 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "nonexistent-metric" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: error-handling + description: | + Tests agent recovery when querying a metric that does not exist. + The agent should discover that the metric is missing via list_metrics + and inform the user rather than fabricating results. +spec: + verify: + - llmJudge: + contains: "not found" + reason: "Verify the agent communicates that the metric does not exist or was not found" + prompt: + inline: | + What is the current value of the metric fake_nonexistent_metric_total? diff --git a/evals/tasks/observability/queries/nonexistent-namespace.yaml b/evals/tasks/observability/queries/nonexistent-namespace.yaml new file mode 100644 index 000000000..b9e440814 --- /dev/null +++ b/evals/tasks/observability/queries/nonexistent-namespace.yaml @@ -0,0 +1,23 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "nonexistent-namespace" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: error-handling + description: | + Tests agent behavior when querying for resources in a namespace that + does not exist. The agent should query and report empty results + gracefully rather than hallucinating data. +spec: + verify: + - llmJudge: + contains: "no data" + reason: "Verify the agent reports no data, no results, or no pods found in the nonexistent namespace" + prompt: + inline: | + Show me the memory usage for all pods in the namespace called totally-fake-namespace-12345. diff --git a/evals/tasks/observability/queries/pending-pods.yaml b/evals/tasks/observability/queries/pending-pods.yaml new file mode 100644 index 000000000..7f06f5d9e --- /dev/null +++ b/evals/tasks/observability/queries/pending-pods.yaml @@ -0,0 +1,23 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "pending-pods" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can identify pods stuck in pending state by first discovering + the kube_pod_status_phase metric and then running an instant query to find + pods with phase=Pending. 
+spec: + verify: + - llmJudge: + contains: "kube_pod_status_phase" + reason: "Verify the agent used the correct metric for pod phase status" + prompt: + inline: | + Which pods are stuck in pending state? diff --git a/evals/tasks/observability/queries/pods-created.yaml b/evals/tasks/observability/queries/pods-created.yaml new file mode 100644 index 000000000..a8cee3114 --- /dev/null +++ b/evals/tasks/observability/queries/pods-created.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "pods-created" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: range-query + description: | + Tests if the agent can use a range query to find recently created pods by + discovering the kube_pod_created metric and querying it over a 5-minute window. +spec: + verify: + - llmJudge: + contains: "kube_pod_created" + reason: "Verify the agent used the correct metric for pod creation timestamps" + prompt: + inline: | + How many pods were created in the last 5 minutes? diff --git a/evals/tasks/observability/queries/prometheus-head-series.yaml b/evals/tasks/observability/queries/prometheus-head-series.yaml new file mode 100644 index 000000000..ee89d5953 --- /dev/null +++ b/evals/tasks/observability/queries/prometheus-head-series.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "prometheus-head-series" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can query Prometheus internal metrics to report the current + number of head series using prometheus_tsdb_head_series. +spec: + verify: + - llmJudge: + contains: "prometheus_tsdb_head_series" + reason: "Verify the agent used the correct Prometheus TSDB metric" + prompt: + inline: | + How many head series does Prometheus have? diff --git a/evals/tasks/observability/queries/prometheus-requests.yaml b/evals/tasks/observability/queries/prometheus-requests.yaml new file mode 100644 index 000000000..9560170ae --- /dev/null +++ b/evals/tasks/observability/queries/prometheus-requests.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "prometheus-requests" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can calculate the request rate to Prometheus by discovering + and querying the prometheus_http_requests_total metric. +spec: + verify: + - llmJudge: + contains: "prometheus_http_requests_total" + reason: "Verify the agent used the correct HTTP requests metric" + prompt: + inline: | + How many requests per second are being made to Prometheus? diff --git a/evals/tasks/observability/queries/prometheus-wal-size.yaml b/evals/tasks/observability/queries/prometheus-wal-size.yaml new file mode 100644 index 000000000..af6d18293 --- /dev/null +++ b/evals/tasks/observability/queries/prometheus-wal-size.yaml @@ -0,0 +1,22 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "prometheus-wal-size" + difficulty: easy + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: query + description: | + Tests if the agent can query the current Prometheus WAL storage size using + the prometheus_tsdb_wal_storage_size_bytes metric. 
+spec: + verify: + - llmJudge: + contains: "prometheus_tsdb_wal_storage_size_bytes" + reason: "Verify the agent used the correct WAL storage metric" + prompt: + inline: | + What is the current storage size of the Prometheus WAL? diff --git a/evals/tasks/observability/queries/time-range-query.yaml b/evals/tasks/observability/queries/time-range-query.yaml new file mode 100644 index 000000000..521423c8c --- /dev/null +++ b/evals/tasks/observability/queries/time-range-query.yaml @@ -0,0 +1,26 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "time-range-query" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: multi-step + description: | + Tests whether the agent correctly uses execute_range_query with + appropriate start/end/step parameters when asked for data over + a specific time window. +spec: + verify: + - llmJudge: + contains: "30 minutes" + reason: "Verify the agent honoured the requested 30-minute time window" + - llmJudge: + contains: "cpu" + reason: "Verify the response includes CPU usage data" + prompt: + inline: | + Show me the CPU usage trend for pods in the default namespace over the last 30 minutes. diff --git a/evals/tasks/observability/queries/visualize-cpu-usage.yaml b/evals/tasks/observability/queries/visualize-cpu-usage.yaml new file mode 100644 index 000000000..0f6af1eb6 --- /dev/null +++ b/evals/tasks/observability/queries/visualize-cpu-usage.yaml @@ -0,0 +1,23 @@ +kind: Task +apiVersion: mcpchecker/v1alpha2 +metadata: + name: "visualize-cpu-usage" + difficulty: medium + parallel: true + runs: 1 + labels: + suite: observability + category: queries + toolType: visualization + description: | + Tests if the agent uses the show_timeseries tool to visualize CPU usage + as a chart. The agent should discover the metric, then use show_timeseries + to render a time-series visualization. +spec: + verify: + - llmJudge: + contains: "cpu" + reason: "Verify the agent queried and visualized CPU usage data" + prompt: + inline: | + Visualize the CPU usage for pods in the default namespace over the last 30 minutes. 
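For the two range-query tasks closing this patch, the expression itself is ordinary PromQL evaluated repeatedly over the window; the start/end/step parameter names are an assumption inferred from the execute_range_query tool name, not confirmed by the tasks:

    # evaluated with, e.g., start=now-30m, end=now, step=60s
    sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))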
From 44ce079fe786b0d6a0dabe4b024297e435066993 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Reme=C5=A1?= Date: Wed, 22 Apr 2026 12:25:10 +0200 Subject: [PATCH 2/3] attempt to improve the observability evals --- .../observability/alerts/alert-investigation.yaml | 8 ++++---- evals/tasks/observability/alerts/filtered-alerts.yaml | 8 ++++---- evals/tasks/observability/alerts/get-alerts.yaml | 4 ++-- evals/tasks/observability/labels/get-series.yaml | 8 ++++---- evals/tasks/observability/labels/label-names.yaml | 4 ++-- evals/tasks/observability/labels/label-values.yaml | 4 ++-- .../observability/labels/series-by-namespace.yaml | 8 ++++---- evals/tasks/observability/metrics/list-metrics.yaml | 4 ++-- .../observability/metrics/list-node-metrics.yaml | 4 ++-- .../observability/queries/backend-reachability.yaml | 4 ++-- evals/tasks/observability/queries/cpu-usage.yaml | 4 ++-- .../observability/queries/crashlooping-pods.yaml | 2 +- .../queries/diagnose-cluster-health.yaml | 8 ++++---- .../queries/high-cardinality-rejection.yaml | 4 ++-- evals/tasks/observability/queries/memory-usage.yaml | 4 ++-- .../observability/queries/namespace-pod-count.yaml | 8 ++++---- .../queries/namespace-resource-usage.yaml | 11 ++++------- .../tasks/observability/queries/network-traffic.yaml | 4 ++-- .../observability/queries/nonexistent-metric.yaml | 2 +- .../observability/queries/nonexistent-namespace.yaml | 2 +- evals/tasks/observability/queries/pending-pods.yaml | 2 +- evals/tasks/observability/queries/pods-created.yaml | 2 +- .../observability/queries/prometheus-head-series.yaml | 2 +- .../observability/queries/prometheus-requests.yaml | 2 +- .../observability/queries/prometheus-wal-size.yaml | 2 +- .../tasks/observability/queries/time-range-query.yaml | 8 ++++---- .../observability/queries/visualize-cpu-usage.yaml | 4 ++-- 27 files changed, 62 insertions(+), 65 deletions(-) diff --git a/evals/tasks/observability/alerts/alert-investigation.yaml b/evals/tasks/observability/alerts/alert-investigation.yaml index 0e6b37819..da5369366 100644 --- a/evals/tasks/observability/alerts/alert-investigation.yaml +++ b/evals/tasks/observability/alerts/alert-investigation.yaml @@ -16,11 +16,11 @@ metadata: spec: verify: - llmJudge: - contains: "alert" - reason: "Verify the agent retrieved alerts from Alertmanager" + contains: "alertname" + reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names from Alertmanager, or explicitly confirming that no alerts are currently firing" - llmJudge: - contains: "metric" - reason: "Verify the agent investigated a related Prometheus metric" + contains: "query" + reason: "Verify the agent investigated related metrics by running a Prometheus query" prompt: inline: | Check if there are any firing alerts. 
If there are, investigate diff --git a/evals/tasks/observability/alerts/filtered-alerts.yaml b/evals/tasks/observability/alerts/filtered-alerts.yaml index 548c0363b..2726598b0 100644 --- a/evals/tasks/observability/alerts/filtered-alerts.yaml +++ b/evals/tasks/observability/alerts/filtered-alerts.yaml @@ -17,11 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "alert" - reason: "Verify the agent retrieved and reported on alerts" + contains: "alertname" + reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names with their status, or explicitly confirming that no critical alerts exist" - llmJudge: - contains: "critical" - reason: "Verify the agent filtered or addressed the critical severity" + contains: "severity" + reason: "Verify the agent addressed the severity filter in its response" prompt: inline: | Are there any firing alerts with severity=critical? Show only active alerts. diff --git a/evals/tasks/observability/alerts/get-alerts.yaml b/evals/tasks/observability/alerts/get-alerts.yaml index fa758d473..3b167faa7 100644 --- a/evals/tasks/observability/alerts/get-alerts.yaml +++ b/evals/tasks/observability/alerts/get-alerts.yaml @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "alerts" - reason: "Verify the output reports on the alert state, whether alerts are firing or not" + contains: "alertname" + reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names with their status, or explicitly confirming that no alerts are currently firing" prompt: inline: | Check the cluster for any firing alerts and report what you find. diff --git a/evals/tasks/observability/labels/get-series.yaml b/evals/tasks/observability/labels/get-series.yaml index 317e85ae6..6cd93a4da 100644 --- a/evals/tasks/observability/labels/get-series.yaml +++ b/evals/tasks/observability/labels/get-series.yaml @@ -16,11 +16,11 @@ metadata: spec: verify: - llmJudge: - contains: "series" - reason: "Verify the output reports time series information" + contains: "namespace" + reason: "Verify the agent called get_series and reported the results: either listing actual series data containing label dimensions like namespace, or confirming no series were found" - llmJudge: - contains: "kube_pod_info" - reason: "Verify the agent queried the correct metric" + contains: "node" + reason: "Verify the agent called get_series and reported series data with node label values, or explicitly confirmed the metric has no series with node labels" prompt: inline: | How many time series exist for the kube_pod_info metric? Show the cardinality. diff --git a/evals/tasks/observability/labels/label-names.yaml b/evals/tasks/observability/labels/label-names.yaml index 8d306d528..084c923a9 100644 --- a/evals/tasks/observability/labels/label-names.yaml +++ b/evals/tasks/observability/labels/label-names.yaml @@ -17,10 +17,10 @@ spec: verify: - llmJudge: contains: "namespace" - reason: "Verify the output includes the namespace label which is a standard Kubernetes label" + reason: "Verify the agent called get_label_names and reported the results: the output should include the namespace label which is a standard Kubernetes label" - llmJudge: contains: "pod" - reason: "Verify the output includes the pod label" + reason: "Verify the agent called get_label_names and reported the results: the output should include the pod label" prompt: inline: | What labels are available for the kube_pod_info metric? 
diff --git a/evals/tasks/observability/labels/label-values.yaml b/evals/tasks/observability/labels/label-values.yaml index 2376f5a86..e012ebca3 100644 --- a/evals/tasks/observability/labels/label-values.yaml +++ b/evals/tasks/observability/labels/label-values.yaml @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the output lists actual namespace values from the cluster" + contains: "kube-system" + reason: "Verify the agent called get_label_values and reported the results: the output should list actual namespace values from the cluster such as kube-system" prompt: inline: | What are the unique namespace values for the kube_pod_info metric? diff --git a/evals/tasks/observability/labels/series-by-namespace.yaml b/evals/tasks/observability/labels/series-by-namespace.yaml index c7984440c..f782a5a4d 100644 --- a/evals/tasks/observability/labels/series-by-namespace.yaml +++ b/evals/tasks/observability/labels/series-by-namespace.yaml @@ -17,11 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "series" - reason: "Verify the agent reported series information" + contains: "pod" + reason: "Verify the agent called get_series and reported the results: either listing actual series data containing label dimensions like pod, or confirming no series were found for the given namespace" - llmJudge: - contains: "monitoring" - reason: "Verify the agent scoped to the monitoring namespace" + contains: "container" + reason: "Verify the agent called get_series and reported series with container label values from the monitoring namespace, or explicitly confirmed no matching series exist" prompt: inline: | How many time series exist for container_cpu_usage_seconds_total diff --git a/evals/tasks/observability/metrics/list-metrics.yaml b/evals/tasks/observability/metrics/list-metrics.yaml index 6a8fe405a..ee3283059 100644 --- a/evals/tasks/observability/metrics/list-metrics.yaml +++ b/evals/tasks/observability/metrics/list-metrics.yaml @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "kube" - reason: "Verify the output lists Kubernetes metrics matching the kube prefix" + contains: "kube_pod_info" + reason: "Verify the agent called list_metrics and reported the results: the output should list specific kube metrics discovered from Prometheus" prompt: inline: | List all available Prometheus metrics that contain 'kube' in the name. diff --git a/evals/tasks/observability/metrics/list-node-metrics.yaml b/evals/tasks/observability/metrics/list-node-metrics.yaml index 9c87f7b43..d6f4184a4 100644 --- a/evals/tasks/observability/metrics/list-node-metrics.yaml +++ b/evals/tasks/observability/metrics/list-node-metrics.yaml @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "node" - reason: "Verify the output lists node-related metrics" + contains: "node_" + reason: "Verify the agent called list_metrics and reported the results: the output should list specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)" prompt: inline: | What node-related metrics are available in Prometheus? 
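The sharpened judge strings above (kube-system, kube_pod_info, node_) are what a stock kube-prometheus install returns through the standard Prometheus HTTP API, which these discovery tools presumably wrap; the endpoints are standard Prometheus, the wrapping is an assumption:

    GET /api/v1/label/__name__/values                          # list_metrics
    GET /api/v1/label/namespace/values?match[]=kube_pod_info   # get_label_values
    GET /api/v1/series?match[]=kube_pod_info                   # get_series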
diff --git a/evals/tasks/observability/queries/backend-reachability.yaml b/evals/tasks/observability/queries/backend-reachability.yaml index 33b78f190..5b5009583 100644 --- a/evals/tasks/observability/queries/backend-reachability.yaml +++ b/evals/tasks/observability/queries/backend-reachability.yaml @@ -16,8 +16,8 @@ metadata: spec: verify: - llmJudge: - contains: "up" - reason: "Verify the agent confirmed the metrics backend is reachable and targets are up" + contains: "prometheus-k8s" + reason: "Verify the agent called execute_instant_query with the up metric and reported the results: the response should confirm the prometheus-k8s target status" prompt: inline: | Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status. diff --git a/evals/tasks/observability/queries/cpu-usage.yaml b/evals/tasks/observability/queries/cpu-usage.yaml index 7d8c545ac..f4c6a0f67 100644 --- a/evals/tasks/observability/queries/cpu-usage.yaml +++ b/evals/tasks/observability/queries/cpu-usage.yaml @@ -16,10 +16,10 @@ spec: verify: - llmJudge: contains: "container_cpu_usage_seconds_total" - reason: "Verify the agent queried the correct CPU metric" + reason: "Verify the agent called execute_instant_query with the correct CPU metric container_cpu_usage_seconds_total and reported the results" - llmJudge: contains: "pod" - reason: "Verify the response identifies specific pods" + reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no CPU data is available" prompt: inline: | Which pods are using the most CPU? diff --git a/evals/tasks/observability/queries/crashlooping-pods.yaml b/evals/tasks/observability/queries/crashlooping-pods.yaml index fcad5ef03..24119f551 100644 --- a/evals/tasks/observability/queries/crashlooping-pods.yaml +++ b/evals/tasks/observability/queries/crashlooping-pods.yaml @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube_pod_container_status_restarts_total" - reason: "Verify the agent used the correct metric for container restart counts" + reason: "Verify the agent called execute_range_query with the correct metric kube_pod_container_status_restarts_total and reported the results: either listing crashlooping pods, or explicitly confirming no pods have restarted" prompt: inline: | Which pods were crashlooping in the last 5 minutes? diff --git a/evals/tasks/observability/queries/diagnose-cluster-health.yaml b/evals/tasks/observability/queries/diagnose-cluster-health.yaml index 19ea059e1..a2e45e1a9 100644 --- a/evals/tasks/observability/queries/diagnose-cluster-health.yaml +++ b/evals/tasks/observability/queries/diagnose-cluster-health.yaml @@ -17,11 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "cluster" - reason: "Verify the response provides an overall cluster health assessment" + contains: "alertname" + reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names, or explicitly confirming that no alerts are currently firing" - llmJudge: - contains: "node" - reason: "Verify the agent checked node-level health indicators" + contains: "kube_" + reason: "Verify the agent called execute_instant_query with Kubernetes metrics (e.g. kube_node_status_condition) and reported the results for health assessment" prompt: inline: | Is the cluster healthy? Give me an overview of any issues. 
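One instant query that would satisfy the kube_ judge in diagnose-cluster-health, assuming kube-state-metrics is present:

    # nodes currently reporting Ready
    kube_node_status_condition{condition="Ready",status="true"}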
diff --git a/evals/tasks/observability/queries/high-cardinality-rejection.yaml b/evals/tasks/observability/queries/high-cardinality-rejection.yaml index f8e6d4cd5..9a5cf78c9 100644 --- a/evals/tasks/observability/queries/high-cardinality-rejection.yaml +++ b/evals/tasks/observability/queries/high-cardinality-rejection.yaml @@ -16,8 +16,8 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the agent suggests scoping the query by namespace or another dimension" + contains: "guardrail" + reason: "Verify the agent attempted a query that was rejected by a guardrail and reported the result: the response should explain the guardrail rejection and suggest a scoped alternative" prompt: inline: | Show me the CPU, memory, network, and disk usage for every single container diff --git a/evals/tasks/observability/queries/memory-usage.yaml b/evals/tasks/observability/queries/memory-usage.yaml index ffd1d113e..2a41e2c83 100644 --- a/evals/tasks/observability/queries/memory-usage.yaml +++ b/evals/tasks/observability/queries/memory-usage.yaml @@ -16,10 +16,10 @@ spec: verify: - llmJudge: contains: "container_memory_working_set_bytes" - reason: "Verify the agent queried the correct memory metric" + reason: "Verify the agent called execute_instant_query with the correct memory metric container_memory_working_set_bytes and reported the results" - llmJudge: contains: "pod" - reason: "Verify the response identifies specific pods" + reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no memory data is available" prompt: inline: | Which pods are using the most memory? Show me the top 5. diff --git a/evals/tasks/observability/queries/namespace-pod-count.yaml b/evals/tasks/observability/queries/namespace-pod-count.yaml index 642fd905d..11ef3d0f8 100644 --- a/evals/tasks/observability/queries/namespace-pod-count.yaml +++ b/evals/tasks/observability/queries/namespace-pod-count.yaml @@ -17,11 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the response lists namespaces" + contains: "kube-system" + reason: "Verify the agent called execute_instant_query and reported the results: the response should list actual namespace names from the cluster such as kube-system" - llmJudge: - contains: "pod" - reason: "Verify the response includes pod counts" + contains: "kube_pod" + reason: "Verify the agent called execute_instant_query with a kube_pod metric to count pods and reported the results" prompt: inline: | Which namespaces have the most running pods? Show me the top 5. 
diff --git a/evals/tasks/observability/queries/namespace-resource-usage.yaml b/evals/tasks/observability/queries/namespace-resource-usage.yaml index ad776a282..26ee2d73f 100644 --- a/evals/tasks/observability/queries/namespace-resource-usage.yaml +++ b/evals/tasks/observability/queries/namespace-resource-usage.yaml @@ -17,14 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the response breaks down resource usage by namespace" + contains: "container_cpu_usage_seconds_total" + reason: "Verify the agent called list_metrics to discover and then executed a query with the correct CPU metric container_cpu_usage_seconds_total, reporting the results" - llmJudge: - contains: "cpu" - reason: "Verify the response includes CPU usage data" - - llmJudge: - contains: "memory" - reason: "Verify the response includes memory usage data" + contains: "container_memory_working_set_bytes" + reason: "Verify the agent called list_metrics to discover and then executed a query with the correct memory metric container_memory_working_set_bytes, reporting the results" prompt: inline: | Which namespace is consuming the most CPU and memory? diff --git a/evals/tasks/observability/queries/network-traffic.yaml b/evals/tasks/observability/queries/network-traffic.yaml index 23f147da0..879b7a943 100644 --- a/evals/tasks/observability/queries/network-traffic.yaml +++ b/evals/tasks/observability/queries/network-traffic.yaml @@ -16,10 +16,10 @@ spec: verify: - llmJudge: contains: "container_network_receive_bytes_total" - reason: "Verify the agent queried the correct network metric" + reason: "Verify the agent called execute_instant_query with the correct network metric container_network_receive_bytes_total and reported the results" - llmJudge: contains: "pod" - reason: "Verify the response identifies specific pods" + reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no network data is available" prompt: inline: | Which pods are receiving the most network traffic? diff --git a/evals/tasks/observability/queries/nonexistent-metric.yaml b/evals/tasks/observability/queries/nonexistent-metric.yaml index b3890eeb3..25f0dbd22 100644 --- a/evals/tasks/observability/queries/nonexistent-metric.yaml +++ b/evals/tasks/observability/queries/nonexistent-metric.yaml @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "not found" - reason: "Verify the agent communicates that the metric does not exist or was not found" + reason: "Verify the agent called list_metrics and reported the result: explicitly communicating that the metric does not exist or was not found" prompt: inline: | What is the current value of the metric fake_nonexistent_metric_total? diff --git a/evals/tasks/observability/queries/nonexistent-namespace.yaml b/evals/tasks/observability/queries/nonexistent-namespace.yaml index b9e440814..3384f43e3 100644 --- a/evals/tasks/observability/queries/nonexistent-namespace.yaml +++ b/evals/tasks/observability/queries/nonexistent-namespace.yaml @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "no data" - reason: "Verify the agent reports no data, no results, or no pods found in the nonexistent namespace" + reason: "Verify the agent called execute_instant_query and reported the result: explicitly confirming no data, no results, or no pods found in the nonexistent namespace" prompt: inline: | Show me the memory usage for all pods in the namespace called totally-fake-namespace-12345. 
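For the nonexistent-metric task above, an agent can also confirm absence positively instead of inferring it from an empty instant-query result; absent() is standard PromQL, though the task does not require it:

    absent(fake_nonexistent_metric_total)  # returns 1 when the metric has no series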
diff --git a/evals/tasks/observability/queries/pending-pods.yaml b/evals/tasks/observability/queries/pending-pods.yaml index 7f06f5d9e..5ca55f27e 100644 --- a/evals/tasks/observability/queries/pending-pods.yaml +++ b/evals/tasks/observability/queries/pending-pods.yaml @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "kube_pod_status_phase" - reason: "Verify the agent used the correct metric for pod phase status" + reason: "Verify the agent called execute_instant_query with the correct metric kube_pod_status_phase and reported the results: either listing pending pods, or explicitly confirming no pods are in pending state" prompt: inline: | Which pods are stuck in pending state? diff --git a/evals/tasks/observability/queries/pods-created.yaml b/evals/tasks/observability/queries/pods-created.yaml index a8cee3114..cacf0e76b 100644 --- a/evals/tasks/observability/queries/pods-created.yaml +++ b/evals/tasks/observability/queries/pods-created.yaml @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube_pod_created" - reason: "Verify the agent used the correct metric for pod creation timestamps" + reason: "Verify the agent called execute_range_query with the kube_pod_created metric and reported the results: either listing recently created pods, or explicitly confirming no pods were created in the time window" prompt: inline: | How many pods were created in the last 5 minutes? diff --git a/evals/tasks/observability/queries/prometheus-head-series.yaml b/evals/tasks/observability/queries/prometheus-head-series.yaml index ee89d5953..bcf2c77c8 100644 --- a/evals/tasks/observability/queries/prometheus-head-series.yaml +++ b/evals/tasks/observability/queries/prometheus-head-series.yaml @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_tsdb_head_series" - reason: "Verify the agent used the correct Prometheus TSDB metric" + reason: "Verify the agent called execute_instant_query with the correct Prometheus TSDB metric prometheus_tsdb_head_series and reported the current head series count" prompt: inline: | How many head series does Prometheus have? diff --git a/evals/tasks/observability/queries/prometheus-requests.yaml b/evals/tasks/observability/queries/prometheus-requests.yaml index 9560170ae..4445d033a 100644 --- a/evals/tasks/observability/queries/prometheus-requests.yaml +++ b/evals/tasks/observability/queries/prometheus-requests.yaml @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_http_requests_total" - reason: "Verify the agent used the correct HTTP requests metric" + reason: "Verify the agent called execute_instant_query with the correct HTTP requests metric prometheus_http_requests_total and reported the request rate" prompt: inline: | How many requests per second are being made to Prometheus? diff --git a/evals/tasks/observability/queries/prometheus-wal-size.yaml b/evals/tasks/observability/queries/prometheus-wal-size.yaml index af6d18293..14cb8a944 100644 --- a/evals/tasks/observability/queries/prometheus-wal-size.yaml +++ b/evals/tasks/observability/queries/prometheus-wal-size.yaml @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_tsdb_wal_storage_size_bytes" - reason: "Verify the agent used the correct WAL storage metric" + reason: "Verify the agent called execute_instant_query with the correct WAL storage metric prometheus_tsdb_wal_storage_size_bytes and reported the current WAL size" prompt: inline: | What is the current storage size of the Prometheus WAL? 
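Sketches for the three Prometheus self-monitoring tasks above; all three metrics are standard Prometheus internals:

    sum(rate(prometheus_http_requests_total[5m]))  # requests per second
    prometheus_tsdb_head_series                    # current head series count
    prometheus_tsdb_wal_storage_size_bytes         # WAL size on disk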
diff --git a/evals/tasks/observability/queries/time-range-query.yaml b/evals/tasks/observability/queries/time-range-query.yaml index 521423c8c..2ac2c0d60 100644 --- a/evals/tasks/observability/queries/time-range-query.yaml +++ b/evals/tasks/observability/queries/time-range-query.yaml @@ -16,11 +16,11 @@ metadata: spec: verify: - llmJudge: - contains: "30 minutes" - reason: "Verify the agent honoured the requested 30-minute time window" + contains: "container_cpu_usage_seconds_total" + reason: "Verify the agent called execute_range_query with the correct CPU metric container_cpu_usage_seconds_total and reported the results" - llmJudge: - contains: "cpu" - reason: "Verify the response includes CPU usage data" + contains: "pod" + reason: "Verify the agent reported range query results including specific pod names, or explicitly confirmed no CPU data is available for the given namespace" prompt: inline: | Show me the CPU usage trend for pods in the default namespace over the last 30 minutes. diff --git a/evals/tasks/observability/queries/visualize-cpu-usage.yaml b/evals/tasks/observability/queries/visualize-cpu-usage.yaml index 0f6af1eb6..8894dabd5 100644 --- a/evals/tasks/observability/queries/visualize-cpu-usage.yaml +++ b/evals/tasks/observability/queries/visualize-cpu-usage.yaml @@ -16,8 +16,8 @@ metadata: spec: verify: - llmJudge: - contains: "cpu" - reason: "Verify the agent queried and visualized CPU usage data" + contains: "container_cpu_usage_seconds_total" + reason: "Verify the agent called show_timeseries with the correct CPU metric container_cpu_usage_seconds_total and reported the visualization results" prompt: inline: | Visualize the CPU usage for pods in the default namespace over the last 30 minutes. From f0113dd4bc60e67a05e0d185b828476fcf19391d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Reme=C5=A1?= Date: Thu, 23 Apr 2026 15:58:29 +0200 Subject: [PATCH 3/3] update to the latest version --- .../alerts/alert-investigation.yaml | 17 ++++++++--------- .../observability/alerts/filtered-alerts.yaml | 11 ++++------- .../tasks/observability/alerts/get-alerts.yaml | 6 +++--- .../observability/alerts/get-silences.yaml | 6 +++--- .../tasks/observability/labels/get-series.yaml | 10 +++++----- .../tasks/observability/labels/label-names.yaml | 6 +++--- .../observability/labels/label-values.yaml | 4 ++-- .../labels/series-by-namespace.yaml | 8 ++++---- .../observability/metrics/list-metrics.yaml | 4 ++-- .../metrics/list-node-metrics.yaml | 4 ++-- .../queries/backend-reachability.yaml | 4 ++-- .../tasks/observability/queries/cpu-usage.yaml | 6 +++--- .../queries/crashlooping-pods.yaml | 4 ++-- .../queries/diagnose-cluster-health.yaml | 8 ++++---- .../queries/high-cardinality-rejection.yaml | 4 ++-- .../observability/queries/memory-usage.yaml | 8 ++++---- .../queries/namespace-pod-count.yaml | 6 +++--- .../queries/namespace-resource-usage.yaml | 8 ++++---- .../observability/queries/network-traffic.yaml | 6 +++--- .../queries/nonexistent-metric.yaml | 4 ++-- .../queries/nonexistent-namespace.yaml | 4 ++-- .../observability/queries/pending-pods.yaml | 4 ++-- .../observability/queries/pods-created.yaml | 4 ++-- .../queries/prometheus-head-series.yaml | 4 ++-- .../queries/prometheus-requests.yaml | 4 ++-- .../queries/prometheus-wal-size.yaml | 4 ++-- .../observability/queries/time-range-query.yaml | 6 +++--- .../queries/visualize-cpu-usage.yaml | 4 ++-- 28 files changed, 82 insertions(+), 86 deletions(-) diff --git a/evals/tasks/observability/alerts/alert-investigation.yaml 
b/evals/tasks/observability/alerts/alert-investigation.yaml index da5369366..f0f35ce2d 100644 --- a/evals/tasks/observability/alerts/alert-investigation.yaml +++ b/evals/tasks/observability/alerts/alert-investigation.yaml @@ -6,24 +6,23 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: alerts + suite: observability toolType: multi-step description: | Tests if the agent can perform multi-step alert triage: first retrieving - alerts from Alertmanager, then investigating related metrics for the most - critical alert using Prometheus queries. + alerts from Alertmanager, then investigating related metrics for a + firing alert using Prometheus queries. spec: verify: - llmJudge: - contains: "alertname" - reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names from Alertmanager, or explicitly confirming that no alerts are currently firing" + contains: "AlertmanagerReceiversNotConfigured" + reason: "Verify the agent retrieved firing alerts and identified AlertmanagerReceiversNotConfigured" - llmJudge: - contains: "query" - reason: "Verify the agent investigated related metrics by running a Prometheus query" + contains: "alertmanager" + reason: "Verify the agent investigated Alertmanager-related metrics for the alert" prompt: inline: | Check if there are any firing alerts. If there are, investigate the related metrics for the most critical alert and summarize - what's happening. If there are no firing alerts, check cluster - health metrics instead. + what's happening. diff --git a/evals/tasks/observability/alerts/filtered-alerts.yaml b/evals/tasks/observability/alerts/filtered-alerts.yaml index 2726598b0..6f9aaf91a 100644 --- a/evals/tasks/observability/alerts/filtered-alerts.yaml +++ b/evals/tasks/observability/alerts/filtered-alerts.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: alerts + suite: observability toolType: alertmanager description: | Tests if the agent uses the get_alerts tool with filter parameters @@ -17,11 +17,8 @@ metadata: spec: verify: - llmJudge: - contains: "alertname" - reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names with their status, or explicitly confirming that no critical alerts exist" - - llmJudge: - contains: "severity" - reason: "Verify the agent addressed the severity filter in its response" + contains: "AlertmanagerReceiversNotConfigured" + reason: "Verify the agent filtered for warning-severity alerts and found AlertmanagerReceiversNotConfigured" prompt: inline: | - Are there any firing alerts with severity=critical? Show only active alerts. + Show me only the active alerts with severity=warning. 
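The firing alerts these judges look for can also be cross-checked from the Prometheus side via the synthetic ALERTS series (a sketch; the tasks themselves exercise the get_alerts tool rather than mandating this query):

    # All firing alerts, then narrowed to the warning severity used above
    ALERTS{alertstate="firing"}
    ALERTS{alertstate="firing", severity="warning"}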
diff --git a/evals/tasks/observability/alerts/get-alerts.yaml b/evals/tasks/observability/alerts/get-alerts.yaml index 3b167faa7..49dbd5f92 100644 --- a/evals/tasks/observability/alerts/get-alerts.yaml +++ b/evals/tasks/observability/alerts/get-alerts.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: alerts + suite: observability toolType: alertmanager description: | Tests if the agent can discover and use the get_alerts tool to retrieve @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "alertname" - reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names with their status, or explicitly confirming that no alerts are currently firing" + contains: "Watchdog" + reason: "Verify the agent retrieved firing alerts and reported the Watchdog alert" prompt: inline: | Check the cluster for any firing alerts and report what you find. diff --git a/evals/tasks/observability/alerts/get-silences.yaml b/evals/tasks/observability/alerts/get-silences.yaml index 726e60c31..f986c7851 100644 --- a/evals/tasks/observability/alerts/get-silences.yaml +++ b/evals/tasks/observability/alerts/get-silences.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: alerts + suite: observability toolType: alertmanager description: | Tests if the agent can discover and use the get_silences tool to retrieve @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "silence" - reason: "Verify the output discusses alert silences, including when no active silences exist" + contains: "silences" + reason: "Verify the response mentions silences. The agent should report either active silences with their matchers or that no active silences exist — both are valid outcomes" prompt: inline: | Are there any active silences in Alertmanager? diff --git a/evals/tasks/observability/labels/get-series.yaml b/evals/tasks/observability/labels/get-series.yaml index 6cd93a4da..95645079a 100644 --- a/evals/tasks/observability/labels/get-series.yaml +++ b/evals/tasks/observability/labels/get-series.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: labels + suite: observability toolType: exploration description: | Tests if the agent can use the get_series tool to check cardinality for a metric. @@ -17,10 +17,10 @@ spec: verify: - llmJudge: contains: "namespace" - reason: "Verify the agent called get_series and reported the results: either listing actual series data containing label dimensions like namespace, or confirming no series were found" + reason: "Verify the agent retrieved actual series data containing label dimensions like namespace" - llmJudge: - contains: "node" - reason: "Verify the agent called get_series and reported series data with node label values, or explicitly confirmed the metric has no series with node labels" + contains: "kube_pod_info" + reason: "Verify the agent queried the kube_pod_info metric and reported its cardinality" prompt: inline: | - How many time series exist for the kube_pod_info metric? Show the cardinality. + How many time series exist for the kube_pod_info metric? Show the count and list the label names present. 
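A cardinality check along these lines would back the get-series assertions (sketch; get_series returns the raw series list, from which both the count and the label names can be read off):

    # Number of time series behind kube_pod_info
    count(kube_pod_info)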
diff --git a/evals/tasks/observability/labels/label-names.yaml b/evals/tasks/observability/labels/label-names.yaml index 084c923a9..adf413963 100644 --- a/evals/tasks/observability/labels/label-names.yaml +++ b/evals/tasks/observability/labels/label-names.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: labels + suite: observability toolType: exploration description: | Tests if the agent follows the correct workflow: first calling list_metrics to @@ -17,10 +17,10 @@ spec: verify: - llmJudge: contains: "namespace" - reason: "Verify the agent called get_label_names and reported the results: the output should include the namespace label which is a standard Kubernetes label" + reason: "Verify the output includes the namespace label which is a standard Kubernetes label" - llmJudge: contains: "pod" - reason: "Verify the agent called get_label_names and reported the results: the output should include the pod label" + reason: "Verify the output includes the pod label" prompt: inline: | What labels are available for the kube_pod_info metric? diff --git a/evals/tasks/observability/labels/label-values.yaml b/evals/tasks/observability/labels/label-values.yaml index e012ebca3..d78e5b39f 100644 --- a/evals/tasks/observability/labels/label-values.yaml +++ b/evals/tasks/observability/labels/label-values.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: labels + suite: observability toolType: exploration description: | Tests the full discovery workflow: list_metrics to verify the metric, then @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube-system" - reason: "Verify the agent called get_label_values and reported the results: the output should list actual namespace values from the cluster such as kube-system" + reason: "Verify the output lists actual namespace values from the cluster such as kube-system" prompt: inline: | What are the unique namespace values for the kube_pod_info metric? diff --git a/evals/tasks/observability/labels/series-by-namespace.yaml b/evals/tasks/observability/labels/series-by-namespace.yaml index f782a5a4d..23d78b58e 100644 --- a/evals/tasks/observability/labels/series-by-namespace.yaml +++ b/evals/tasks/observability/labels/series-by-namespace.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: labels + suite: observability toolType: exploration description: | Tests if the agent can use the get_series tool with a label selector @@ -18,11 +18,11 @@ spec: verify: - llmJudge: contains: "pod" - reason: "Verify the agent called get_series and reported the results: either listing actual series data containing label dimensions like pod, or confirming no series were found for the given namespace" + reason: "Verify the agent retrieved actual series data containing label dimensions like pod" - llmJudge: contains: "container" - reason: "Verify the agent called get_series and reported series with container label values from the monitoring namespace, or explicitly confirmed no matching series exist" + reason: "Verify the agent reported series with container label values from the namespace" prompt: inline: | How many time series exist for container_cpu_usage_seconds_total - in the monitoring namespace? + in the openshift-monitoring namespace? 
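Illustrative queries for the label-exploration assertions (sketches; the dedicated get_label_names/get_label_values tools are the expected path):

    # Distinct namespace values carried by kube_pod_info
    count(group by (namespace) (kube_pod_info))
    # Series count scoped to the namespace named in the prompt
    count(container_cpu_usage_seconds_total{namespace="openshift-monitoring"})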
diff --git a/evals/tasks/observability/metrics/list-metrics.yaml b/evals/tasks/observability/metrics/list-metrics.yaml index ee3283059..1f5adc71a 100644 --- a/evals/tasks/observability/metrics/list-metrics.yaml +++ b/evals/tasks/observability/metrics/list-metrics.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: metrics + suite: observability toolType: discovery description: | Tests if the agent can discover the list_metrics tool to find Kubernetes-related @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube_pod_info" - reason: "Verify the agent called list_metrics and reported the results: the output should list specific kube metrics discovered from Prometheus" + reason: "Verify the output lists specific kube metrics discovered from Prometheus" prompt: inline: | List all available Prometheus metrics that contain 'kube' in the name. diff --git a/evals/tasks/observability/metrics/list-node-metrics.yaml b/evals/tasks/observability/metrics/list-node-metrics.yaml index d6f4184a4..f496460aa 100644 --- a/evals/tasks/observability/metrics/list-node-metrics.yaml +++ b/evals/tasks/observability/metrics/list-node-metrics.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: metrics + suite: observability toolType: discovery description: | Tests if the agent can discover node-related metrics using the list_metrics tool @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "node_" - reason: "Verify the agent called list_metrics and reported the results: the output should list specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)" + reason: "Verify the output lists specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)" prompt: inline: | What node-related metrics are available in Prometheus? diff --git a/evals/tasks/observability/queries/backend-reachability.yaml b/evals/tasks/observability/queries/backend-reachability.yaml index 5b5009583..5a027a5a9 100644 --- a/evals/tasks/observability/queries/backend-reachability.yaml +++ b/evals/tasks/observability/queries/backend-reachability.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: smoke-test description: | Smoke test to verify the agent can reach the metrics backend and get a @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "prometheus-k8s" - reason: "Verify the agent called execute_instant_query with the up metric and reported the results: the response should confirm the prometheus-k8s target status" + reason: "Verify the agent queried the up metric and reported on the prometheus-k8s target" prompt: inline: | Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status. 
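The smoke test's query is already spelled out in its prompt; for reference, its result reads as follows (standard Prometheus semantics, not specific to this suite):

    # One sample per prometheus-k8s target: 1 = scrape succeeding, 0 = target down
    up{job="prometheus-k8s"}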
diff --git a/evals/tasks/observability/queries/cpu-usage.yaml b/evals/tasks/observability/queries/cpu-usage.yaml index f4c6a0f67..3baddf6cf 100644 --- a/evals/tasks/observability/queries/cpu-usage.yaml +++ b/evals/tasks/observability/queries/cpu-usage.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can find the appropriate CPU usage metric and query it to @@ -16,10 +16,10 @@ spec: verify: - llmJudge: contains: "container_cpu_usage_seconds_total" - reason: "Verify the agent called execute_instant_query with the correct CPU metric container_cpu_usage_seconds_total and reported the results" + reason: "Verify the agent queried the correct CPU metric" - llmJudge: contains: "pod" - reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no CPU data is available" + reason: "Verify the response identifies specific pods" prompt: inline: | Which pods are using the most CPU? diff --git a/evals/tasks/observability/queries/crashlooping-pods.yaml b/evals/tasks/observability/queries/crashlooping-pods.yaml index 24119f551..4e975964e 100644 --- a/evals/tasks/observability/queries/crashlooping-pods.yaml +++ b/evals/tasks/observability/queries/crashlooping-pods.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: range-query description: | Tests if the agent can identify crashlooping pods by discovering and querying @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube_pod_container_status_restarts_total" - reason: "Verify the agent called execute_range_query with the correct metric kube_pod_container_status_restarts_total and reported the results: either listing crashlooping pods, or explicitly confirming no pods have restarted" + reason: "Verify the agent used the correct metric for container restart counts" prompt: inline: | Which pods were crashlooping in the last 5 minutes? diff --git a/evals/tasks/observability/queries/diagnose-cluster-health.yaml b/evals/tasks/observability/queries/diagnose-cluster-health.yaml index a2e45e1a9..7542b61e5 100644 --- a/evals/tasks/observability/queries/diagnose-cluster-health.yaml +++ b/evals/tasks/observability/queries/diagnose-cluster-health.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: diagnostic description: | Tests the agent's ability to handle an ambiguous diagnostic prompt. @@ -17,11 +17,11 @@ metadata: spec: verify: - llmJudge: - contains: "alertname" - reason: "Verify the agent called get_alerts and reported the results: either listing specific alert names, or explicitly confirming that no alerts are currently firing" + contains: "Watchdog" + reason: "Verify the agent checked alerts and reported specific alert names such as Watchdog" - llmJudge: contains: "kube_" - reason: "Verify the agent called execute_instant_query with Kubernetes metrics (e.g. kube_node_status_condition) and reported the results for health assessment" + reason: "Verify the agent queried Kubernetes metrics (e.g. kube_node_status_condition) for health assessment" prompt: inline: | Is the cluster healthy? Give me an overview of any issues. 
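Queries of roughly this shape would satisfy the CPU, crashloop, and health-check assertions above (sketches; the metric names come from the tasks, while the windows and aggregations are one reasonable choice):

    # Top CPU-consuming pods
    topk(5, sum by (pod) (rate(container_cpu_usage_seconds_total[5m])))
    # Containers that restarted within the 5-minute window
    increase(kube_pod_container_status_restarts_total[5m]) > 0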
diff --git a/evals/tasks/observability/queries/high-cardinality-rejection.yaml b/evals/tasks/observability/queries/high-cardinality-rejection.yaml index 9a5cf78c9..dbb5b3bba 100644 --- a/evals/tasks/observability/queries/high-cardinality-rejection.yaml +++ b/evals/tasks/observability/queries/high-cardinality-rejection.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: error-handling description: | Tests agent behavior when a query is rejected by obs-mcp guardrails @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "guardrail" - reason: "Verify the agent attempted a query that was rejected by a guardrail and reported the result: the response should explain the guardrail rejection and suggest a scoped alternative" + reason: "Verify the agent explains the query was rejected by a guardrail and suggests a scoped alternative" prompt: inline: | Show me the CPU, memory, network, and disk usage for every single container diff --git a/evals/tasks/observability/queries/memory-usage.yaml b/evals/tasks/observability/queries/memory-usage.yaml index 2a41e2c83..58da3fce8 100644 --- a/evals/tasks/observability/queries/memory-usage.yaml +++ b/evals/tasks/observability/queries/memory-usage.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can find the appropriate memory usage metric and query it @@ -15,11 +15,11 @@ metadata: spec: verify: - llmJudge: - contains: "container_memory_working_set_bytes" - reason: "Verify the agent called execute_instant_query with the correct memory metric container_memory_working_set_bytes and reported the results" + contains: "container_memory" + reason: "Verify the agent queried a container memory metric (working_set_bytes or usage_bytes)" - llmJudge: contains: "pod" - reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no memory data is available" + reason: "Verify the response identifies specific pods" prompt: inline: | Which pods are using the most memory? Show me the top 5. diff --git a/evals/tasks/observability/queries/namespace-pod-count.yaml b/evals/tasks/observability/queries/namespace-pod-count.yaml index 11ef3d0f8..b180a917d 100644 --- a/evals/tasks/observability/queries/namespace-pod-count.yaml +++ b/evals/tasks/observability/queries/namespace-pod-count.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: multi-step description: | Tests multi-step reasoning: the agent must discover a suitable metric @@ -18,10 +18,10 @@ spec: verify: - llmJudge: contains: "kube-system" - reason: "Verify the agent called execute_instant_query and reported the results: the response should list actual namespace names from the cluster such as kube-system" + reason: "Verify the response lists actual namespace names from the cluster such as kube-system" - llmJudge: contains: "kube_pod" - reason: "Verify the agent called execute_instant_query with a kube_pod metric to count pods and reported the results" + reason: "Verify the agent used a kube_pod metric to count pods" prompt: inline: | Which namespaces have the most running pods? Show me the top 5. 
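For the memory and pod-count tasks, plausible queries look like the following (sketches only; the tasks accept either memory metric variant):

    # Top 5 pods by working-set memory
    topk(5, sum by (pod) (container_memory_working_set_bytes))
    # Top 5 namespaces by running pods
    topk(5, count by (namespace) (kube_pod_status_phase{phase="Running"} == 1))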
diff --git a/evals/tasks/observability/queries/namespace-resource-usage.yaml b/evals/tasks/observability/queries/namespace-resource-usage.yaml index 26ee2d73f..4434ca52c 100644 --- a/evals/tasks/observability/queries/namespace-resource-usage.yaml +++ b/evals/tasks/observability/queries/namespace-resource-usage.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: multi-step description: | Tests multi-step reasoning: the agent must discover CPU and memory metrics, @@ -18,10 +18,10 @@ spec: verify: - llmJudge: contains: "container_cpu_usage_seconds_total" - reason: "Verify the agent called list_metrics to discover and then executed a query with the correct CPU metric container_cpu_usage_seconds_total, reporting the results" + reason: "Verify the agent discovered and used the correct CPU metric" - llmJudge: - contains: "container_memory_working_set_bytes" - reason: "Verify the agent called list_metrics to discover and then executed a query with the correct memory metric container_memory_working_set_bytes, reporting the results" + contains: "container_memory" + reason: "Verify the agent discovered and used a container memory metric (working_set_bytes or usage_bytes)" prompt: inline: | Which namespace is consuming the most CPU and memory? diff --git a/evals/tasks/observability/queries/network-traffic.yaml b/evals/tasks/observability/queries/network-traffic.yaml index 879b7a943..9b1e06fd3 100644 --- a/evals/tasks/observability/queries/network-traffic.yaml +++ b/evals/tasks/observability/queries/network-traffic.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can discover network-related metrics and query them to find @@ -16,10 +16,10 @@ spec: verify: - llmJudge: contains: "container_network_receive_bytes_total" - reason: "Verify the agent called execute_instant_query with the correct network metric container_network_receive_bytes_total and reported the results" + reason: "Verify the agent queried the correct network metric" - llmJudge: contains: "pod" - reason: "Verify the agent reported query results identifying specific pods, or explicitly confirmed no network data is available" + reason: "Verify the response identifies specific pods" prompt: inline: | Which pods are receiving the most network traffic? diff --git a/evals/tasks/observability/queries/nonexistent-metric.yaml b/evals/tasks/observability/queries/nonexistent-metric.yaml index 25f0dbd22..ed76ee25a 100644 --- a/evals/tasks/observability/queries/nonexistent-metric.yaml +++ b/evals/tasks/observability/queries/nonexistent-metric.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: error-handling description: | Tests agent recovery when querying a metric that does not exist. @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "not found" - reason: "Verify the agent called list_metrics and reported the result: explicitly communicating that the metric does not exist or was not found" + reason: "Verify the agent communicates that the metric does not exist or was not found" prompt: inline: | What is the current value of the metric fake_nonexistent_metric_total? 
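Sketches matching the namespace-usage and network assertions (the exact rate window and aggregation are assumptions, not requirements of the tasks):

    # Per-namespace CPU and memory totals
    sum by (namespace) (rate(container_cpu_usage_seconds_total[5m]))
    sum by (namespace) (container_memory_working_set_bytes)
    # Top receivers of network traffic
    topk(5, sum by (pod) (rate(container_network_receive_bytes_total[5m])))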
diff --git a/evals/tasks/observability/queries/nonexistent-namespace.yaml b/evals/tasks/observability/queries/nonexistent-namespace.yaml index 3384f43e3..78090c0c1 100644 --- a/evals/tasks/observability/queries/nonexistent-namespace.yaml +++ b/evals/tasks/observability/queries/nonexistent-namespace.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: error-handling description: | Tests agent behavior when querying for resources in a namespace that @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "no data" - reason: "Verify the agent called execute_instant_query and reported the result: explicitly confirming no data, no results, or no pods found in the nonexistent namespace" + reason: "Verify the agent reports no data, no results, or no pods found in the nonexistent namespace" prompt: inline: | Show me the memory usage for all pods in the namespace called totally-fake-namespace-12345. diff --git a/evals/tasks/observability/queries/pending-pods.yaml b/evals/tasks/observability/queries/pending-pods.yaml index 5ca55f27e..f5b47e387 100644 --- a/evals/tasks/observability/queries/pending-pods.yaml +++ b/evals/tasks/observability/queries/pending-pods.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can identify pods stuck in pending state by first discovering @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "kube_pod_status_phase" - reason: "Verify the agent called execute_instant_query with the correct metric kube_pod_status_phase and reported the results: either listing pending pods, or explicitly confirming no pods are in pending state" + reason: "Verify the agent used the correct metric for pod phase status" prompt: inline: | Which pods are stuck in pending state? diff --git a/evals/tasks/observability/queries/pods-created.yaml b/evals/tasks/observability/queries/pods-created.yaml index cacf0e76b..097a57e24 100644 --- a/evals/tasks/observability/queries/pods-created.yaml +++ b/evals/tasks/observability/queries/pods-created.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: range-query description: | Tests if the agent can use a range query to find recently created pods by @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "kube_pod_created" - reason: "Verify the agent called execute_range_query with the kube_pod_created metric and reported the results: either listing recently created pods, or explicitly confirming no pods were created in the time window" + reason: "Verify the agent discovered and used the kube_pod_created metric" prompt: inline: | How many pods were created in the last 5 minutes? 
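The nonexistent-namespace case hinges on an empty result set, e.g. (sketch):

    # Matches no series, so the agent should report no data rather than an error
    container_memory_working_set_bytes{namespace="totally-fake-namespace-12345"}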
diff --git a/evals/tasks/observability/queries/prometheus-head-series.yaml b/evals/tasks/observability/queries/prometheus-head-series.yaml index bcf2c77c8..9f62ffdf3 100644 --- a/evals/tasks/observability/queries/prometheus-head-series.yaml +++ b/evals/tasks/observability/queries/prometheus-head-series.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can query Prometheus internal metrics to report the current @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_tsdb_head_series" - reason: "Verify the agent called execute_instant_query with the correct Prometheus TSDB metric prometheus_tsdb_head_series and reported the current head series count" + reason: "Verify the agent used the correct Prometheus TSDB metric" prompt: inline: | How many head series does Prometheus have? diff --git a/evals/tasks/observability/queries/prometheus-requests.yaml b/evals/tasks/observability/queries/prometheus-requests.yaml index 4445d033a..f436130b4 100644 --- a/evals/tasks/observability/queries/prometheus-requests.yaml +++ b/evals/tasks/observability/queries/prometheus-requests.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can calculate the request rate to Prometheus by discovering @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_http_requests_total" - reason: "Verify the agent called execute_instant_query with the correct HTTP requests metric prometheus_http_requests_total and reported the request rate" + reason: "Verify the agent used the correct HTTP requests metric" prompt: inline: | How many requests per second are being made to Prometheus? diff --git a/evals/tasks/observability/queries/prometheus-wal-size.yaml b/evals/tasks/observability/queries/prometheus-wal-size.yaml index 14cb8a944..aa19ea451 100644 --- a/evals/tasks/observability/queries/prometheus-wal-size.yaml +++ b/evals/tasks/observability/queries/prometheus-wal-size.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: query description: | Tests if the agent can query the current Prometheus WAL storage size using @@ -16,7 +16,7 @@ spec: verify: - llmJudge: contains: "prometheus_tsdb_wal_storage_size_bytes" - reason: "Verify the agent called execute_instant_query with the correct WAL storage metric prometheus_tsdb_wal_storage_size_bytes and reported the current WAL size" + reason: "Verify the agent used the correct WAL storage metric" prompt: inline: | What is the current storage size of the Prometheus WAL? 
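Instant queries like these would satisfy the Prometheus-internals assertions (sketches; the request-rate window is an assumption):

    prometheus_tsdb_head_series
    rate(prometheus_http_requests_total[5m])
    prometheus_tsdb_wal_storage_size_bytes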
diff --git a/evals/tasks/observability/queries/time-range-query.yaml b/evals/tasks/observability/queries/time-range-query.yaml index 2ac2c0d60..80a144be2 100644 --- a/evals/tasks/observability/queries/time-range-query.yaml +++ b/evals/tasks/observability/queries/time-range-query.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: multi-step description: | Tests whether the agent correctly uses execute_range_query with @@ -17,10 +17,10 @@ spec: verify: - llmJudge: contains: "container_cpu_usage_seconds_total" - reason: "Verify the agent called execute_range_query with the correct CPU metric container_cpu_usage_seconds_total and reported the results" + reason: "Verify the agent discovered and used the correct CPU metric" - llmJudge: contains: "pod" - reason: "Verify the agent reported range query results including specific pod names, or explicitly confirmed no CPU data is available for the given namespace" + reason: "Verify the response includes specific pod names from the query results" prompt: inline: | Show me the CPU usage trend for pods in the default namespace over the last 30 minutes. diff --git a/evals/tasks/observability/queries/visualize-cpu-usage.yaml b/evals/tasks/observability/queries/visualize-cpu-usage.yaml index 8894dabd5..74d88ac85 100644 --- a/evals/tasks/observability/queries/visualize-cpu-usage.yaml +++ b/evals/tasks/observability/queries/visualize-cpu-usage.yaml @@ -6,8 +6,8 @@ metadata: parallel: true runs: 1 labels: - suite: observability category: queries + suite: observability toolType: visualization description: | Tests if the agent uses the show_timeseries tool to visualize CPU usage @@ -17,7 +17,7 @@ spec: verify: - llmJudge: contains: "container_cpu_usage_seconds_total" - reason: "Verify the agent called show_timeseries with the correct CPU metric container_cpu_usage_seconds_total and reported the visualization results" + reason: "Verify the agent discovered and visualized the correct CPU metric" prompt: inline: | Visualize the CPU usage for pods in the default namespace over the last 30 minutes.
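Both closing tasks revolve around the same range expression, executed via execute_range_query and show_timeseries respectively (sketch; the 60s step is an assumption):

    # Evaluated over start=now-30m, end=now, step=60s
    sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))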