Merged
15 changes: 7 additions & 8 deletions evals/mcpchecker/tasks/alerts/alert-investigation.yaml
@@ -10,19 +10,18 @@ metadata:
toolType: multi-step
description: |
Tests if the agent can perform multi-step alert triage: first retrieving
- alerts from Alertmanager, then investigating related metrics for the most
- critical alert using Prometheus queries.
+ alerts from Alertmanager, then investigating related metrics for a
+ firing alert using queries.
spec:
verify:
- llmJudge:
- contains: "alert"
- reason: "Verify the agent retrieved alerts from Alertmanager"
+ contains: "AlertmanagerReceiversNotConfigured"
+ reason: "Verify the agent retrieved firing alerts and identified AlertmanagerReceiversNotConfigured"
- llmJudge:
- contains: "metric"
- reason: "Verify the agent investigated a related Prometheus metric"
+ contains: "alertmanager"
+ reason: "Verify the agent investigated Alertmanager-related metrics for the alert"
prompt:
inline: |
Check if there are any firing alerts. If there are, investigate
the related metrics for the most critical alert and summarize
- what's happening. If there are no firing alerts, check cluster
- health metrics instead.
+ what's happening.
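The multi-step triage this task describes — list firing alerts, then pivot to metrics — can lean on Prometheus's built-in `ALERTS` meta-metric to bridge the two steps. A minimal sketch (the helper name is illustrative, not part of the task):

```python
# Prometheus exposes every evaluated alert as the ALERTS meta-metric,
# labelled with alertname and alertstate. Once an agent has identified a
# firing alert by name, it can pivot to metric data with a query like this.
def firing_alert_query(alertname: str) -> str:
    """Build a PromQL query selecting the firing samples of one alert."""
    return f'ALERTS{{alertname="{alertname}", alertstate="firing"}}'
```

For the `AlertmanagerReceiversNotConfigured` alert the judge now expects, a follow-up step would be inspecting Alertmanager's own metrics, such as `alertmanager_config_last_reload_successful`.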
9 changes: 3 additions & 6 deletions evals/mcpchecker/tasks/alerts/filtered-alerts.yaml
@@ -16,11 +16,8 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "alert"
- reason: "Verify the agent retrieved and reported on alerts"
- - llmJudge:
- contains: "critical"
- reason: "Verify the agent filtered or addressed the critical severity"
+ contains: "AlertmanagerReceiversNotConfigured"
+ reason: "Verify the agent filtered for warning-severity alerts and found AlertmanagerReceiversNotConfigured"
prompt:
inline: |
- Are there any firing alerts with severity=critical? Show only active alerts.
+ Show me only the active alerts with severity=warning.
Contributor

Should we have the severity=warning here? 🤔

Member Author

I was trying to be specific here: we know there are two warning alerts that always fire in a default cluster, Watchdog and AlertmanagerReceiversNotConfigured.
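For reference, those two always-firing defaults can be confirmed directly against the Alertmanager v2 API. This sketch only builds the request URL; the in-cluster service address is an assumption:

```python
from urllib.parse import urlencode

# Assumed in-cluster Alertmanager service URL; adjust for your environment.
ALERTMANAGER = "http://alertmanager-main.openshift-monitoring:9093"

def warning_alerts_url() -> str:
    """Alertmanager v2 API request listing active warning-severity alerts."""
    params = urlencode({"active": "true", "filter": 'severity="warning"'})
    return f"{ALERTMANAGER}/api/v2/alerts?{params}"
```

On a default cluster, the response to this request should include both Watchdog and AlertmanagerReceiversNotConfigured.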

4 changes: 2 additions & 2 deletions evals/mcpchecker/tasks/alerts/get-alerts.yaml
@@ -14,8 +14,8 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "alerts"
- reason: "Verify the output reports on the alert state, whether alerts are firing or not"
+ contains: "Watchdog"
+ reason: "Verify the agent retrieved firing alerts and reported the Watchdog alert"
prompt:
inline: |
Check the cluster for any firing alerts and report what you find.
4 changes: 2 additions & 2 deletions evals/mcpchecker/tasks/alerts/get-silences.yaml
@@ -14,8 +14,8 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "silence"
- reason: "Verify the output discusses alert silences, including when no active silences exist"
+ contains: "silences"
+ reason: "Verify the response mentions silences. The agent should report either active silences with their matchers or that no active silences exist — both are valid outcomes"
prompt:
inline: |
Are there any active silences in Alertmanager?
8 changes: 4 additions & 4 deletions evals/mcpchecker/tasks/labels/get-series.yaml
@@ -15,11 +15,11 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "series"
- reason: "Verify the output reports time series information"
+ contains: "namespace"
+ reason: "Verify the agent retrieved actual series data containing label dimensions like namespace"
- llmJudge:
contains: "kube_pod_info"
- reason: "Verify the agent queried the correct metric"
+ reason: "Verify the agent queried the kube_pod_info metric and reported its cardinality"
prompt:
inline: |
- How many time series exist for the kube_pod_info metric? Show the cardinality.
+ How many time series exist for the kube_pod_info metric? Show the count and list the label names present.
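The cardinality and label-name checks above map to simple Prometheus queries; these strings are one plausible set an agent might issue (the labels path is the standard Prometheus v1 endpoint):

```python
# PromQL for the total cardinality of the metric named in the prompt.
CARDINALITY_QUERY = "count(kube_pod_info)"

# PromQL for a per-namespace breakdown of the same series.
PER_NAMESPACE_QUERY = "count by (namespace) (kube_pod_info)"

# Standard Prometheus v1 endpoint listing the label names on a metric.
LABELS_PATH = "/api/v1/labels?match[]=kube_pod_info"
```

An answer built from these queries naturally contains both `kube_pod_info` and label names like `namespace`, which is what the updated judges check for.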
4 changes: 2 additions & 2 deletions evals/mcpchecker/tasks/labels/label-values.yaml
@@ -14,8 +14,8 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "namespace"
- reason: "Verify the output lists actual namespace values from the cluster"
+ contains: "kube-system"
+ reason: "Verify the output lists actual namespace values from the cluster such as kube-system"
prompt:
inline: |
What are the unique namespace values for the kube_pod_info metric?
10 changes: 5 additions & 5 deletions evals/mcpchecker/tasks/labels/series-by-namespace.yaml
@@ -16,12 +16,12 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "series"
- reason: "Verify the agent reported series information"
+ contains: "pod"
+ reason: "Verify the agent retrieved actual series data containing label dimensions like pod"
- llmJudge:
- contains: "monitoring"
- reason: "Verify the agent scoped to the monitoring namespace"
+ contains: "container"
+ reason: "Verify the agent reported series with container label values from the namespace"
prompt:
inline: |
How many time series exist for container_cpu_usage_seconds_total
- in the monitoring namespace?
+ in the openshift-monitoring namespace?
4 changes: 2 additions & 2 deletions evals/mcpchecker/tasks/metrics/list-metrics.yaml
@@ -14,8 +14,8 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "kube"
- reason: "Verify the output lists Kubernetes metrics matching the kube prefix"
+ contains: "kube_pod_info"
+ reason: "Verify the output lists specific kube metrics discovered from Prometheus"
prompt:
inline: |
List all available Prometheus metrics that contain 'kube' in the name.
4 changes: 2 additions & 2 deletions evals/mcpchecker/tasks/metrics/list-node-metrics.yaml
@@ -14,8 +14,8 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "node"
- reason: "Verify the output lists node-related metrics"
+ contains: "node_"
+ reason: "Verify the output lists specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)"
prompt:
inline: |
What node-related metrics are available in Prometheus?
4 changes: 2 additions & 2 deletions evals/mcpchecker/tasks/queries/backend-reachability.yaml
@@ -15,8 +15,8 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "up"
- reason: "Verify the agent confirmed the metrics backend is reachable and targets are up"
+ contains: "prometheus-k8s"
+ reason: "Verify the agent queried the up metric and reported on the prometheus-k8s target"
prompt:
inline: |
Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status.
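The `up` metric the prompt references is Prometheus's synthetic per-target health series: 1 for a successful scrape, 0 for a failed one. A sketch of how a verifier might interpret instant-query results, with the API's sample structure simplified to (labels, value) pairs:

```python
# Each instant-query sample for up{job="prometheus-k8s"} carries a value
# of 1.0 (target scraped successfully) or 0.0 (scrape failed).
def backend_reachable(samples: list[tuple[dict, float]]) -> bool:
    """True when at least one sample exists and every target reports up."""
    return bool(samples) and all(value == 1.0 for _labels, value in samples)
```

An empty result set is treated as unreachable here, since no target answering for the job is itself a failure signal.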
8 changes: 4 additions & 4 deletions evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml
@@ -16,11 +16,11 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "cluster"
- reason: "Verify the response provides an overall cluster health assessment"
+ contains: "Watchdog"
+ reason: "Verify the agent checked alerts and reported specific alert names such as Watchdog"
- llmJudge:
- contains: "node"
- reason: "Verify the agent checked node-level health indicators"
+ contains: "kube_"
+ reason: "Verify the agent queried Kubernetes metrics (e.g. kube_node_status_condition) for health assessment"
prompt:
inline: |
Is the cluster healthy? Give me an overview of any issues.
@@ -15,8 +15,8 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "namespace"
- reason: "Verify the agent suggests scoping the query by namespace or another dimension"
+ contains: "guardrail"
+ reason: "Verify the agent explains the query was rejected by a guardrail and suggests a scoped alternative"
prompt:
inline: |
Show me the CPU, memory, network, and disk usage for every single container
4 changes: 2 additions & 2 deletions evals/mcpchecker/tasks/queries/memory-usage.yaml
@@ -14,8 +14,8 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "container_memory_working_set_bytes"
- reason: "Verify the agent queried the correct memory metric"
+ contains: "container_memory"
+ reason: "Verify the agent queried a container memory metric (working_set_bytes or usage_bytes)"
- llmJudge:
contains: "pod"
reason: "Verify the response identifies specific pods"
8 changes: 4 additions & 4 deletions evals/mcpchecker/tasks/queries/namespace-pod-count.yaml
@@ -16,11 +16,11 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "namespace"
- reason: "Verify the response lists namespaces"
+ contains: "kube-system"
+ reason: "Verify the response lists actual namespace names from the cluster such as kube-system"
- llmJudge:
- contains: "pod"
- reason: "Verify the response includes pod counts"
+ contains: "kube_pod"
+ reason: "Verify the agent used a kube_pod metric to count pods"
prompt:
inline: |
Which namespaces have the most running pods? Show me the top 5.
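A top-5 pod count per namespace reduces to a single `topk` aggregation. One plausible PromQL an agent could issue, filtering to running pods via the kube-state-metrics per-phase indicator (the exact metric choice is an assumption, not mandated by the task):

```python
# kube_pod_status_phase is a 0/1 gauge per pod and phase; counting the
# samples equal to 1 for phase="Running" gives running pods per namespace.
TOP_NAMESPACES_QUERY = (
    "topk(5, count by (namespace) "
    '(kube_pod_status_phase{phase="Running"} == 1))'
)
```

A response derived from this query contains the `kube_pod` metric family name and real namespace values, matching both judge criteria.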
11 changes: 4 additions & 7 deletions evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml
@@ -16,14 +16,11 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "namespace"
- reason: "Verify the response breaks down resource usage by namespace"
+ contains: "container_cpu_usage_seconds_total"
+ reason: "Verify the agent discovered and used the correct CPU metric"
- llmJudge:
- contains: "cpu"
- reason: "Verify the response includes CPU usage data"
- - llmJudge:
- contains: "memory"
- reason: "Verify the response includes memory usage data"
+ contains: "container_memory"
+ reason: "Verify the agent discovered and used a container memory metric (working_set_bytes or usage_bytes)"
prompt:
inline: |
Which namespace is consuming the most CPU and memory?
2 changes: 1 addition & 1 deletion evals/mcpchecker/tasks/queries/pods-created.yaml
@@ -15,7 +15,7 @@ spec:
verify:
- llmJudge:
contains: "kube_pod_created"
- reason: "Verify the agent used the correct metric for pod creation timestamps"
+ reason: "Verify the agent discovered and used the kube_pod_created metric"
prompt:
inline: |
How many pods were created in the last 5 minutes?
8 changes: 4 additions & 4 deletions evals/mcpchecker/tasks/queries/time-range-query.yaml
@@ -15,11 +15,11 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "30 minutes"
- reason: "Verify the agent honoured the requested 30-minute time window"
+ contains: "container_cpu_usage_seconds_total"
+ reason: "Verify the agent discovered and used the correct CPU metric"
- llmJudge:
- contains: "cpu"
- reason: "Verify the response includes CPU usage data"
+ contains: "pod"
+ reason: "Verify the response includes specific pod names from the query results"
prompt:
inline: |
Show me the CPU usage trend for pods in the default namespace over the last 30 minutes.
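A 30-minute trend is a range query against Prometheus's `/api/v1/query_range` endpoint. This sketch only builds the request URL; the base URL and step are assumptions:

```python
import time
from urllib.parse import urlencode

def range_query_url(base: str, promql: str, minutes: int = 30,
                    step: str = "1m") -> str:
    """Prometheus range-query URL covering the last `minutes` minutes."""
    end = int(time.time())
    params = urlencode({
        "query": promql,
        "start": end - minutes * 60,
        "end": end,
        "step": step,
    })
    return f"{base}/api/v1/query_range?{params}"
```

For this task the `promql` argument would be something like `rate(container_cpu_usage_seconds_total{namespace="default"}[5m])`, which is why the judge now checks for the metric name and pod labels rather than the phrase "30 minutes".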
4 changes: 2 additions & 2 deletions evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml
@@ -15,8 +15,8 @@ metadata:
spec:
verify:
- llmJudge:
- contains: "cpu"
- reason: "Verify the agent queried and visualized CPU usage data"
+ contains: "container_cpu_usage_seconds_total"
+ reason: "Verify the agent discovered and visualized the correct CPU metric"
prompt:
inline: |
Visualize the CPU usage for pods in the default namespace over the last 30 minutes.