diff --git a/evals/mcpchecker/tasks/alerts/alert-investigation.yaml b/evals/mcpchecker/tasks/alerts/alert-investigation.yaml
index 4388a72d..95eed497 100644
--- a/evals/mcpchecker/tasks/alerts/alert-investigation.yaml
+++ b/evals/mcpchecker/tasks/alerts/alert-investigation.yaml
@@ -10,19 +10,18 @@ metadata:
     toolType: multi-step
   description: |
     Tests if the agent can perform multi-step alert triage: first retrieving
-    alerts from Alertmanager, then investigating related metrics for the most
-    critical alert using Prometheus queries.
+    alerts from Alertmanager, then investigating related metrics for a
+    firing alert using metric queries.
 spec:
   verify:
     - llmJudge:
-        contains: "alert"
-        reason: "Verify the agent retrieved alerts from Alertmanager"
+        contains: "AlertmanagerReceiversNotConfigured"
+        reason: "Verify the agent retrieved firing alerts and identified AlertmanagerReceiversNotConfigured"
     - llmJudge:
-        contains: "metric"
-        reason: "Verify the agent investigated a related Prometheus metric"
+        contains: "alertmanager"
+        reason: "Verify the agent investigated Alertmanager-related metrics for the alert"
   prompt:
     inline: |
       Check if there are any firing alerts. If there are, investigate
       the related metrics for the most critical alert and summarize
-      what's happening. If there are no firing alerts, check cluster
-      health metrics instead.
+      what's happening.
diff --git a/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml b/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml
index ed6e0bcd..bed6cec4 100644
--- a/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml
+++ b/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml
@@ -16,11 +16,8 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "alert"
-        reason: "Verify the agent retrieved and reported on alerts"
-    - llmJudge:
-        contains: "critical"
-        reason: "Verify the agent filtered or addressed the critical severity"
+        contains: "AlertmanagerReceiversNotConfigured"
+        reason: "Verify the agent filtered for warning-severity alerts and found AlertmanagerReceiversNotConfigured"
   prompt:
     inline: |
-      Are there any firing alerts with severity=critical? Show only active alerts.
+      Show me only the active alerts with severity=warning.
diff --git a/evals/mcpchecker/tasks/alerts/get-alerts.yaml b/evals/mcpchecker/tasks/alerts/get-alerts.yaml
index a21224a1..7ba817a9 100644
--- a/evals/mcpchecker/tasks/alerts/get-alerts.yaml
+++ b/evals/mcpchecker/tasks/alerts/get-alerts.yaml
@@ -14,8 +14,8 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "alerts"
-        reason: "Verify the output reports on the alert state, whether alerts are firing or not"
+        contains: "Watchdog"
+        reason: "Verify the agent retrieved firing alerts and reported the Watchdog alert"
   prompt:
     inline: |
       Check the cluster for any firing alerts and report what you find.
diff --git a/evals/mcpchecker/tasks/alerts/get-silences.yaml b/evals/mcpchecker/tasks/alerts/get-silences.yaml
index fe550e23..1f4e36ce 100644
--- a/evals/mcpchecker/tasks/alerts/get-silences.yaml
+++ b/evals/mcpchecker/tasks/alerts/get-silences.yaml
@@ -14,8 +14,8 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "silence"
-        reason: "Verify the output discusses alert silences, including when no active silences exist"
+        contains: "silences"
+        reason: "Verify the response mentions silences. The agent should report either active silences with their matchers or that no active silences exist — both are valid outcomes"
   prompt:
     inline: |
       Are there any active silences in Alertmanager?
diff --git a/evals/mcpchecker/tasks/labels/get-series.yaml b/evals/mcpchecker/tasks/labels/get-series.yaml
index 29e40259..85eecb29 100644
--- a/evals/mcpchecker/tasks/labels/get-series.yaml
+++ b/evals/mcpchecker/tasks/labels/get-series.yaml
@@ -15,11 +15,11 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "series"
-        reason: "Verify the output reports time series information"
+        contains: "namespace"
+        reason: "Verify the agent retrieved actual series data containing label dimensions like namespace"
     - llmJudge:
         contains: "kube_pod_info"
-        reason: "Verify the agent queried the correct metric"
+        reason: "Verify the agent queried the kube_pod_info metric and reported its series count and label names"
   prompt:
     inline: |
-      How many time series exist for the kube_pod_info metric? Show the cardinality.
+      How many time series exist for the kube_pod_info metric? Show the count and list the label names present.
diff --git a/evals/mcpchecker/tasks/labels/label-values.yaml b/evals/mcpchecker/tasks/labels/label-values.yaml
index fe86f6bd..aaa840ce 100644
--- a/evals/mcpchecker/tasks/labels/label-values.yaml
+++ b/evals/mcpchecker/tasks/labels/label-values.yaml
@@ -14,8 +14,8 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "namespace"
-        reason: "Verify the output lists actual namespace values from the cluster"
+        contains: "kube-system"
+        reason: "Verify the output lists actual namespace values from the cluster such as kube-system"
   prompt:
     inline: |
       What are the unique namespace values for the kube_pod_info metric?
diff --git a/evals/mcpchecker/tasks/labels/series-by-namespace.yaml b/evals/mcpchecker/tasks/labels/series-by-namespace.yaml
index 92b62df3..14d3058e 100644
--- a/evals/mcpchecker/tasks/labels/series-by-namespace.yaml
+++ b/evals/mcpchecker/tasks/labels/series-by-namespace.yaml
@@ -16,12 +16,12 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "series"
-        reason: "Verify the agent reported series information"
+        contains: "pod"
+        reason: "Verify the agent retrieved actual series data containing label dimensions like pod"
     - llmJudge:
-        contains: "monitoring"
-        reason: "Verify the agent scoped to the monitoring namespace"
+        contains: "container"
+        reason: "Verify the agent reported series with container label values from the openshift-monitoring namespace"
   prompt:
     inline: |
       How many time series exist for container_cpu_usage_seconds_total
-      in the monitoring namespace?
+      in the openshift-monitoring namespace?
diff --git a/evals/mcpchecker/tasks/metrics/list-metrics.yaml b/evals/mcpchecker/tasks/metrics/list-metrics.yaml
index ab6f01a6..2182dcad 100644
--- a/evals/mcpchecker/tasks/metrics/list-metrics.yaml
+++ b/evals/mcpchecker/tasks/metrics/list-metrics.yaml
@@ -14,8 +14,8 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "kube"
-        reason: "Verify the output lists Kubernetes metrics matching the kube prefix"
+        contains: "kube_pod_info"
+        reason: "Verify the output lists specific kube metrics discovered from Prometheus"
   prompt:
     inline: |
       List all available Prometheus metrics that contain 'kube' in the name.
diff --git a/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml b/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml
index 0b8c5640..fd932e65 100644
--- a/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml
+++ b/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml
@@ -14,8 +14,8 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "node"
-        reason: "Verify the output lists node-related metrics"
+        contains: "node_"
+        reason: "Verify the output lists specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)"
   prompt:
     inline: |
       What node-related metrics are available in Prometheus?
diff --git a/evals/mcpchecker/tasks/queries/backend-reachability.yaml b/evals/mcpchecker/tasks/queries/backend-reachability.yaml
index e981e1ef..a276cd5f 100644
--- a/evals/mcpchecker/tasks/queries/backend-reachability.yaml
+++ b/evals/mcpchecker/tasks/queries/backend-reachability.yaml
@@ -15,8 +15,8 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "up"
-        reason: "Verify the agent confirmed the metrics backend is reachable and targets are up"
+        contains: "prometheus-k8s"
+        reason: "Verify the agent queried the up metric and reported on the prometheus-k8s target"
   prompt:
     inline: |
       Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status.
diff --git a/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml b/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml
index 4d37e5d7..c0b1db9e 100644
--- a/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml
+++ b/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml
@@ -16,11 +16,11 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "cluster"
-        reason: "Verify the response provides an overall cluster health assessment"
+        contains: "Watchdog"
+        reason: "Verify the agent checked alerts and reported specific alert names such as Watchdog"
     - llmJudge:
-        contains: "node"
-        reason: "Verify the agent checked node-level health indicators"
+        contains: "kube_"
+        reason: "Verify the agent queried Kubernetes metrics (e.g. kube_node_status_condition) for health assessment"
   prompt:
     inline: |
       Is the cluster healthy? Give me an overview of any issues.
diff --git a/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml b/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml
index a8d815a6..70e71f99 100644
--- a/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml
+++ b/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml
@@ -15,8 +15,8 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "namespace"
-        reason: "Verify the agent suggests scoping the query by namespace or another dimension"
+        contains: "guardrail"
+        reason: "Verify the agent explains the query was rejected by a guardrail and suggests a scoped alternative"
   prompt:
     inline: |
       Show me the CPU, memory, network, and disk usage for every single container
diff --git a/evals/mcpchecker/tasks/queries/memory-usage.yaml b/evals/mcpchecker/tasks/queries/memory-usage.yaml
index b36fbaf5..5b5371cb 100644
--- a/evals/mcpchecker/tasks/queries/memory-usage.yaml
+++ b/evals/mcpchecker/tasks/queries/memory-usage.yaml
@@ -14,8 +14,8 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "container_memory_working_set_bytes"
-        reason: "Verify the agent queried the correct memory metric"
+        contains: "container_memory"
+        reason: "Verify the agent queried a container memory metric (working_set_bytes or usage_bytes)"
     - llmJudge:
         contains: "pod"
         reason: "Verify the response identifies specific pods"
diff --git a/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml b/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml
index 8e5e7316..cfe57668 100644
--- a/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml
+++ b/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml
@@ -16,11 +16,11 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "namespace"
-        reason: "Verify the response lists namespaces"
+        contains: "kube-system"
+        reason: "Verify the response lists actual namespace names from the cluster such as kube-system"
     - llmJudge:
-        contains: "pod"
-        reason: "Verify the response includes pod counts"
+        contains: "kube_pod"
+        reason: "Verify the agent used a kube_pod metric to count pods"
   prompt:
     inline: |
       Which namespaces have the most running pods? Show me the top 5.
diff --git a/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml b/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml
index 1d974631..1be48503 100644
--- a/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml
+++ b/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml
@@ -16,14 +16,11 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "namespace"
-        reason: "Verify the response breaks down resource usage by namespace"
+        contains: "container_cpu_usage_seconds_total"
+        reason: "Verify the agent discovered and used the correct CPU metric"
     - llmJudge:
-        contains: "cpu"
-        reason: "Verify the response includes CPU usage data"
-    - llmJudge:
-        contains: "memory"
-        reason: "Verify the response includes memory usage data"
+        contains: "container_memory"
+        reason: "Verify the agent discovered and used a container memory metric (working_set_bytes or usage_bytes)"
   prompt:
     inline: |
       Which namespace is consuming the most CPU and memory?
diff --git a/evals/mcpchecker/tasks/queries/pods-created.yaml b/evals/mcpchecker/tasks/queries/pods-created.yaml
index 9ac0b071..c27976d5 100644
--- a/evals/mcpchecker/tasks/queries/pods-created.yaml
+++ b/evals/mcpchecker/tasks/queries/pods-created.yaml
@@ -15,7 +15,7 @@ spec:
   verify:
     - llmJudge:
         contains: "kube_pod_created"
-        reason: "Verify the agent used the correct metric for pod creation timestamps"
+        reason: "Verify the agent discovered and used the kube_pod_created metric"
   prompt:
     inline: |
       How many pods were created in the last 5 minutes?
diff --git a/evals/mcpchecker/tasks/queries/time-range-query.yaml b/evals/mcpchecker/tasks/queries/time-range-query.yaml
index aa64946d..ce7f7b5c 100644
--- a/evals/mcpchecker/tasks/queries/time-range-query.yaml
+++ b/evals/mcpchecker/tasks/queries/time-range-query.yaml
@@ -15,11 +15,11 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "30 minutes"
-        reason: "Verify the agent honoured the requested 30-minute time window"
+        contains: "container_cpu_usage_seconds_total"
+        reason: "Verify the agent discovered and used the correct CPU metric"
     - llmJudge:
-        contains: "cpu"
-        reason: "Verify the response includes CPU usage data"
+        contains: "pod"
+        reason: "Verify the response includes specific pod names from the query results"
   prompt:
     inline: |
       Show me the CPU usage trend for pods in the default namespace over the last 30 minutes.
diff --git a/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml b/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml
index a323ee56..efc1a511 100644
--- a/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml
+++ b/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml
@@ -15,8 +15,8 @@ metadata:
 spec:
   verify:
     - llmJudge:
-        contains: "cpu"
-        reason: "Verify the agent queried and visualized CPU usage data"
+        contains: "container_cpu_usage_seconds_total"
+        reason: "Verify the agent discovered and visualized the correct CPU metric"
   prompt:
     inline: |
       Visualize the CPU usage for pods in the default namespace over the last 30 minutes.