10 changes: 10 additions & 0 deletions evals/claude-code/eval.yaml
@@ -72,3 +72,13 @@ config:
          toolPattern: ".*"
          minToolCalls: 1
          maxToolCalls: 20
  # Observability tasks
  - glob: ../tasks/observability/*/*.yaml
    labelSelector:
      suite: observability
    assertions:
      toolsUsed:
        - server: kubernetes
          toolPattern: ".*"
          minToolCalls: 1
          maxToolCalls: 20
10 changes: 10 additions & 0 deletions evals/gemini-agent/eval.yaml
@@ -17,3 +17,13 @@ config:
      toolsUsed:
        - server: kubernetes
          toolPattern: ".*"
  # Observability tasks
  - glob: ../tasks/observability/*/*.yaml
    labelSelector:
      suite: observability
    assertions:
      toolsUsed:
        - server: kubernetes
          toolPattern: ".*"
          minToolCalls: 1
          maxToolCalls: 20
10 changes: 10 additions & 0 deletions evals/openai-agent/eval.yaml
@@ -72,3 +72,13 @@ config:
          toolPattern: ".*"
          minToolCalls: 1
          maxToolCalls: 20
  # Observability tasks
  - glob: ../tasks/observability/*/*.yaml
    labelSelector:
      suite: observability
    assertions:
      toolsUsed:
        - server: kubernetes
          toolPattern: ".*"
          minToolCalls: 1
          maxToolCalls: 20
28 changes: 28 additions & 0 deletions evals/tasks/observability/alerts/alert-investigation.yaml
@@ -0,0 +1,28 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "alert-investigation"
  difficulty: medium
  parallel: true
  runs: 1
  labels:
    category: alerts
    suite: observability
    toolType: multi-step
  description: |
    Tests if the agent can perform multi-step alert triage: first retrieving
    alerts from Alertmanager, then investigating related metrics for a
    firing alert using queries.
spec:
  verify:
    - llmJudge:
        contains: "AlertmanagerReceiversNotConfigured"
        reason: "Verify the agent retrieved firing alerts and identified AlertmanagerReceiversNotConfigured"
    - llmJudge:
        contains: "alertmanager"
        reason: "Verify the agent investigated Alertmanager-related metrics for the alert"
Comment on lines +21 to +23
⚠️ Potential issue | 🟡 Minor

The second judge's contains: "alertmanager" check is redundant and weak.

Substring alertmanager is already contained in AlertmanagerReceiversNotConfigured (checked above) and appears naturally in any narrative about Alertmanager, so it doesn't independently verify that the agent investigated related metrics. Consider asserting on a concrete Alertmanager metric name the agent should have queried (e.g., alertmanager_notifications_failed_total, alertmanager_config_last_reload_successful) to actually validate the "investigate related metrics" step.
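For illustration, the second llmJudge entry could become something like the following, where alertmanager_notifications_failed_total is an assumed example of a metric the agent would query (any concrete Alertmanager metric works):

  - llmJudge:
      contains: "alertmanager_notifications_failed_total"
      reason: "Verify the agent queried a concrete Alertmanager metric while investigating the alert"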

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@evals/tasks/observability/alerts/alert-investigation.yaml` around lines 21-23, replace the redundant llmJudge `contains: "alertmanager"` check with a
concrete Alertmanager metric assertion so the judge verifies metric
investigation; specifically update the llmJudge entry (the second one shown) to
assert for a specific metric name such as
`alertmanager_notifications_failed_total` or
`alertmanager_config_last_reload_successful` instead of the substring
"alertmanager" so the check ensures the agent queried Alertmanager metrics.

  prompt:
    inline: |
      Check if there are any firing alerts. If there are, investigate
      the related metrics for the most critical alert and summarize
      what's happening.
24 changes: 24 additions & 0 deletions evals/tasks/observability/alerts/filtered-alerts.yaml
@@ -0,0 +1,24 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "filtered-alerts"
  difficulty: medium
  parallel: true
  runs: 1
  labels:
    category: alerts
    suite: observability
    toolType: alertmanager
  description: |
    Tests if the agent uses the get_alerts tool with filter parameters
    to retrieve only active alerts matching a specific severity. The agent
    should pass appropriate filter arguments rather than fetching all alerts
    and filtering client-side.
spec:
  verify:
    - llmJudge:
        contains: "AlertmanagerReceiversNotConfigured"
        reason: "Verify the agent filtered for warning-severity alerts and found AlertmanagerReceiversNotConfigured"
  prompt:
    inline: |
      Show me only the active alerts with severity=warning.
Comment on lines +17 to +24
⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check whether the eval framework supports asserting tool-call arguments (not just tool patterns/counts).
rg -nP -C3 '(toolPattern|minToolCalls|maxToolCalls|toolArgs|arguments)' --type=yaml -g 'evals/**'
rg -nP -C3 '(ToolPattern|ToolArgs|Arguments|MinToolCalls)' -g '!vendor/**' -g '!**/testdata/**'

Repository: openshift/openshift-mcp-server

Length of output: 50389


🏁 Script executed:

cat -n evals/tasks/observability/alerts/filtered-alerts.yaml

Repository: openshift/openshift-mcp-server

Length of output: 1010


🏁 Script executed:

# Search for task.yaml files with tool argument assertions to understand the proper structure
find evals/tasks -name "*.yaml" -type f | head -20 | xargs -I {} sh -c 'echo "=== {} ===" && grep -A 10 "toolPattern\|args:" {} | head -15'

Repository: openshift/openshift-mcp-server

Length of output: 1473


🏁 Script executed:

# Look at the netedge example that showed args field
cat -n evals/tasks/netedge/query_prometheus/task.yaml

Repository: openshift/openshift-mcp-server

Length of output: 519


🏁 Script executed:

# Search for other observability tasks to understand the pattern
find evals/tasks/observability -name "*.yaml" -type f | xargs cat

Repository: openshift/openshift-mcp-server

Length of output: 20635


Use a tool-call argument assertion to verify the severity=warning filter is passed to get_alerts.

The current verify section only checks that AlertmanagerReceiversNotConfigured appears in the output; it does not verify the agent passed a severity=warning filter to the tool. An agent that fetched all alerts would still pass this check. The eval framework supports argument assertions (see evals/tasks/netedge/query_prometheus/task.yaml for the pattern), so add an args constraint under assertions.toolsUsed to verify the severity parameter was actually supplied:

assertions:
  toolsUsed:
    - server: kubernetes
      toolPattern: "get_alerts"
      args:
        severity: "warning"

Also note: relying on AlertmanagerReceiversNotConfigured to always fire may be fragile. If a cluster configures Alertmanager receivers, this alert will be absent and the eval will fail unrelated to agent behavior. Consider using a more reliable alert (like Watchdog, which is standard across OpenShift clusters) or make the alert name configurable.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@evals/tasks/observability/alerts/filtered-alerts.yaml` around lines 17-24: the verify step only checks for the presence of
AlertmanagerReceiversNotConfigured in the output and doesn’t assert that the
agent passed a severity=warning filter to the get_alerts tool; update the verify
block to add an assertions.toolsUsed entry that matches toolPattern "get_alerts"
and includes args with severity: "warning" so the eval framework verifies the
agent actually supplied the severity filter, and consider replacing or making
AlertmanagerReceiversNotConfigured configurable (or use a more reliable alert
like "Watchdog") to avoid false failures when that specific alert is absent.

22 changes: 22 additions & 0 deletions evals/tasks/observability/alerts/get-alerts.yaml
@@ -0,0 +1,22 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "get-alerts"
  difficulty: easy
  parallel: true
  runs: 1
  labels:
    category: alerts
    suite: observability
    toolType: alertmanager
  description: |
    Tests if the agent can discover and use the get_alerts tool to retrieve
    currently firing alerts from Alertmanager.
spec:
  verify:
    - llmJudge:
        contains: "Watchdog"
        reason: "Verify the agent retrieved firing alerts and reported the Watchdog alert"
Comment on lines +17 to +19
⚠️ Potential issue | 🟠 Major

Watchdog-only assertion is brittle across environments.

Requiring a specific alert name can fail valid runs on clusters where Watchdog is not firing, and can still pass via hallucinated output. Prefer asserting on retrieved alert structure/content (for example alertname + status=firing) rather than one fixed alert name.

Suggested adjustment
   verify:
     - llmJudge:
-        contains: "Watchdog"
-        reason: "Verify the agent retrieved firing alerts and reported the Watchdog alert"
+        contains: "alertname"
+        reason: "Verify the response includes concrete alert fields from retrieved firing alerts"
+    - llmJudge:
+        contains: "firing"
+        reason: "Verify the response reports firing-state alerts from Alertmanager output"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@evals/tasks/observability/alerts/get-alerts.yaml` around lines 17-19: the
llmJudge assertion is brittle because it checks for a literal "Watchdog"; change
the test to assert on alert structure/content instead of a single name: modify
the llmJudge block to validate that the retrieved alerts include at least one
entry with an "alertname" field and a "status" equal to "firing" (or equivalent
key/value pair), and update the "reason" to reflect this structural check;
reference the llmJudge assertion and its "contains" usage to replace the string
match with a structural/regex or JSON-path style check that looks for alertname
+ status=firing.

  prompt:
    inline: |
      Check the cluster for any firing alerts and report what you find.
22 changes: 22 additions & 0 deletions evals/tasks/observability/alerts/get-silences.yaml
@@ -0,0 +1,22 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "get-silences"
  difficulty: easy
  parallel: true
  runs: 1
  labels:
    category: alerts
    suite: observability
    toolType: alertmanager
  description: |
    Tests if the agent can discover and use the get_silences tool to retrieve
    active silences from Alertmanager.
spec:
  verify:
    - llmJudge:
        contains: "silences"
        reason: "Verify the response mentions silences. The agent should report either active silences with their matchers or that no active silences exist — both are valid outcomes"
Comment on lines +17 to +19
⚠️ Potential issue | 🟠 Major

Judge token is prompt-leaked and allows false positives.

contains: "silences" is too generic because the same term is in the prompt, so a non-instrumented answer can pass.

Suggested adjustment
   verify:
     - llmJudge:
-        contains: "silences"
-        reason: "Verify the response mentions silences. The agent should report either active silences with their matchers or that no active silences exist — both are valid outcomes"
+        contains: "matchers"
+        reason: "Verify the response includes concrete silence details when active silences exist"
+    - llmJudge:
+        contains: "no active silences"
+        reason: "Allow explicit empty-state reporting when no silences are present"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@evals/tasks/observability/alerts/get-silences.yaml` around lines 17-19: the
llmJudge check is too generic and subject to prompt-leakage because it only
asserts contains: "silences"; update the llmJudge assertion to look for more
specific, non-leaked indicators such as a phrase or regex that proves the agent
inspected silences (e.g. require "active silences" or "No active silences exist"
OR require presence of matcher fields like "matchers:" or a matcher key
pattern), by replacing the simple contains with a stricter string or regex match
(refer to llmJudge and the contains entry in the YAML) so only genuine
instrumented responses pass.

  prompt:
    inline: |
      Are there any active silences in Alertmanager?
26 changes: 26 additions & 0 deletions evals/tasks/observability/labels/get-series.yaml
@@ -0,0 +1,26 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "get-series-cardinality"
  difficulty: medium
  parallel: true
  runs: 1
  labels:
    category: labels
    suite: observability
    toolType: exploration
  description: |
    Tests if the agent can use the get_series tool to check cardinality for a metric.
    The agent should first verify the metric exists via list_metrics, then use
    get_series to retrieve matching time series and report the count.
spec:
  verify:
    - llmJudge:
        contains: "namespace"
        reason: "Verify the agent retrieved actual series data containing label dimensions like namespace"
    - llmJudge:
        contains: "kube_pod_info"
        reason: "Verify the agent queried the kube_pod_info metric and reported its cardinality"
  prompt:
    inline: |
      How many time series exist for the kube_pod_info metric? Show the count and list the label names present.
Comment on lines +17 to +26
⚠️ Potential issue | 🟠 Major

Both judge tokens are weak: one is in the prompt, the other is generic.

kube_pod_info is mentioned verbatim in the prompt, so the agent can pass that judge by merely restating the metric name without ever invoking list_metrics or get_series. namespace is a generic word that is almost certainly in any reasonable answer (and is implied by the prompt's "label names"), so it also doesn't prove real tool usage.

Suggest requiring evidence only a real get_series result can produce — e.g., a numeric cardinality count in the response, concrete label values observed in this cluster (kube-system, openshift-monitoring), or specific label keys returned by the API (pod, uid, host_ip). Complementing with a tool-call assertion on get_series would strengthen this further.
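One possible shape for this, where the label key uid and the namespace value kube-system are assumptions about what a live get_series result for kube_pod_info would contain:

verify:
  - llmJudge:
      contains: "uid"
      reason: "Verify the response lists label keys (e.g. uid, host_ip) that only a real get_series result would surface"
  - llmJudge:
      contains: "kube-system"
      reason: "Verify the response includes concrete namespace values observed in the series data"
assertions:
  toolsUsed:
    - server: kubernetes
      toolPattern: "get_series"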

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@evals/tasks/observability/labels/get-series.yaml` around lines 17-26: the
current llmJudge checks are too weak (they allow restating "kube_pod_info" or
generic "namespace"); update the verify section to require evidence only a real
get_series call can produce by (1) adding an llmJudge that asserts a numeric
cardinality (e.g., a regex or type check for an integer count) for the
kube_pod_info series, (2) requiring at least one concrete label value observed
(e.g., "kube-system" or "openshift-monitoring") and at least one specific label
key from the API (e.g., "pod", "uid", "host_ip"), and (3) adding a tool-call
assertion that the agent invoked get_series (or list_metrics) so the judge only
passes when the tool was actually used. Ensure references to the metric name
kube_pod_info and the tool name get_series are present in the updated verify
checks.

26 changes: 26 additions & 0 deletions evals/tasks/observability/labels/label-names.yaml
@@ -0,0 +1,26 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "label-names"
  difficulty: easy
  parallel: true
  runs: 1
  labels:
    category: labels
    suite: observability
    toolType: exploration
  description: |
    Tests if the agent follows the correct workflow: first calling list_metrics to
    verify kube_pod_info exists, then calling get_label_names to discover available
    labels for that metric.
spec:
  verify:
    - llmJudge:
        contains: "namespace"
        reason: "Verify the output includes the namespace label which is a standard Kubernetes label"
    - llmJudge:
        contains: "pod"
        reason: "Verify the output includes the pod label"
  prompt:
    inline: |
      What labels are available for the kube_pod_info metric?
22 changes: 22 additions & 0 deletions evals/tasks/observability/labels/label-values.yaml
@@ -0,0 +1,22 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "label-values"
  difficulty: medium
  parallel: true
  runs: 1
  labels:
    category: labels
    suite: observability
    toolType: exploration
  description: |
    Tests the full discovery workflow: list_metrics to verify the metric, then
    get_label_values to retrieve unique namespace values for kube_pod_info.
spec:
  verify:
    - llmJudge:
        contains: "kube-system"
        reason: "Verify the output lists actual namespace values from the cluster such as kube-system"
  prompt:
    inline: |
      What are the unique namespace values for the kube_pod_info metric?
28 changes: 28 additions & 0 deletions evals/tasks/observability/labels/series-by-namespace.yaml
@@ -0,0 +1,28 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "series-by-namespace"
  difficulty: medium
  parallel: true
  runs: 1
  labels:
    category: labels
    suite: observability
    toolType: exploration
  description: |
    Tests if the agent can use the get_series tool with a label selector
    to find time series scoped to a specific namespace. The agent should
    first verify the metric exists, then use get_series with a namespace
    matcher to report the cardinality within that scope.
spec:
  verify:
    - llmJudge:
        contains: "pod"
        reason: "Verify the agent retrieved actual series data containing label dimensions like pod"
    - llmJudge:
        contains: "container"
        reason: "Verify the agent reported series with container label values from the namespace"
  prompt:
    inline: |
      How many time series exist for container_cpu_usage_seconds_total
      in the openshift-monitoring namespace?
Comment on lines +19 to +28
⚠️ Potential issue | 🟠 Major

contains: "container" is trivially satisfied by the prompt.

container_cpu_usage_seconds_total is in the prompt itself, so any response that even restates the metric name passes the container judge without proving the agent ever called get_series or returned real label values. pod is also a very generic token likely to appear in any plausible narrative.

Recommend asserting on evidence only obtainable from a real get_series call against openshift-monitoring, e.g. concrete label values (namespace="openshift-monitoring", a real pod name like prometheus-k8s-0, or a container value like prometheus), or assert a numeric cardinality is reported. A paired tool-call assertion on get_series would further reduce false positives.
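A sketch of that approach, where prometheus-k8s-0 is an assumed pod name that live series data from openshift-monitoring would contain:

verify:
  - llmJudge:
      contains: "prometheus-k8s-0"
      reason: "Verify the response names a real pod from openshift-monitoring, which only live get_series data can supply"
assertions:
  toolsUsed:
    - server: kubernetes
      toolPattern: "get_series"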

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@evals/tasks/observability/labels/series-by-namespace.yaml` around lines 19-28: the llmJudge checks ("contains: \"container\"" and "contains: \"pod\"") are
too generic and can be satisfied by the prompt itself; update the assertions in
this YAML to require evidence only obtainable from a real get_series call (for
example assert concrete label values like namespace="openshift-monitoring", a
real pod name such as prometheus-k8s-0, or a container value like prometheus),
or assert a numeric cardinality for container_cpu_usage_seconds_total in
openshift-monitoring; additionally add a paired tool-call assertion that
verifies the get_series tool was invoked for the openshift-monitoring namespace
to prevent false positives (referencing the llmJudge entries and the prompt
inline for where to change the checks).

22 changes: 22 additions & 0 deletions evals/tasks/observability/metrics/list-metrics.yaml
@@ -0,0 +1,22 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "list-kube-metrics"
  difficulty: easy
  parallel: true
  runs: 1
  labels:
    category: metrics
    suite: observability
    toolType: discovery
  description: |
    Tests if the agent can discover the list_metrics tool to find Kubernetes-related
    metrics. The agent should use the name_regex parameter to filter for kube metrics.
Comment on lines +12 to +14
⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
rg -nP -C2 '"list_metrics"|name_regex' pkg/
rg -nP -C3 'initPrometheus|initAlertmanager' pkg/toolsets/observability/

Repository: openshift/openshift-mcp-server

Length of output: 3364


🏁 Script executed:

cat -n evals/tasks/observability/metrics/list-metrics.yaml | head -25

Repository: openshift/openshift-mcp-server

Length of output: 882


🏁 Script executed:

rg -nP 'label.?values|__name__|ListMetrics' pkg/toolsets/observability/ pkg/prometheus/

Repository: openshift/openshift-mcp-server

Length of output: 474


🏁 Script executed:

sed -n '23,60p' pkg/toolsets/observability/prometheus.go
sed -n '63,120p' pkg/toolsets/observability/prometheus.go

Repository: openshift/openshift-mcp-server

Length of output: 3696


Description references a list_metrics tool that doesn't exist in the observability toolset.

The registered tools are only prometheus_query, prometheus_query_range, and alertmanager_alerts. There is no list_metrics tool and no name_regex parameter. To list metrics matching a pattern (e.g., containing 'kube'), the agent would need to use a PromQL query like {__name__=~"kube.*"} with the available prometheus_query tool. Update the task description and prompt to match the actual tools exposed by the server.
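If the finding holds, the description could be reworded roughly like this (phrasing illustrative; the prompt needs no change since it doesn't name a tool):

  description: |
    Tests if the agent can discover Kubernetes-related metrics by running a
    PromQL name-match query such as {__name__=~"kube.*"} via the
    prometheus_query tool.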

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@evals/tasks/observability/metrics/list-metrics.yaml` around lines 12-14, update the task description and agent prompt to remove references to the
non-existent list_metrics tool and the name_regex parameter and instead instruct
the agent to use the existing prometheus_query (or prometheus_query_range) tool
with a PromQL name-match filter; specifically mention using a query like
`{__name__=~"kube.*"}` to list metrics containing "kube", and reference the
observable tool names prometheus_query and prometheus_query_range so the task
aligns with the actual observability toolset.

spec:
  verify:
    - llmJudge:
        contains: "kube_pod_info"
        reason: "Verify the output lists specific kube metrics discovered from Prometheus"
  prompt:
    inline: |
      List all available Prometheus metrics that contain 'kube' in the name.
22 changes: 22 additions & 0 deletions evals/tasks/observability/metrics/list-node-metrics.yaml
@@ -0,0 +1,22 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "list-node-metrics"
  difficulty: easy
  parallel: true
  runs: 1
  labels:
    category: metrics
    suite: observability
    toolType: discovery
  description: |
    Tests if the agent can discover node-related metrics using the list_metrics tool
    with a regex filter for node metrics.
spec:
  verify:
    - llmJudge:
        contains: "node_"
        reason: "Verify the output lists specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)"
Comment on lines +17 to +19
⚠️ Potential issue | 🟡 Minor

Weak llmJudge.contains assertion — prone to false positives.

The substring node_ is a very common Prometheus metric prefix and can easily appear in a response even when the agent does not actually call list_metrics (e.g., the model can recall node_cpu_seconds_total from training). This echoes the reviewer feedback on the PR about overly generic contains values allowing agents to pass without real tool use. Consider pairing with a tool-usage assertion in the taskSet (minToolCalls: 1 is already enforced, but you may want to further constrain the tool pattern to list_metrics for discovery tasks) or adding a second llmJudge with a more discriminating substring (e.g., a metric name unlikely to be hallucinated, or a phrase only present in live list_metrics output).
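For instance, a second judge pinned to a full metric name, assuming node_cpu_seconds_total is exported by this cluster's node exporter:

  - llmJudge:
      contains: "node_cpu_seconds_total"
      reason: "Verify the output includes a complete node metric name as returned by a real list_metrics call"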

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@evals/tasks/observability/metrics/list-node-metrics.yaml` around lines 17-19, replace the weak llmJudge.contains "node_" check with a stricter assertion:
either add a second llmJudge entry that checks for a specific,
less-likely-to-be-hallucinated metric name (e.g., "node_cpu_seconds_total" or
another long metric string) or tighten tool-usage validation to assert the tool
pattern equals "list_metrics" (in addition to the existing minToolCalls: 1) so
the task requires an actual call to list_metrics; update the
list-node-metrics.yaml llmJudge and taskSet entries accordingly to reference
llmJudge.contains and the tool pattern "list_metrics" or add a new llmJudge with
the discriminating substring.

  prompt:
    inline: |
      What node-related metrics are available in Prometheus?
23 changes: 23 additions & 0 deletions evals/tasks/observability/queries/backend-reachability.yaml
@@ -0,0 +1,23 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "backend-reachability"
  difficulty: easy
  parallel: true
  runs: 1
  labels:
    category: queries
    suite: observability
    toolType: smoke-test
  description: |
    Smoke test to verify the agent can reach the metrics backend and get a
    valid response. The agent should query the up metric and confirm the
    backend is reachable before any complex eval tasks run.
spec:
  verify:
    - llmJudge:
        contains: "prometheus-k8s"
        reason: "Verify the agent queried the up metric and reported on the prometheus-k8s target"
  prompt:
    inline: |
      Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status.
Comment on lines +18 to +23
⚠️ Potential issue | 🟠 Major

False-positive risk: judge token is present verbatim in the prompt.

The prompt contains up{job="prometheus-k8s"}, so the agent can satisfy contains: "prometheus-k8s" by merely repeating the prompt (e.g., "I will query up{job=\"prometheus-k8s\"}…") without actually reaching the metrics backend. This is the same class of false-positive pass that earlier reviewers flagged on other tasks.

Consider validating on something that can only be known from a successful tool call — for example, the numeric result (1 / up state), an instance= label value returned by Prometheus, or an explicit statement that the target is up/reachable. Pairing with a tool-usage assertion on the Prometheus query tool would make this much more robust.
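A sketch combining both ideas; the toolPattern value assumes the query tool is registered as prometheus_query, per the list-metrics finding above:

verify:
  - llmJudge:
      contains: "instance="
      reason: "Verify the response includes an instance label value that only a live Prometheus result would contain"
assertions:
  toolsUsed:
    - server: kubernetes
      toolPattern: "prometheus_query"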

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@evals/tasks/observability/queries/backend-reachability.yaml` around lines 18-23: the current llmJudge `contains: "prometheus-k8s"` can be spoofed by
echoing the prompt token; update the validation to require evidence only
obtainable from an actual Prometheus query — for example change the judge rule
to assert the numeric up value or an `instance=` label (e.g., expect `"1"` or
`"instance="` in the response) or require the text "up" / "reachable" alongside
a numeric result, and pair this with a tool-usage assertion for the Prometheus
query; modify the `llmJudge` block and the `prompt.inline` (the
`up{job="prometheus-k8s"}` query) so the judge checks the query result (numeric
or instance label) rather than just the literal job name.

25 changes: 25 additions & 0 deletions evals/tasks/observability/queries/cpu-usage.yaml
@@ -0,0 +1,25 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "cpu-usage"
  difficulty: medium
  parallel: true
  runs: 1
  labels:
    category: queries
    suite: observability
    toolType: query
  description: |
    Tests if the agent can find the appropriate CPU usage metric and query it to
    determine which pods are consuming the most CPU resources.
spec:
  verify:
    - llmJudge:
        contains: "container_cpu_usage_seconds_total"
        reason: "Verify the agent queried the correct CPU metric"
    - llmJudge:
        contains: "pod"
        reason: "Verify the response identifies specific pods"
Comment on lines +20 to +22
⚠️ Potential issue | 🟠 Major

False-positive risk: contains: "pod" is trivially satisfied.

This is exactly the concern raised by reviewers on the PR: the prompt is "Which pods are using the most CPU?", so any response that merely restates or references the prompt (including a refusal or a wrong answer) will contain the substring pod and pass this assertion without the agent actually identifying real pods.

Consider replacing this with a stricter check — e.g., require a concrete pod-identifying field/label such as pod= or pod_name, or rely solely on the container_cpu_usage_seconds_total assertion and remove this one.

Suggested diff
     - llmJudge:
-        contains: "pod"
-        reason: "Verify the response identifies specific pods"
+        contains: "pod="
+        reason: "Verify the response identifies specific pods by label (e.g., pod=\"<name>\")"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@evals/tasks/observability/queries/cpu-usage.yaml` around lines 20-22: the
llmJudge assertion `contains: "pod"` is too weak and causes false positives;
update the llmJudge in cpu-usage.yaml to assert a concrete pod identifier (for
example require `pod_name`, `pod=`, or a Kubernetes UID pattern) or remove the
`contains: "pod"` check and rely on the existing
`container_cpu_usage_seconds_total` assertion instead so the judge only passes
when an actual pod identifier is present. Target the `llmJudge` entry and
replace the `contains: "pod"` condition with a stricter token such as `contains:
"pod_name"` or `contains: "pod="` (or delete that assertion) to ensure responses
list real pod identifiers.

  prompt:
    inline: |
      Which pods are using the most CPU?
22 changes: 22 additions & 0 deletions evals/tasks/observability/queries/crashlooping-pods.yaml
@@ -0,0 +1,22 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "crashlooping-pods"
  difficulty: medium
  parallel: true
  runs: 1
  labels:
    category: queries
    suite: observability
    toolType: range-query
  description: |
    Tests if the agent can identify crashlooping pods by discovering and querying
    the kube_pod_container_status_restarts_total metric over a time range.
spec:
  verify:
    - llmJudge:
        contains: "kube_pod_container_status_restarts_total"
        reason: "Verify the agent used the correct metric for container restart counts"
  prompt:
    inline: |
      Which pods were crashlooping in the last 5 minutes?
28 changes: 28 additions & 0 deletions evals/tasks/observability/queries/diagnose-cluster-health.yaml
@@ -0,0 +1,28 @@
kind: Task
apiVersion: mcpchecker/v1alpha2
metadata:
  name: "diagnose-cluster-health"
  difficulty: hard
  parallel: true
  runs: 1
  labels:
    category: queries
    suite: observability
    toolType: diagnostic
  description: |
    Tests the agent's ability to handle an ambiguous diagnostic prompt.
    The agent must autonomously decide which tools and metrics to check,
    ideally covering alerts, node status, and pod health without being
    told exactly what to look for.
spec:
  verify:
    - llmJudge:
        contains: "Watchdog"
        reason: "Verify the agent checked alerts and reported specific alert names such as Watchdog"
    - llmJudge:
        contains: "kube_"
        reason: "Verify the agent queried Kubernetes metrics (e.g. kube_node_status_condition) for health assessment"
Comment on lines +22 to +24
⚠️ Potential issue | 🟡 Minor

contains: "kube_" is a very loose marker.

kube_ will match any casual mention of a kube_* metric name in the agent's narrative (including hypothetical references like "I would check kube_node_status_condition") without proof that the agent actually invoked a metrics tool. Consider tightening to a concrete metric the agent is expected to return results from (e.g., kube_node_status_condition or kube_pod_status_phase), or complement with a tool-usage assertion against the metrics query tool so the judge doesn't pass on prose alone.
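A tightened second judge, assuming kube_node_status_condition is among the metrics a correct run would actually query:

  - llmJudge:
      contains: "kube_node_status_condition"
      reason: "Verify the agent queried a concrete node-health metric rather than merely mentioning kube_ names"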

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@evals/tasks/observability/queries/diagnose-cluster-health.yaml` around lines
22-24: the llmJudge rule using contains: "kube_" is too loose; update the
llmJudge block to assert a concrete metric (e.g., contains:
"kube_node_status_condition" or "kube_pod_status_phase") and/or add a
complementary tool-usage assertion that checks the metrics query tool output
(for example assert that the metrics tool returned results or that a field like
metrics_query_response contains those metric names); target the llmJudge entry
and replace the generic contains: "kube_" with the chosen concrete metric string
and/or add an additional assertion verifying the tool invocation/response so the
judge cannot pass on narrative mentions alone.

  prompt:
    inline: |
      Is the cluster healthy? Give me an overview of any issues.
      Check alerts, node status, and pod health.