rhobs · openshift-merge-bot · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
@@ -6,7 +6,7 @@ CONTAINER_CLI ?= docker
 IMAGE ?= ghcr.io/rhobs/obs-mcp
 TAG ?= $(shell git rev-parse --short HEAD)
 TOOLS_DIR := hack/tools
-MCPCHECKER_VERSION ?= 0.0.15
+MCPCHECKER_VERSION ?= 0.0.16
 
 ROOT_DIR := $(shell pwd)
 TOOLS_BIN_DIR := $(ROOT_DIR)/tmp/bin

@@ -4,7 +4,7 @@ Evaluations for obs-mcp using [mcpchecker](https://github.com/mcpchecker/mcpchec
 
 ## Pre-requisites
 
-- [mcpchecker](https://github.com/mcpchecker/mcpchecker#install) installed (v0.0.15+) — run `make install-mcpchecker` from the repo root
+- [mcpchecker](https://github.com/mcpchecker/mcpchecker#install) installed (v0.0.16+) — run `make install-mcpchecker` from the repo root
 - A Kubernetes/OpenShift cluster with Prometheus and Alertmanager running
 - obs-mcp server deployed and accessible (see [Backend Setup](#backend-setup))
 

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: alerts
+    suite: observability
     toolType: multi-step
   description: |
     Tests if the agent can perform multi-step alert triage: first retrieving

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: alerts
+    suite: observability
     toolType: alertmanager
   description: |
     Tests if the agent uses the get_alerts tool with filter parameters

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: alerts
+    suite: observability
     toolType: alertmanager
   description: |
     Tests if the agent can discover and use the get_alerts tool to retrieve

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: alerts
+    suite: observability
     toolType: alertmanager
   description: |
     Tests if the agent can discover and use the get_silences tool to retrieve

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: labels
+    suite: observability
     toolType: exploration
   description: |
     Tests if the agent can use the get_series tool to check cardinality for a metric.

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: labels
+    suite: observability
     toolType: exploration
   description: |
     Tests if the agent follows the correct workflow: first calling list_metrics to

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: labels
+    suite: observability
     toolType: exploration
   description: |
     Tests the full discovery workflow: list_metrics to verify the metric, then

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: labels
+    suite: observability
     toolType: exploration
   description: |
     Tests if the agent can use the get_series tool with a label selector
@@ -24,4 +25,5 @@ spec:
   prompt:
     inline: |
       How many time series exist for container_cpu_usage_seconds_total
-      in the openshift-monitoring namespace?
+      in the openshift-monitoring namespace? List the label dimensions
+      (such as pod, container) present in the series.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: metrics
+    suite: observability
     toolType: discovery
   description: |
     Tests if the agent can discover the list_metrics tool to find Kubernetes-related

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: metrics
+    suite: observability
     toolType: discovery
   description: |
     Tests if the agent can discover node-related metrics using the list_metrics tool

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: smoke-test
   description: |
     Smoke test to verify the agent can reach the metrics backend and get a

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: query
   description: |
     Tests if the agent can find the appropriate CPU usage metric and query it to
@@ -21,4 +22,5 @@ spec:
         reason: "Verify the response identifies specific pods"
   prompt:
     inline: |
-      Which pods are using the most CPU?
+      Which pods are using the most CPU? Show me the top 5.
+      Use Prometheus metrics and include the metric name and PromQL query you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: range-query
   description: |
     Tests if the agent can identify crashlooping pods by discovering and querying
@@ -19,3 +20,4 @@ spec:
   prompt:
     inline: |
       Which pods were crashlooping in the last 5 minutes?
+      Use Prometheus metrics and include the metric name and PromQL query you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: diagnostic
   description: |
     Tests the agent's ability to handle an ambiguous diagnostic prompt.
@@ -24,4 +25,5 @@ spec:
   prompt:
     inline: |
       Is the cluster healthy? Give me an overview of any issues.
-      Check alerts, node status, and pod health.
+      Check alerts, node status, and pod health using Prometheus metrics.
+      Include the metric names and PromQL queries you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: error-handling
   description: |
     Tests agent behavior when a query is rejected by obs-mcp guardrails

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: query
   description: |
     Tests if the agent can find the appropriate memory usage metric and query it
@@ -22,3 +23,4 @@ spec:
   prompt:
     inline: |
       Which pods are using the most memory? Show me the top 5.
+      Use Prometheus metrics and include the metric name and PromQL query you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: multi-step
   description: |
     Tests multi-step reasoning: the agent must discover a suitable metric
@@ -24,3 +25,4 @@ spec:
   prompt:
     inline: |
       Which namespaces have the most running pods? Show me the top 5.
+      Use Prometheus metrics and include the metric name and PromQL query you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: multi-step
   description: |
     Tests multi-step reasoning: the agent must discover CPU and memory metrics,
@@ -25,3 +26,4 @@ spec:
     inline: |
       Which namespace is consuming the most CPU and memory?
       Show me the top namespace for each.
+      Use Prometheus metrics and include the metric names and PromQL queries you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: query
   description: |
     Tests if the agent can discover network-related metrics and query them to find
@@ -22,3 +23,4 @@ spec:
   prompt:
     inline: |
       Which pods are receiving the most network traffic?
+      Use Prometheus metrics and include the metric name and PromQL query you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: error-handling
   description: |
     Tests agent recovery when querying a metric that does not exist.

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: error-handling
   description: |
     Tests agent behavior when querying for resources in a namespace that

@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: query
   description: |
     Tests if the agent can identify pods stuck in pending state by first discovering
@@ -20,3 +21,4 @@ spec:
   prompt:
     inline: |
       Which pods are stuck in pending state?
+      Use Prometheus metrics and include the metric name and PromQL query you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: range-query
   description: |
     Tests if the agent can use a range query to find recently created pods by
@@ -19,3 +20,4 @@ spec:
   prompt:
     inline: |
       How many pods were created in the last 5 minutes?
+      Include the Prometheus metric name and PromQL query you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: query
   description: |
     Tests if the agent can query Prometheus internal metrics to report the current
@@ -19,3 +20,4 @@ spec:
   prompt:
     inline: |
       How many head series does Prometheus have?
+      Include the Prometheus metric name and PromQL query you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: query
   description: |
     Tests if the agent can calculate the request rate to Prometheus by discovering
@@ -19,3 +20,4 @@ spec:
   prompt:
     inline: |
       How many requests per second are being made to Prometheus?
+      Include the Prometheus metric name and PromQL query you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: query
   description: |
     Tests if the agent can query the current Prometheus WAL storage size using
@@ -19,3 +20,4 @@ spec:
   prompt:
     inline: |
       What is the current storage size of the Prometheus WAL?
+      Include the Prometheus metric name and PromQL query you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: multi-step
   description: |
     Tests whether the agent correctly uses execute_range_query with
@@ -23,3 +24,4 @@ spec:
   prompt:
     inline: |
       Show me the CPU usage trend for pods in the default namespace over the last 30 minutes.
+      Include the Prometheus metric name and PromQL query you used.
@@ -7,6 +7,7 @@ metadata:
   runs: 1
   labels:
     category: queries
+    suite: observability
     toolType: visualization
   description: |
     Tests if the agent uses the show_timeseries tool to visualize CPU usage
@@ -20,3 +21,4 @@ spec:
   prompt:
     inline: |
       Visualize the CPU usage for pods in the default namespace over the last 30 minutes.
+      Include the Prometheus metric name and PromQL query you used.