diff --git a/Makefile b/Makefile index 0fabac16..f5bcc09e 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ CONTAINER_CLI ?= docker IMAGE ?= ghcr.io/rhobs/obs-mcp TAG ?= $(shell git rev-parse --short HEAD) TOOLS_DIR := hack/tools -MCPCHECKER_VERSION ?= 0.0.15 +MCPCHECKER_VERSION ?= 0.0.16 ROOT_DIR := $(shell pwd) TOOLS_BIN_DIR := $(ROOT_DIR)/tmp/bin diff --git a/evals/mcpchecker/README.md b/evals/mcpchecker/README.md index 557db82b..d8b0f4b6 100644 --- a/evals/mcpchecker/README.md +++ b/evals/mcpchecker/README.md @@ -4,7 +4,7 @@ Evaluations for obs-mcp using [mcpchecker](https://github.com/mcpchecker/mcpchec ## Pre-requisites -- [mcpchecker](https://github.com/mcpchecker/mcpchecker#install) installed (v0.0.15+) — run `make install-mcpchecker` from the repo root +- [mcpchecker](https://github.com/mcpchecker/mcpchecker#install) installed (v0.0.16+) — run `make install-mcpchecker` from the repo root - A Kubernetes/OpenShift cluster with Prometheus and Alertmanager running - obs-mcp server deployed and accessible (see [Backend Setup](#backend-setup)) diff --git a/evals/mcpchecker/tasks/alerts/alert-investigation.yaml b/evals/mcpchecker/tasks/alerts/alert-investigation.yaml index 95eed497..f0f35ce2 100644 --- a/evals/mcpchecker/tasks/alerts/alert-investigation.yaml +++ b/evals/mcpchecker/tasks/alerts/alert-investigation.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: alerts + suite: observability toolType: multi-step description: | Tests if the agent can perform multi-step alert triage: first retrieving diff --git a/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml b/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml index bed6cec4..6f9aaf91 100644 --- a/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml +++ b/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: alerts + suite: observability toolType: alertmanager description: | Tests if the agent uses the get_alerts tool with filter parameters diff --git a/evals/mcpchecker/tasks/alerts/get-alerts.yaml b/evals/mcpchecker/tasks/alerts/get-alerts.yaml index 7ba817a9..49dbd5f9 100644 --- a/evals/mcpchecker/tasks/alerts/get-alerts.yaml +++ b/evals/mcpchecker/tasks/alerts/get-alerts.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: alerts + suite: observability toolType: alertmanager description: | Tests if the agent can discover and use the get_alerts tool to retrieve diff --git a/evals/mcpchecker/tasks/alerts/get-silences.yaml b/evals/mcpchecker/tasks/alerts/get-silences.yaml index 1f4e36ce..f986c785 100644 --- a/evals/mcpchecker/tasks/alerts/get-silences.yaml +++ b/evals/mcpchecker/tasks/alerts/get-silences.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: alerts + suite: observability toolType: alertmanager description: | Tests if the agent can discover and use the get_silences tool to retrieve diff --git a/evals/mcpchecker/tasks/labels/get-series.yaml b/evals/mcpchecker/tasks/labels/get-series.yaml index 85eecb29..95645079 100644 --- a/evals/mcpchecker/tasks/labels/get-series.yaml +++ b/evals/mcpchecker/tasks/labels/get-series.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: labels + suite: observability toolType: exploration description: | Tests if the agent can use the get_series tool to check cardinality for a metric. diff --git a/evals/mcpchecker/tasks/labels/label-names.yaml b/evals/mcpchecker/tasks/labels/label-names.yaml index 9aef1c53..adf41396 100644 --- a/evals/mcpchecker/tasks/labels/label-names.yaml +++ b/evals/mcpchecker/tasks/labels/label-names.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: labels + suite: observability toolType: exploration description: | Tests if the agent follows the correct workflow: first calling list_metrics to diff --git a/evals/mcpchecker/tasks/labels/label-values.yaml b/evals/mcpchecker/tasks/labels/label-values.yaml index aaa840ce..d78e5b39 100644 --- a/evals/mcpchecker/tasks/labels/label-values.yaml +++ b/evals/mcpchecker/tasks/labels/label-values.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: labels + suite: observability toolType: exploration description: | Tests the full discovery workflow: list_metrics to verify the metric, then diff --git a/evals/mcpchecker/tasks/labels/series-by-namespace.yaml b/evals/mcpchecker/tasks/labels/series-by-namespace.yaml index 14d3058e..e3201236 100644 --- a/evals/mcpchecker/tasks/labels/series-by-namespace.yaml +++ b/evals/mcpchecker/tasks/labels/series-by-namespace.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: labels + suite: observability toolType: exploration description: | Tests if the agent can use the get_series tool with a label selector @@ -24,4 +25,5 @@ spec: prompt: inline: | How many time series exist for container_cpu_usage_seconds_total - in the openshift-monitoring namespace? + in the openshift-monitoring namespace? List the label dimensions + (such as pod, container) present in the series. diff --git a/evals/mcpchecker/tasks/metrics/list-metrics.yaml b/evals/mcpchecker/tasks/metrics/list-metrics.yaml index 2182dcad..1f5adc71 100644 --- a/evals/mcpchecker/tasks/metrics/list-metrics.yaml +++ b/evals/mcpchecker/tasks/metrics/list-metrics.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: metrics + suite: observability toolType: discovery description: | Tests if the agent can discover the list_metrics tool to find Kubernetes-related diff --git a/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml b/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml index fd932e65..f496460a 100644 --- a/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml +++ b/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: metrics + suite: observability toolType: discovery description: | Tests if the agent can discover node-related metrics using the list_metrics tool diff --git a/evals/mcpchecker/tasks/queries/backend-reachability.yaml b/evals/mcpchecker/tasks/queries/backend-reachability.yaml index a276cd5f..5a027a5a 100644 --- a/evals/mcpchecker/tasks/queries/backend-reachability.yaml +++ b/evals/mcpchecker/tasks/queries/backend-reachability.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: smoke-test description: | Smoke test to verify the agent can reach the metrics backend and get a diff --git a/evals/mcpchecker/tasks/queries/cpu-usage.yaml b/evals/mcpchecker/tasks/queries/cpu-usage.yaml index 7bff3294..f3d1ef0c 100644 --- a/evals/mcpchecker/tasks/queries/cpu-usage.yaml +++ b/evals/mcpchecker/tasks/queries/cpu-usage.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: query description: | Tests if the agent can find the appropriate CPU usage metric and query it to @@ -21,4 +22,5 @@ spec: reason: "Verify the response identifies specific pods" prompt: inline: | - Which pods are using the most CPU? + Which pods are using the most CPU? Show me the top 5. + Use Prometheus metrics and include the metric name and PromQL query you used. diff --git a/evals/mcpchecker/tasks/queries/crashlooping-pods.yaml b/evals/mcpchecker/tasks/queries/crashlooping-pods.yaml index a6de7267..9499c6a7 100644 --- a/evals/mcpchecker/tasks/queries/crashlooping-pods.yaml +++ b/evals/mcpchecker/tasks/queries/crashlooping-pods.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: range-query description: | Tests if the agent can identify crashlooping pods by discovering and querying @@ -19,3 +20,4 @@ spec: prompt: inline: | Which pods were crashlooping in the last 5 minutes? + Use Prometheus metrics and include the metric name and PromQL query you used. diff --git a/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml b/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml index c0b1db9e..6b7936f4 100644 --- a/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml +++ b/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: diagnostic description: | Tests the agent's ability to handle an ambiguous diagnostic prompt. @@ -24,4 +25,5 @@ spec: prompt: inline: | Is the cluster healthy? Give me an overview of any issues. - Check alerts, node status, and pod health. + Check alerts, node status, and pod health using Prometheus metrics. + Include the metric names and PromQL queries you used. diff --git a/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml b/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml index 70e71f99..dbb5b3bb 100644 --- a/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml +++ b/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: error-handling description: | Tests agent behavior when a query is rejected by obs-mcp guardrails diff --git a/evals/mcpchecker/tasks/queries/memory-usage.yaml b/evals/mcpchecker/tasks/queries/memory-usage.yaml index 5b5371cb..784a2ac1 100644 --- a/evals/mcpchecker/tasks/queries/memory-usage.yaml +++ b/evals/mcpchecker/tasks/queries/memory-usage.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: query description: | Tests if the agent can find the appropriate memory usage metric and query it @@ -22,3 +23,4 @@ spec: prompt: inline: | Which pods are using the most memory? Show me the top 5. + Use Prometheus metrics and include the metric name and PromQL query you used. diff --git a/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml b/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml index cfe57668..e8100682 100644 --- a/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml +++ b/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: multi-step description: | Tests multi-step reasoning: the agent must discover a suitable metric @@ -24,3 +25,4 @@ spec: prompt: inline: | Which namespaces have the most running pods? Show me the top 5. + Use Prometheus metrics and include the metric name and PromQL query you used. diff --git a/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml b/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml index 1be48503..d492f9c0 100644 --- a/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml +++ b/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: multi-step description: | Tests multi-step reasoning: the agent must discover CPU and memory metrics, @@ -25,3 +26,4 @@ spec: inline: | Which namespace is consuming the most CPU and memory? Show me the top namespace for each. + Use Prometheus metrics and include the metric names and PromQL queries you used. diff --git a/evals/mcpchecker/tasks/queries/network-traffic.yaml b/evals/mcpchecker/tasks/queries/network-traffic.yaml index a42b9bac..017cf2ff 100644 --- a/evals/mcpchecker/tasks/queries/network-traffic.yaml +++ b/evals/mcpchecker/tasks/queries/network-traffic.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: query description: | Tests if the agent can discover network-related metrics and query them to find @@ -22,3 +23,4 @@ spec: prompt: inline: | Which pods are receiving the most network traffic? + Use Prometheus metrics and include the metric name and PromQL query you used. diff --git a/evals/mcpchecker/tasks/queries/nonexistent-metric.yaml b/evals/mcpchecker/tasks/queries/nonexistent-metric.yaml index 93240f62..ed76ee25 100644 --- a/evals/mcpchecker/tasks/queries/nonexistent-metric.yaml +++ b/evals/mcpchecker/tasks/queries/nonexistent-metric.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: error-handling description: | Tests agent recovery when querying a metric that does not exist. diff --git a/evals/mcpchecker/tasks/queries/nonexistent-namespace.yaml b/evals/mcpchecker/tasks/queries/nonexistent-namespace.yaml index 476430c4..78090c0c 100644 --- a/evals/mcpchecker/tasks/queries/nonexistent-namespace.yaml +++ b/evals/mcpchecker/tasks/queries/nonexistent-namespace.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: error-handling description: | Tests agent behavior when querying for resources in a namespace that diff --git a/evals/mcpchecker/tasks/queries/pending-pods.yaml b/evals/mcpchecker/tasks/queries/pending-pods.yaml index 63c84b4a..ebc94825 100644 --- a/evals/mcpchecker/tasks/queries/pending-pods.yaml +++ b/evals/mcpchecker/tasks/queries/pending-pods.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: query description: | Tests if the agent can identify pods stuck in pending state by first discovering @@ -20,3 +21,4 @@ spec: prompt: inline: | Which pods are stuck in pending state? + Use Prometheus metrics and include the metric name and PromQL query you used. diff --git a/evals/mcpchecker/tasks/queries/pods-created.yaml b/evals/mcpchecker/tasks/queries/pods-created.yaml index c27976d5..65dd773b 100644 --- a/evals/mcpchecker/tasks/queries/pods-created.yaml +++ b/evals/mcpchecker/tasks/queries/pods-created.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: range-query description: | Tests if the agent can use a range query to find recently created pods by @@ -19,3 +20,4 @@ spec: prompt: inline: | How many pods were created in the last 5 minutes? + Include the Prometheus metric name and PromQL query you used. diff --git a/evals/mcpchecker/tasks/queries/prometheus-head-series.yaml b/evals/mcpchecker/tasks/queries/prometheus-head-series.yaml index 4d9c04a4..081bacb0 100644 --- a/evals/mcpchecker/tasks/queries/prometheus-head-series.yaml +++ b/evals/mcpchecker/tasks/queries/prometheus-head-series.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: query description: | Tests if the agent can query Prometheus internal metrics to report the current @@ -19,3 +20,4 @@ spec: prompt: inline: | How many head series does Prometheus have? + Include the Prometheus metric name and PromQL query you used. diff --git a/evals/mcpchecker/tasks/queries/prometheus-requests.yaml b/evals/mcpchecker/tasks/queries/prometheus-requests.yaml index f236eebf..32163899 100644 --- a/evals/mcpchecker/tasks/queries/prometheus-requests.yaml +++ b/evals/mcpchecker/tasks/queries/prometheus-requests.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: query description: | Tests if the agent can calculate the request rate to Prometheus by discovering @@ -19,3 +20,4 @@ spec: prompt: inline: | How many requests per second are being made to Prometheus? + Include the Prometheus metric name and PromQL query you used. diff --git a/evals/mcpchecker/tasks/queries/prometheus-wal-size.yaml b/evals/mcpchecker/tasks/queries/prometheus-wal-size.yaml index 813ca877..07eb621e 100644 --- a/evals/mcpchecker/tasks/queries/prometheus-wal-size.yaml +++ b/evals/mcpchecker/tasks/queries/prometheus-wal-size.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: query description: | Tests if the agent can query the current Prometheus WAL storage size using @@ -19,3 +20,4 @@ spec: prompt: inline: | What is the current storage size of the Prometheus WAL? + Include the Prometheus metric name and PromQL query you used. diff --git a/evals/mcpchecker/tasks/queries/time-range-query.yaml b/evals/mcpchecker/tasks/queries/time-range-query.yaml index ce7f7b5c..a4df7a39 100644 --- a/evals/mcpchecker/tasks/queries/time-range-query.yaml +++ b/evals/mcpchecker/tasks/queries/time-range-query.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: multi-step description: | Tests whether the agent correctly uses execute_range_query with @@ -23,3 +24,4 @@ spec: prompt: inline: | Show me the CPU usage trend for pods in the default namespace over the last 30 minutes. + Include the Prometheus metric name and PromQL query you used. diff --git a/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml b/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml index efc1a511..29bcd3c0 100644 --- a/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml +++ b/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml @@ -7,6 +7,7 @@ metadata: runs: 1 labels: category: queries + suite: observability toolType: visualization description: | Tests if the agent uses the show_timeseries tool to visualize CPU usage @@ -20,3 +21,4 @@ spec: prompt: inline: | Visualize the CPU usage for pods in the default namespace over the last 30 minutes. + Include the Prometheus metric name and PromQL query you used.