diff --git a/evals/mcpchecker/tasks/alerts/alert-investigation.yaml b/evals/mcpchecker/tasks/alerts/alert-investigation.yaml index 4388a72d..95eed497 100644 --- a/evals/mcpchecker/tasks/alerts/alert-investigation.yaml +++ b/evals/mcpchecker/tasks/alerts/alert-investigation.yaml @@ -10,19 +10,18 @@ metadata: toolType: multi-step description: | Tests if the agent can perform multi-step alert triage: first retrieving - alerts from Alertmanager, then investigating related metrics for the most - critical alert using Prometheus queries. + alerts from Alertmanager, then investigating related metrics for a + firing alert using queries. spec: verify: - llmJudge: - contains: "alert" - reason: "Verify the agent retrieved alerts from Alertmanager" + contains: "AlertmanagerReceiversNotConfigured" + reason: "Verify the agent retrieved firing alerts and identified AlertmanagerReceiversNotConfigured" - llmJudge: - contains: "metric" - reason: "Verify the agent investigated a related Prometheus metric" + contains: "alertmanager" + reason: "Verify the agent investigated Alertmanager-related metrics for the alert" prompt: inline: | Check if there are any firing alerts. If there are, investigate the related metrics for the most critical alert and summarize - what's happening. If there are no firing alerts, check cluster - health metrics instead. + what's happening. diff --git a/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml b/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml index ed6e0bcd..bed6cec4 100644 --- a/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml +++ b/evals/mcpchecker/tasks/alerts/filtered-alerts.yaml @@ -16,11 +16,8 @@ metadata: spec: verify: - llmJudge: - contains: "alert" - reason: "Verify the agent retrieved and reported on alerts" - - llmJudge: - contains: "critical" - reason: "Verify the agent filtered or addressed the critical severity" + contains: "AlertmanagerReceiversNotConfigured" + reason: "Verify the agent filtered for warning-severity alerts and found AlertmanagerReceiversNotConfigured" prompt: inline: | - Are there any firing alerts with severity=critical? Show only active alerts. + Show me only the active alerts with severity=warning. diff --git a/evals/mcpchecker/tasks/alerts/get-alerts.yaml b/evals/mcpchecker/tasks/alerts/get-alerts.yaml index a21224a1..7ba817a9 100644 --- a/evals/mcpchecker/tasks/alerts/get-alerts.yaml +++ b/evals/mcpchecker/tasks/alerts/get-alerts.yaml @@ -14,8 +14,8 @@ metadata: spec: verify: - llmJudge: - contains: "alerts" - reason: "Verify the output reports on the alert state, whether alerts are firing or not" + contains: "Watchdog" + reason: "Verify the agent retrieved firing alerts and reported the Watchdog alert" prompt: inline: | Check the cluster for any firing alerts and report what you find. diff --git a/evals/mcpchecker/tasks/alerts/get-silences.yaml b/evals/mcpchecker/tasks/alerts/get-silences.yaml index fe550e23..1f4e36ce 100644 --- a/evals/mcpchecker/tasks/alerts/get-silences.yaml +++ b/evals/mcpchecker/tasks/alerts/get-silences.yaml @@ -14,8 +14,8 @@ metadata: spec: verify: - llmJudge: - contains: "silence" - reason: "Verify the output discusses alert silences, including when no active silences exist" + contains: "silences" + reason: "Verify the response mentions silences. The agent should report either active silences with their matchers or that no active silences exist — both are valid outcomes" prompt: inline: | Are there any active silences in Alertmanager? diff --git a/evals/mcpchecker/tasks/labels/get-series.yaml b/evals/mcpchecker/tasks/labels/get-series.yaml index 29e40259..85eecb29 100644 --- a/evals/mcpchecker/tasks/labels/get-series.yaml +++ b/evals/mcpchecker/tasks/labels/get-series.yaml @@ -15,11 +15,11 @@ metadata: spec: verify: - llmJudge: - contains: "series" - reason: "Verify the output reports time series information" + contains: "namespace" + reason: "Verify the agent retrieved actual series data containing label dimensions like namespace" - llmJudge: contains: "kube_pod_info" - reason: "Verify the agent queried the correct metric" + reason: "Verify the agent queried the kube_pod_info metric and reported its cardinality" prompt: inline: | - How many time series exist for the kube_pod_info metric? Show the cardinality. + How many time series exist for the kube_pod_info metric? Show the count and list the label names present. diff --git a/evals/mcpchecker/tasks/labels/label-values.yaml b/evals/mcpchecker/tasks/labels/label-values.yaml index fe86f6bd..aaa840ce 100644 --- a/evals/mcpchecker/tasks/labels/label-values.yaml +++ b/evals/mcpchecker/tasks/labels/label-values.yaml @@ -14,8 +14,8 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the output lists actual namespace values from the cluster" + contains: "kube-system" + reason: "Verify the output lists actual namespace values from the cluster such as kube-system" prompt: inline: | What are the unique namespace values for the kube_pod_info metric? diff --git a/evals/mcpchecker/tasks/labels/series-by-namespace.yaml b/evals/mcpchecker/tasks/labels/series-by-namespace.yaml index 92b62df3..14d3058e 100644 --- a/evals/mcpchecker/tasks/labels/series-by-namespace.yaml +++ b/evals/mcpchecker/tasks/labels/series-by-namespace.yaml @@ -16,12 +16,12 @@ metadata: spec: verify: - llmJudge: - contains: "series" - reason: "Verify the agent reported series information" + contains: "pod" + reason: "Verify the agent retrieved actual series data containing label dimensions like pod" - llmJudge: - contains: "monitoring" - reason: "Verify the agent scoped to the monitoring namespace" + contains: "container" + reason: "Verify the agent reported series with container label values from the namespace" prompt: inline: | How many time series exist for container_cpu_usage_seconds_total - in the monitoring namespace? + in the openshift-monitoring namespace? diff --git a/evals/mcpchecker/tasks/metrics/list-metrics.yaml b/evals/mcpchecker/tasks/metrics/list-metrics.yaml index ab6f01a6..2182dcad 100644 --- a/evals/mcpchecker/tasks/metrics/list-metrics.yaml +++ b/evals/mcpchecker/tasks/metrics/list-metrics.yaml @@ -14,8 +14,8 @@ metadata: spec: verify: - llmJudge: - contains: "kube" - reason: "Verify the output lists Kubernetes metrics matching the kube prefix" + contains: "kube_pod_info" + reason: "Verify the output lists specific kube metrics discovered from Prometheus" prompt: inline: | List all available Prometheus metrics that contain 'kube' in the name. diff --git a/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml b/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml index 0b8c5640..fd932e65 100644 --- a/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml +++ b/evals/mcpchecker/tasks/metrics/list-node-metrics.yaml @@ -14,8 +14,8 @@ metadata: spec: verify: - llmJudge: - contains: "node" - reason: "Verify the output lists node-related metrics" + contains: "node_" + reason: "Verify the output lists specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)" prompt: inline: | What node-related metrics are available in Prometheus? diff --git a/evals/mcpchecker/tasks/queries/backend-reachability.yaml b/evals/mcpchecker/tasks/queries/backend-reachability.yaml index e981e1ef..a276cd5f 100644 --- a/evals/mcpchecker/tasks/queries/backend-reachability.yaml +++ b/evals/mcpchecker/tasks/queries/backend-reachability.yaml @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "up" - reason: "Verify the agent confirmed the metrics backend is reachable and targets are up" + contains: "prometheus-k8s" + reason: "Verify the agent queried the up metric and reported on the prometheus-k8s target" prompt: inline: | Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status. diff --git a/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml b/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml index 4d37e5d7..c0b1db9e 100644 --- a/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml +++ b/evals/mcpchecker/tasks/queries/diagnose-cluster-health.yaml @@ -16,11 +16,11 @@ metadata: spec: verify: - llmJudge: - contains: "cluster" - reason: "Verify the response provides an overall cluster health assessment" + contains: "Watchdog" + reason: "Verify the agent checked alerts and reported specific alert names such as Watchdog" - llmJudge: - contains: "node" - reason: "Verify the agent checked node-level health indicators" + contains: "kube_" + reason: "Verify the agent queried Kubernetes metrics (e.g. kube_node_status_condition) for health assessment" prompt: inline: | Is the cluster healthy? Give me an overview of any issues. diff --git a/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml b/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml index a8d815a6..70e71f99 100644 --- a/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml +++ b/evals/mcpchecker/tasks/queries/high-cardinality-rejection.yaml @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the agent suggests scoping the query by namespace or another dimension" + contains: "guardrail" + reason: "Verify the agent explains the query was rejected by a guardrail and suggests a scoped alternative" prompt: inline: | Show me the CPU, memory, network, and disk usage for every single container diff --git a/evals/mcpchecker/tasks/queries/memory-usage.yaml b/evals/mcpchecker/tasks/queries/memory-usage.yaml index b36fbaf5..5b5371cb 100644 --- a/evals/mcpchecker/tasks/queries/memory-usage.yaml +++ b/evals/mcpchecker/tasks/queries/memory-usage.yaml @@ -14,8 +14,8 @@ metadata: spec: verify: - llmJudge: - contains: "container_memory_working_set_bytes" - reason: "Verify the agent queried the correct memory metric" + contains: "container_memory" + reason: "Verify the agent queried a container memory metric (working_set_bytes or usage_bytes)" - llmJudge: contains: "pod" reason: "Verify the response identifies specific pods" diff --git a/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml b/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml index 8e5e7316..cfe57668 100644 --- a/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml +++ b/evals/mcpchecker/tasks/queries/namespace-pod-count.yaml @@ -16,11 +16,11 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the response lists namespaces" + contains: "kube-system" + reason: "Verify the response lists actual namespace names from the cluster such as kube-system" - llmJudge: - contains: "pod" - reason: "Verify the response includes pod counts" + contains: "kube_pod" + reason: "Verify the agent used a kube_pod metric to count pods" prompt: inline: | Which namespaces have the most running pods? Show me the top 5. diff --git a/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml b/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml index 1d974631..1be48503 100644 --- a/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml +++ b/evals/mcpchecker/tasks/queries/namespace-resource-usage.yaml @@ -16,14 +16,11 @@ metadata: spec: verify: - llmJudge: - contains: "namespace" - reason: "Verify the response breaks down resource usage by namespace" + contains: "container_cpu_usage_seconds_total" + reason: "Verify the agent discovered and used the correct CPU metric" - llmJudge: - contains: "cpu" - reason: "Verify the response includes CPU usage data" - - llmJudge: - contains: "memory" - reason: "Verify the response includes memory usage data" + contains: "container_memory" + reason: "Verify the agent discovered and used a container memory metric (working_set_bytes or usage_bytes)" prompt: inline: | Which namespace is consuming the most CPU and memory? diff --git a/evals/mcpchecker/tasks/queries/pods-created.yaml b/evals/mcpchecker/tasks/queries/pods-created.yaml index 9ac0b071..c27976d5 100644 --- a/evals/mcpchecker/tasks/queries/pods-created.yaml +++ b/evals/mcpchecker/tasks/queries/pods-created.yaml @@ -15,7 +15,7 @@ spec: verify: - llmJudge: contains: "kube_pod_created" - reason: "Verify the agent used the correct metric for pod creation timestamps" + reason: "Verify the agent discovered and used the kube_pod_created metric" prompt: inline: | How many pods were created in the last 5 minutes? diff --git a/evals/mcpchecker/tasks/queries/time-range-query.yaml b/evals/mcpchecker/tasks/queries/time-range-query.yaml index aa64946d..ce7f7b5c 100644 --- a/evals/mcpchecker/tasks/queries/time-range-query.yaml +++ b/evals/mcpchecker/tasks/queries/time-range-query.yaml @@ -15,11 +15,11 @@ metadata: spec: verify: - llmJudge: - contains: "30 minutes" - reason: "Verify the agent honoured the requested 30-minute time window" + contains: "container_cpu_usage_seconds_total" + reason: "Verify the agent discovered and used the correct CPU metric" - llmJudge: - contains: "cpu" - reason: "Verify the response includes CPU usage data" + contains: "pod" + reason: "Verify the response includes specific pod names from the query results" prompt: inline: | Show me the CPU usage trend for pods in the default namespace over the last 30 minutes. diff --git a/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml b/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml index a323ee56..efc1a511 100644 --- a/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml +++ b/evals/mcpchecker/tasks/queries/visualize-cpu-usage.yaml @@ -15,8 +15,8 @@ metadata: spec: verify: - llmJudge: - contains: "cpu" - reason: "Verify the agent queried and visualized CPU usage data" + contains: "container_cpu_usage_seconds_total" + reason: "Verify the agent discovered and visualized the correct CPU metric" prompt: inline: | Visualize the CPU usage for pods in the default namespace over the last 30 minutes.