diff --git a/evals/claude-code/eval.yaml b/evals/claude-code/eval.yaml
index 4a46b8131..bbf5c9e0d 100644
--- a/evals/claude-code/eval.yaml
+++ b/evals/claude-code/eval.yaml
@@ -72,3 +72,13 @@ config:
           toolPattern: ".*"
           minToolCalls: 1
           maxToolCalls: 20
+  # Observability tasks
+  - glob: ../tasks/observability/*/*.yaml
+    labelSelector:
+      suite: observability
+    assertions:
+      toolsUsed:
+        - server: kubernetes
+          toolPattern: ".*"
+          minToolCalls: 1
+          maxToolCalls: 20
diff --git a/evals/gemini-agent/eval.yaml b/evals/gemini-agent/eval.yaml
index d7ee6eb82..3a1a66132 100644
--- a/evals/gemini-agent/eval.yaml
+++ b/evals/gemini-agent/eval.yaml
@@ -17,3 +17,13 @@ config:
       toolsUsed:
         - server: kubernetes
          toolPattern: ".*"
+  # Observability tasks
+  - glob: ../tasks/observability/*/*.yaml
+    labelSelector:
+      suite: observability
+    assertions:
+      toolsUsed:
+        - server: kubernetes
+          toolPattern: ".*"
+          minToolCalls: 1
+          maxToolCalls: 20
diff --git a/evals/openai-agent/eval.yaml b/evals/openai-agent/eval.yaml
index d3a71773f..1dedd4787 100644
--- a/evals/openai-agent/eval.yaml
+++ b/evals/openai-agent/eval.yaml
@@ -72,3 +72,13 @@ config:
           toolPattern: ".*"
           minToolCalls: 1
           maxToolCalls: 20
+  # Observability tasks
+  - glob: ../tasks/observability/*/*.yaml
+    labelSelector:
+      suite: observability
+    assertions:
+      toolsUsed:
+        - server: kubernetes
+          toolPattern: ".*"
+          minToolCalls: 1
+          maxToolCalls: 20
diff --git a/evals/tasks/observability/alerts/alert-investigation.yaml b/evals/tasks/observability/alerts/alert-investigation.yaml
new file mode 100644
index 000000000..f0f35ce2d
--- /dev/null
+++ b/evals/tasks/observability/alerts/alert-investigation.yaml
@@ -0,0 +1,28 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "alert-investigation"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: alerts
+    suite: observability
+    toolType: multi-step
+  description: |
+    Tests if the agent can perform multi-step alert triage: first retrieving
+    alerts from Alertmanager, then investigating related metrics for a
+    firing alert using queries.
+spec:
+  verify:
+    - llmJudge:
+        contains: "AlertmanagerReceiversNotConfigured"
+        reason: "Verify the agent retrieved firing alerts and identified AlertmanagerReceiversNotConfigured"
+    - llmJudge:
+        contains: "alertmanager"
+        reason: "Verify the agent investigated Alertmanager-related metrics for the alert"
+  prompt:
+    inline: |
+      Check if there are any firing alerts. If there are, investigate
+      the related metrics for the most critical alert and summarize
+      what's happening.
diff --git a/evals/tasks/observability/alerts/filtered-alerts.yaml b/evals/tasks/observability/alerts/filtered-alerts.yaml
new file mode 100644
index 000000000..6f9aaf91a
--- /dev/null
+++ b/evals/tasks/observability/alerts/filtered-alerts.yaml
@@ -0,0 +1,24 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "filtered-alerts"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: alerts
+    suite: observability
+    toolType: alertmanager
+  description: |
+    Tests if the agent uses the get_alerts tool with filter parameters
+    to retrieve only active alerts matching a specific severity. The agent
+    should pass appropriate filter arguments rather than fetching all alerts
+    and filtering client-side.
+spec:
+  verify:
+    - llmJudge:
+        contains: "AlertmanagerReceiversNotConfigured"
+        reason: "Verify the agent filtered for warning-severity alerts and found AlertmanagerReceiversNotConfigured"
+  prompt:
+    inline: |
+      Show me only the active alerts with severity=warning.
diff --git a/evals/tasks/observability/alerts/get-alerts.yaml b/evals/tasks/observability/alerts/get-alerts.yaml
new file mode 100644
index 000000000..49dbd5f92
--- /dev/null
+++ b/evals/tasks/observability/alerts/get-alerts.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "get-alerts"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: alerts
+    suite: observability
+    toolType: alertmanager
+  description: |
+    Tests if the agent can discover and use the get_alerts tool to retrieve
+    currently firing alerts from Alertmanager.
+spec:
+  verify:
+    - llmJudge:
+        contains: "Watchdog"
+        reason: "Verify the agent retrieved firing alerts and reported the Watchdog alert"
+  prompt:
+    inline: |
+      Check the cluster for any firing alerts and report what you find.
diff --git a/evals/tasks/observability/alerts/get-silences.yaml b/evals/tasks/observability/alerts/get-silences.yaml
new file mode 100644
index 000000000..f986c7851
--- /dev/null
+++ b/evals/tasks/observability/alerts/get-silences.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "get-silences"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: alerts
+    suite: observability
+    toolType: alertmanager
+  description: |
+    Tests if the agent can discover and use the get_silences tool to retrieve
+    active silences from Alertmanager.
+spec:
+  verify:
+    - llmJudge:
+        contains: "silences"
+        reason: "Verify the response mentions silences. The agent should report either active silences with their matchers or that no active silences exist — both are valid outcomes"
+  prompt:
+    inline: |
+      Are there any active silences in Alertmanager?
diff --git a/evals/tasks/observability/labels/get-series.yaml b/evals/tasks/observability/labels/get-series.yaml
new file mode 100644
index 000000000..95645079a
--- /dev/null
+++ b/evals/tasks/observability/labels/get-series.yaml
@@ -0,0 +1,26 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "get-series-cardinality"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: labels
+    suite: observability
+    toolType: exploration
+  description: |
+    Tests if the agent can use the get_series tool to check cardinality for a metric.
+    The agent should first verify the metric exists via list_metrics, then use
+    get_series to retrieve matching time series and report the count.
+spec:
+  verify:
+    - llmJudge:
+        contains: "namespace"
+        reason: "Verify the agent retrieved actual series data containing label dimensions like namespace"
+    - llmJudge:
+        contains: "kube_pod_info"
+        reason: "Verify the agent queried the kube_pod_info metric and reported its cardinality"
+  prompt:
+    inline: |
+      How many time series exist for the kube_pod_info metric? Show the count and list the label names present.
diff --git a/evals/tasks/observability/labels/label-names.yaml b/evals/tasks/observability/labels/label-names.yaml
new file mode 100644
index 000000000..adf413963
--- /dev/null
+++ b/evals/tasks/observability/labels/label-names.yaml
@@ -0,0 +1,26 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "label-names"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: labels
+    suite: observability
+    toolType: exploration
+  description: |
+    Tests if the agent follows the correct workflow: first calling list_metrics to
+    verify kube_pod_info exists, then calling get_label_names to discover available
+    labels for that metric.
+spec:
+  verify:
+    - llmJudge:
+        contains: "namespace"
+        reason: "Verify the output includes the namespace label which is a standard Kubernetes label"
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the output includes the pod label"
+  prompt:
+    inline: |
+      What labels are available for the kube_pod_info metric?
diff --git a/evals/tasks/observability/labels/label-values.yaml b/evals/tasks/observability/labels/label-values.yaml
new file mode 100644
index 000000000..d78e5b39f
--- /dev/null
+++ b/evals/tasks/observability/labels/label-values.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "label-values"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: labels
+    suite: observability
+    toolType: exploration
+  description: |
+    Tests the full discovery workflow: list_metrics to verify the metric, then
+    get_label_values to retrieve unique namespace values for kube_pod_info.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube-system"
+        reason: "Verify the output lists actual namespace values from the cluster such as kube-system"
+  prompt:
+    inline: |
+      What are the unique namespace values for the kube_pod_info metric?
diff --git a/evals/tasks/observability/labels/series-by-namespace.yaml b/evals/tasks/observability/labels/series-by-namespace.yaml
new file mode 100644
index 000000000..23d78b58e
--- /dev/null
+++ b/evals/tasks/observability/labels/series-by-namespace.yaml
@@ -0,0 +1,28 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "series-by-namespace"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: labels
+    suite: observability
+    toolType: exploration
+  description: |
+    Tests if the agent can use the get_series tool with a label selector
+    to find time series scoped to a specific namespace. The agent should
+    first verify the metric exists, then use get_series with a namespace
+    matcher to report the cardinality within that scope.
+spec:
+  verify:
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the agent retrieved actual series data containing label dimensions like pod"
+    - llmJudge:
+        contains: "container"
+        reason: "Verify the agent reported series with container label values from the namespace"
+  prompt:
+    inline: |
+      How many time series exist for container_cpu_usage_seconds_total
+      in the openshift-monitoring namespace?
diff --git a/evals/tasks/observability/metrics/list-metrics.yaml b/evals/tasks/observability/metrics/list-metrics.yaml
new file mode 100644
index 000000000..1f5adc71a
--- /dev/null
+++ b/evals/tasks/observability/metrics/list-metrics.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "list-kube-metrics"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: metrics
+    suite: observability
+    toolType: discovery
+  description: |
+    Tests if the agent can discover the list_metrics tool to find Kubernetes-related
+    metrics. The agent should use the name_regex parameter to filter for kube metrics.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube_pod_info"
+        reason: "Verify the output lists specific kube metrics discovered from Prometheus"
+  prompt:
+    inline: |
+      List all available Prometheus metrics that contain 'kube' in the name.
diff --git a/evals/tasks/observability/metrics/list-node-metrics.yaml b/evals/tasks/observability/metrics/list-node-metrics.yaml
new file mode 100644
index 000000000..f496460aa
--- /dev/null
+++ b/evals/tasks/observability/metrics/list-node-metrics.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "list-node-metrics"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: metrics
+    suite: observability
+    toolType: discovery
+  description: |
+    Tests if the agent can discover node-related metrics using the list_metrics tool
+    with a regex filter for node metrics.
+spec:
+  verify:
+    - llmJudge:
+        contains: "node_"
+        reason: "Verify the output lists specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)"
+  prompt:
+    inline: |
+      What node-related metrics are available in Prometheus?
diff --git a/evals/tasks/observability/queries/backend-reachability.yaml b/evals/tasks/observability/queries/backend-reachability.yaml
new file mode 100644
index 000000000..5a027a5a9
--- /dev/null
+++ b/evals/tasks/observability/queries/backend-reachability.yaml
@@ -0,0 +1,23 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "backend-reachability"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: smoke-test
+  description: |
+    Smoke test to verify the agent can reach the metrics backend and get a
+    valid response. The agent should query the up metric and confirm the
+    backend is reachable before any complex eval tasks run.
+spec:
+  verify:
+    - llmJudge:
+        contains: "prometheus-k8s"
+        reason: "Verify the agent queried the up metric and reported on the prometheus-k8s target"
+  prompt:
+    inline: |
+      Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status.
diff --git a/evals/tasks/observability/queries/cpu-usage.yaml b/evals/tasks/observability/queries/cpu-usage.yaml
new file mode 100644
index 000000000..3baddf6cf
--- /dev/null
+++ b/evals/tasks/observability/queries/cpu-usage.yaml
@@ -0,0 +1,25 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "cpu-usage"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can find the appropriate CPU usage metric and query it to
+    determine which pods are consuming the most CPU resources.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_cpu_usage_seconds_total"
+        reason: "Verify the agent queried the correct CPU metric"
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the response identifies specific pods"
+  prompt:
+    inline: |
+      Which pods are using the most CPU?
diff --git a/evals/tasks/observability/queries/crashlooping-pods.yaml b/evals/tasks/observability/queries/crashlooping-pods.yaml
new file mode 100644
index 000000000..4e975964e
--- /dev/null
+++ b/evals/tasks/observability/queries/crashlooping-pods.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "crashlooping-pods"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: range-query
+  description: |
+    Tests if the agent can identify crashlooping pods by discovering and querying
+    the kube_pod_container_status_restarts_total metric over a time range.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube_pod_container_status_restarts_total"
+        reason: "Verify the agent used the correct metric for container restart counts"
+  prompt:
+    inline: |
+      Which pods were crashlooping in the last 5 minutes?
diff --git a/evals/tasks/observability/queries/diagnose-cluster-health.yaml b/evals/tasks/observability/queries/diagnose-cluster-health.yaml
new file mode 100644
index 000000000..7542b61e5
--- /dev/null
+++ b/evals/tasks/observability/queries/diagnose-cluster-health.yaml
@@ -0,0 +1,28 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "diagnose-cluster-health"
+  difficulty: hard
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: diagnostic
+  description: |
+    Tests the agent's ability to handle an ambiguous diagnostic prompt.
+    The agent must autonomously decide which tools and metrics to check,
+    ideally covering alerts, node status, and pod health without being
+    told exactly what to look for.
+spec:
+  verify:
+    - llmJudge:
+        contains: "Watchdog"
+        reason: "Verify the agent checked alerts and reported specific alert names such as Watchdog"
+    - llmJudge:
+        contains: "kube_"
+        reason: "Verify the agent queried Kubernetes metrics (e.g. kube_node_status_condition) for health assessment"
+  prompt:
+    inline: |
+      Is the cluster healthy? Give me an overview of any issues.
+      Check alerts, node status, and pod health.
diff --git a/evals/tasks/observability/queries/high-cardinality-rejection.yaml b/evals/tasks/observability/queries/high-cardinality-rejection.yaml
new file mode 100644
index 000000000..dbb5b3bba
--- /dev/null
+++ b/evals/tasks/observability/queries/high-cardinality-rejection.yaml
@@ -0,0 +1,24 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "high-cardinality-rejection"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: error-handling
+  description: |
+    Tests agent behavior when a query is rejected by obs-mcp guardrails
+    due to high cardinality. The agent should explain the guardrail
+    rejection and suggest a scoped alternative rather than giving up.
+spec:
+  verify:
+    - llmJudge:
+        contains: "guardrail"
+        reason: "Verify the agent explains the query was rejected by a guardrail and suggests a scoped alternative"
+  prompt:
+    inline: |
+      Show me the CPU, memory, network, and disk usage for every single container
+      across all namespaces over the last 24 hours with 1-second resolution.
diff --git a/evals/tasks/observability/queries/memory-usage.yaml b/evals/tasks/observability/queries/memory-usage.yaml
new file mode 100644
index 000000000..58da3fce8
--- /dev/null
+++ b/evals/tasks/observability/queries/memory-usage.yaml
@@ -0,0 +1,25 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "memory-usage"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can find the appropriate memory usage metric and query it
+    to determine which pods are consuming the most memory resources.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_memory"
+        reason: "Verify the agent queried a container memory metric (working_set_bytes or usage_bytes)"
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the response identifies specific pods"
+  prompt:
+    inline: |
+      Which pods are using the most memory? Show me the top 5.
diff --git a/evals/tasks/observability/queries/namespace-pod-count.yaml b/evals/tasks/observability/queries/namespace-pod-count.yaml
new file mode 100644
index 000000000..b180a917d
--- /dev/null
+++ b/evals/tasks/observability/queries/namespace-pod-count.yaml
@@ -0,0 +1,27 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "namespace-pod-count"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: multi-step
+  description: |
+    Tests multi-step reasoning: the agent must discover a suitable metric
+    via list_metrics, explore label values to find namespaces, then query
+    to determine which namespaces have the most pods. Requires chaining
+    discovery, label exploration, and a query.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube-system"
+        reason: "Verify the response lists actual namespace names from the cluster such as kube-system"
+    - llmJudge:
+        contains: "kube_pod"
+        reason: "Verify the agent used a kube_pod metric to count pods"
+  prompt:
+    inline: |
+      Which namespaces have the most running pods? Show me the top 5.
diff --git a/evals/tasks/observability/queries/namespace-resource-usage.yaml b/evals/tasks/observability/queries/namespace-resource-usage.yaml
new file mode 100644
index 000000000..4434ca52c
--- /dev/null
+++ b/evals/tasks/observability/queries/namespace-resource-usage.yaml
@@ -0,0 +1,28 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "namespace-resource-usage"
+  difficulty: hard
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: multi-step
+  description: |
+    Tests multi-step reasoning: the agent must discover CPU and memory metrics,
+    then query them with namespace-level aggregation to identify the top
+    resource-consuming namespaces. Requires chaining list_metrics, label
+    exploration, and multiple instant queries.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_cpu_usage_seconds_total"
+        reason: "Verify the agent discovered and used the correct CPU metric"
+    - llmJudge:
+        contains: "container_memory"
+        reason: "Verify the agent discovered and used a container memory metric (working_set_bytes or usage_bytes)"
+  prompt:
+    inline: |
+      Which namespace is consuming the most CPU and memory?
+      Show me the top namespace for each.
diff --git a/evals/tasks/observability/queries/network-traffic.yaml b/evals/tasks/observability/queries/network-traffic.yaml
new file mode 100644
index 000000000..9b1e06fd3
--- /dev/null
+++ b/evals/tasks/observability/queries/network-traffic.yaml
@@ -0,0 +1,25 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "network-traffic"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can discover network-related metrics and query them to find
+    which pods are receiving the most network traffic.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_network_receive_bytes_total"
+        reason: "Verify the agent queried the correct network metric"
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the response identifies specific pods"
+  prompt:
+    inline: |
+      Which pods are receiving the most network traffic?
diff --git a/evals/tasks/observability/queries/nonexistent-metric.yaml b/evals/tasks/observability/queries/nonexistent-metric.yaml
new file mode 100644
index 000000000..ed76ee25a
--- /dev/null
+++ b/evals/tasks/observability/queries/nonexistent-metric.yaml
@@ -0,0 +1,23 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "nonexistent-metric"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: error-handling
+  description: |
+    Tests agent recovery when querying a metric that does not exist.
+    The agent should discover that the metric is missing via list_metrics
+    and inform the user rather than fabricating results.
+spec:
+  verify:
+    - llmJudge:
+        contains: "not found"
+        reason: "Verify the agent communicates that the metric does not exist or was not found"
+  prompt:
+    inline: |
+      What is the current value of the metric fake_nonexistent_metric_total?
diff --git a/evals/tasks/observability/queries/nonexistent-namespace.yaml b/evals/tasks/observability/queries/nonexistent-namespace.yaml
new file mode 100644
index 000000000..78090c0c1
--- /dev/null
+++ b/evals/tasks/observability/queries/nonexistent-namespace.yaml
@@ -0,0 +1,23 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "nonexistent-namespace"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: error-handling
+  description: |
+    Tests agent behavior when querying for resources in a namespace that
+    does not exist. The agent should query and report empty results
+    gracefully rather than hallucinating data.
+spec:
+  verify:
+    - llmJudge:
+        contains: "no data"
+        reason: "Verify the agent reports no data, no results, or no pods found in the nonexistent namespace"
+  prompt:
+    inline: |
+      Show me the memory usage for all pods in the namespace called totally-fake-namespace-12345.
diff --git a/evals/tasks/observability/queries/pending-pods.yaml b/evals/tasks/observability/queries/pending-pods.yaml
new file mode 100644
index 000000000..f5b47e387
--- /dev/null
+++ b/evals/tasks/observability/queries/pending-pods.yaml
@@ -0,0 +1,23 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "pending-pods"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can identify pods stuck in pending state by first discovering
+    the kube_pod_status_phase metric and then running an instant query to find
+    pods with phase=Pending.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube_pod_status_phase"
+        reason: "Verify the agent used the correct metric for pod phase status"
+  prompt:
+    inline: |
+      Which pods are stuck in pending state?
diff --git a/evals/tasks/observability/queries/pods-created.yaml b/evals/tasks/observability/queries/pods-created.yaml
new file mode 100644
index 000000000..097a57e24
--- /dev/null
+++ b/evals/tasks/observability/queries/pods-created.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "pods-created"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: range-query
+  description: |
+    Tests if the agent can use a range query to find recently created pods by
+    discovering the kube_pod_created metric and querying it over a 5-minute window.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube_pod_created"
+        reason: "Verify the agent discovered and used the kube_pod_created metric"
+  prompt:
+    inline: |
+      How many pods were created in the last 5 minutes?
diff --git a/evals/tasks/observability/queries/prometheus-head-series.yaml b/evals/tasks/observability/queries/prometheus-head-series.yaml
new file mode 100644
index 000000000..9f62ffdf3
--- /dev/null
+++ b/evals/tasks/observability/queries/prometheus-head-series.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "prometheus-head-series"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can query Prometheus internal metrics to report the current
+    number of head series using prometheus_tsdb_head_series.
+spec:
+  verify:
+    - llmJudge:
+        contains: "prometheus_tsdb_head_series"
+        reason: "Verify the agent used the correct Prometheus TSDB metric"
+  prompt:
+    inline: |
+      How many head series does Prometheus have?
diff --git a/evals/tasks/observability/queries/prometheus-requests.yaml b/evals/tasks/observability/queries/prometheus-requests.yaml
new file mode 100644
index 000000000..f436130b4
--- /dev/null
+++ b/evals/tasks/observability/queries/prometheus-requests.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "prometheus-requests"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can calculate the request rate to Prometheus by discovering
+    and querying the prometheus_http_requests_total metric.
+spec:
+  verify:
+    - llmJudge:
+        contains: "prometheus_http_requests_total"
+        reason: "Verify the agent used the correct HTTP requests metric"
+  prompt:
+    inline: |
+      How many requests per second are being made to Prometheus?
diff --git a/evals/tasks/observability/queries/prometheus-wal-size.yaml b/evals/tasks/observability/queries/prometheus-wal-size.yaml
new file mode 100644
index 000000000..aa19ea451
--- /dev/null
+++ b/evals/tasks/observability/queries/prometheus-wal-size.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "prometheus-wal-size"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can query the current Prometheus WAL storage size using
+    the prometheus_tsdb_wal_storage_size_bytes metric.
+spec:
+  verify:
+    - llmJudge:
+        contains: "prometheus_tsdb_wal_storage_size_bytes"
+        reason: "Verify the agent used the correct WAL storage metric"
+  prompt:
+    inline: |
+      What is the current storage size of the Prometheus WAL?
diff --git a/evals/tasks/observability/queries/time-range-query.yaml b/evals/tasks/observability/queries/time-range-query.yaml
new file mode 100644
index 000000000..80a144be2
--- /dev/null
+++ b/evals/tasks/observability/queries/time-range-query.yaml
@@ -0,0 +1,26 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "time-range-query"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: multi-step
+  description: |
+    Tests whether the agent correctly uses execute_range_query with
+    appropriate start/end/step parameters when asked for data over
+    a specific time window.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_cpu_usage_seconds_total"
+        reason: "Verify the agent discovered and used the correct CPU metric"
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the response includes specific pod names from the query results"
+  prompt:
+    inline: |
+      Show me the CPU usage trend for pods in the default namespace over the last 30 minutes.
diff --git a/evals/tasks/observability/queries/visualize-cpu-usage.yaml b/evals/tasks/observability/queries/visualize-cpu-usage.yaml
new file mode 100644
index 000000000..74d88ac85
--- /dev/null
+++ b/evals/tasks/observability/queries/visualize-cpu-usage.yaml
@@ -0,0 +1,23 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "visualize-cpu-usage"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: visualization
+  description: |
+    Tests if the agent uses the show_timeseries tool to visualize CPU usage
+    as a chart. The agent should discover the metric, then use show_timeseries
+    to render a time-series visualization.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_cpu_usage_seconds_total"
+        reason: "Verify the agent discovered and visualized the correct CPU metric"
+  prompt:
+    inline: |
+      Visualize the CPU usage for pods in the default namespace over the last 30 minutes.