diff --git a/evals/claude-code/eval.yaml b/evals/claude-code/eval.yaml
index 4a46b8131..bbf5c9e0d 100644
--- a/evals/claude-code/eval.yaml
+++ b/evals/claude-code/eval.yaml
@@ -72,3 +72,13 @@ config:
           toolPattern: ".*"
           minToolCalls: 1
           maxToolCalls: 20
+  # Observability tasks
+  - glob: ../tasks/observability/*/*.yaml
+    labelSelector:
+      suite: observability
+    assertions:
+      toolsUsed:
+        - server: kubernetes
+          toolPattern: ".*"
+          minToolCalls: 1
+          maxToolCalls: 20
diff --git a/evals/gemini-agent/eval.yaml b/evals/gemini-agent/eval.yaml
index d7ee6eb82..3a1a66132 100644
--- a/evals/gemini-agent/eval.yaml
+++ b/evals/gemini-agent/eval.yaml
@@ -17,3 +17,13 @@ config:
       toolsUsed:
         - server: kubernetes
          toolPattern: ".*"
+  # Observability tasks
+  - glob: ../tasks/observability/*/*.yaml
+    labelSelector:
+      suite: observability
+    assertions:
+      toolsUsed:
+        - server: kubernetes
+          toolPattern: ".*"
+          minToolCalls: 1
+          maxToolCalls: 20
diff --git a/evals/openai-agent/eval.yaml b/evals/openai-agent/eval.yaml
index d3a71773f..1dedd4787 100644
--- a/evals/openai-agent/eval.yaml
+++ b/evals/openai-agent/eval.yaml
@@ -72,3 +72,13 @@ config:
           toolPattern: ".*"
           minToolCalls: 1
           maxToolCalls: 20
+  # Observability tasks
+  - glob: ../tasks/observability/*/*.yaml
+    labelSelector:
+      suite: observability
+    assertions:
+      toolsUsed:
+        - server: kubernetes
+          toolPattern: ".*"
+          minToolCalls: 1
+          maxToolCalls: 20
diff --git a/evals/tasks/observability/alerts/alert-investigation.yaml b/evals/tasks/observability/alerts/alert-investigation.yaml
new file mode 100644
index 000000000..f0f35ce2d
--- /dev/null
+++ b/evals/tasks/observability/alerts/alert-investigation.yaml
@@ -0,0 +1,28 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "alert-investigation"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: alerts
+    suite: observability
+    toolType: multi-step
+  description: |
+    Tests if the agent can perform multi-step alert triage: first retrieving
+    alerts from Alertmanager, then investigating related metrics for a
+    firing alert using queries.
+spec:
+  verify:
+    - llmJudge:
+        contains: "AlertmanagerReceiversNotConfigured"
+        reason: "Verify the agent retrieved firing alerts and identified AlertmanagerReceiversNotConfigured"
+    - llmJudge:
+        contains: "alertmanager"
+        reason: "Verify the agent investigated Alertmanager-related metrics for the alert"
+  prompt:
+    inline: |
+      Check if there are any firing alerts. If there are, investigate
+      the related metrics for the most critical alert and summarize
+      what's happening.
diff --git a/evals/tasks/observability/alerts/filtered-alerts.yaml b/evals/tasks/observability/alerts/filtered-alerts.yaml
new file mode 100644
index 000000000..6f9aaf91a
--- /dev/null
+++ b/evals/tasks/observability/alerts/filtered-alerts.yaml
@@ -0,0 +1,24 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "filtered-alerts"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: alerts
+    suite: observability
+    toolType: alertmanager
+  description: |
+    Tests if the agent uses the get_alerts tool with filter parameters
+    to retrieve only active alerts matching a specific severity. The agent
+    should pass appropriate filter arguments rather than fetching all alerts
+    and filtering client-side.
+spec:
+  verify:
+    - llmJudge:
+        contains: "AlertmanagerReceiversNotConfigured"
+        reason: "Verify the agent filtered for warning-severity alerts and found AlertmanagerReceiversNotConfigured"
+  prompt:
+    inline: |
+      Show me only the active alerts with severity=warning.
diff --git a/evals/tasks/observability/alerts/get-alerts.yaml b/evals/tasks/observability/alerts/get-alerts.yaml
new file mode 100644
index 000000000..49dbd5f92
--- /dev/null
+++ b/evals/tasks/observability/alerts/get-alerts.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "get-alerts"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: alerts
+    suite: observability
+    toolType: alertmanager
+  description: |
+    Tests if the agent can discover and use the get_alerts tool to retrieve
+    currently firing alerts from Alertmanager.
+spec:
+  verify:
+    - llmJudge:
+        contains: "Watchdog"
+        reason: "Verify the agent retrieved firing alerts and reported the Watchdog alert"
+  prompt:
+    inline: |
+      Check the cluster for any firing alerts and report what you find.
diff --git a/evals/tasks/observability/alerts/get-silences.yaml b/evals/tasks/observability/alerts/get-silences.yaml
new file mode 100644
index 000000000..f986c7851
--- /dev/null
+++ b/evals/tasks/observability/alerts/get-silences.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "get-silences"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: alerts
+    suite: observability
+    toolType: alertmanager
+  description: |
+    Tests if the agent can discover and use the get_silences tool to retrieve
+    active silences from Alertmanager.
+spec:
+  verify:
+    - llmJudge:
+        contains: "silences"
+        reason: "Verify the response mentions silences. The agent should report either active silences with their matchers or that no active silences exist — both are valid outcomes"
+  prompt:
+    inline: |
+      Are there any active silences in Alertmanager?
diff --git a/evals/tasks/observability/labels/get-series.yaml b/evals/tasks/observability/labels/get-series.yaml
new file mode 100644
index 000000000..95645079a
--- /dev/null
+++ b/evals/tasks/observability/labels/get-series.yaml
@@ -0,0 +1,26 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "get-series-cardinality"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: labels
+    suite: observability
+    toolType: exploration
+  description: |
+    Tests if the agent can use the get_series tool to check cardinality for a metric.
+    The agent should first verify the metric exists via list_metrics, then use
+    get_series to retrieve matching time series and report the count.
+spec:
+  verify:
+    - llmJudge:
+        contains: "namespace"
+        reason: "Verify the agent retrieved actual series data containing label dimensions like namespace"
+    - llmJudge:
+        contains: "kube_pod_info"
+        reason: "Verify the agent queried the kube_pod_info metric and reported its cardinality"
+  prompt:
+    inline: |
+      How many time series exist for the kube_pod_info metric? Show the count and list the label names present.
diff --git a/evals/tasks/observability/labels/label-names.yaml b/evals/tasks/observability/labels/label-names.yaml
new file mode 100644
index 000000000..adf413963
--- /dev/null
+++ b/evals/tasks/observability/labels/label-names.yaml
@@ -0,0 +1,26 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "label-names"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: labels
+    suite: observability
+    toolType: exploration
+  description: |
+    Tests if the agent follows the correct workflow: first calling list_metrics to
+    verify kube_pod_info exists, then calling get_label_names to discover available
+    labels for that metric.
+spec:
+  verify:
+    - llmJudge:
+        contains: "namespace"
+        reason: "Verify the output includes the namespace label which is a standard Kubernetes label"
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the output includes the pod label"
+  prompt:
+    inline: |
+      What labels are available for the kube_pod_info metric?
diff --git a/evals/tasks/observability/labels/label-values.yaml b/evals/tasks/observability/labels/label-values.yaml
new file mode 100644
index 000000000..d78e5b39f
--- /dev/null
+++ b/evals/tasks/observability/labels/label-values.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "label-values"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: labels
+    suite: observability
+    toolType: exploration
+  description: |
+    Tests the full discovery workflow: list_metrics to verify the metric, then
+    get_label_values to retrieve unique namespace values for kube_pod_info.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube-system"
+        reason: "Verify the output lists actual namespace values from the cluster such as kube-system"
+  prompt:
+    inline: |
+      What are the unique namespace values for the kube_pod_info metric?
diff --git a/evals/tasks/observability/labels/series-by-namespace.yaml b/evals/tasks/observability/labels/series-by-namespace.yaml
new file mode 100644
index 000000000..23d78b58e
--- /dev/null
+++ b/evals/tasks/observability/labels/series-by-namespace.yaml
@@ -0,0 +1,28 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "series-by-namespace"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: labels
+    suite: observability
+    toolType: exploration
+  description: |
+    Tests if the agent can use the get_series tool with a label selector
+    to find time series scoped to a specific namespace. The agent should
+    first verify the metric exists, then use get_series with a namespace
+    matcher to report the cardinality within that scope.
+spec:
+  verify:
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the agent retrieved actual series data containing label dimensions like pod"
+    - llmJudge:
+        contains: "container"
+        reason: "Verify the agent reported series with container label values from the namespace"
+  prompt:
+    inline: |
+      How many time series exist for container_cpu_usage_seconds_total
+      in the openshift-monitoring namespace?
diff --git a/evals/tasks/observability/metrics/list-metrics.yaml b/evals/tasks/observability/metrics/list-metrics.yaml
new file mode 100644
index 000000000..1f5adc71a
--- /dev/null
+++ b/evals/tasks/observability/metrics/list-metrics.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "list-kube-metrics"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: metrics
+    suite: observability
+    toolType: discovery
+  description: |
+    Tests if the agent can discover the list_metrics tool to find Kubernetes-related
+    metrics. The agent should use the name_regex parameter to filter for kube metrics.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube_pod_info"
+        reason: "Verify the output lists specific kube metrics discovered from Prometheus"
+  prompt:
+    inline: |
+      List all available Prometheus metrics that contain 'kube' in the name.
diff --git a/evals/tasks/observability/metrics/list-node-metrics.yaml b/evals/tasks/observability/metrics/list-node-metrics.yaml
new file mode 100644
index 000000000..f496460aa
--- /dev/null
+++ b/evals/tasks/observability/metrics/list-node-metrics.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "list-node-metrics"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: metrics
+    suite: observability
+    toolType: discovery
+  description: |
+    Tests if the agent can discover node-related metrics using the list_metrics tool
+    with a regex filter for node metrics.
+spec:
+  verify:
+    - llmJudge:
+        contains: "node_"
+        reason: "Verify the output lists specific node metrics discovered from Prometheus (e.g. node_cpu_seconds_total)"
+  prompt:
+    inline: |
+      What node-related metrics are available in Prometheus?
diff --git a/evals/tasks/observability/queries/backend-reachability.yaml b/evals/tasks/observability/queries/backend-reachability.yaml
new file mode 100644
index 000000000..5a027a5a9
--- /dev/null
+++ b/evals/tasks/observability/queries/backend-reachability.yaml
@@ -0,0 +1,23 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "backend-reachability"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: smoke-test
+  description: |
+    Smoke test to verify the agent can reach the metrics backend and get a
+    valid response. The agent should query the up metric and confirm the
+    backend is reachable before any complex eval tasks run.
+spec:
+  verify:
+    - llmJudge:
+        contains: "prometheus-k8s"
+        reason: "Verify the agent queried the up metric and reported on the prometheus-k8s target"
+  prompt:
+    inline: |
+      Is the metrics backend reachable? Query up{job="prometheus-k8s"} and report the status.
diff --git a/evals/tasks/observability/queries/cpu-usage.yaml b/evals/tasks/observability/queries/cpu-usage.yaml
new file mode 100644
index 000000000..3baddf6cf
--- /dev/null
+++ b/evals/tasks/observability/queries/cpu-usage.yaml
@@ -0,0 +1,25 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "cpu-usage"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can find the appropriate CPU usage metric and query it to
+    determine which pods are consuming the most CPU resources.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_cpu_usage_seconds_total"
+        reason: "Verify the agent queried the correct CPU metric"
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the response identifies specific pods"
+  prompt:
+    inline: |
+      Which pods are using the most CPU?
diff --git a/evals/tasks/observability/queries/crashlooping-pods.yaml b/evals/tasks/observability/queries/crashlooping-pods.yaml
new file mode 100644
index 000000000..4e975964e
--- /dev/null
+++ b/evals/tasks/observability/queries/crashlooping-pods.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "crashlooping-pods"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: range-query
+  description: |
+    Tests if the agent can identify crashlooping pods by discovering and querying
+    the kube_pod_container_status_restarts_total metric over a time range.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube_pod_container_status_restarts_total"
+        reason: "Verify the agent used the correct metric for container restart counts"
+  prompt:
+    inline: |
+      Which pods were crashlooping in the last 5 minutes?
diff --git a/evals/tasks/observability/queries/diagnose-cluster-health.yaml b/evals/tasks/observability/queries/diagnose-cluster-health.yaml
new file mode 100644
index 000000000..7542b61e5
--- /dev/null
+++ b/evals/tasks/observability/queries/diagnose-cluster-health.yaml
@@ -0,0 +1,28 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "diagnose-cluster-health"
+  difficulty: hard
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: diagnostic
+  description: |
+    Tests the agent's ability to handle an ambiguous diagnostic prompt.
+    The agent must autonomously decide which tools and metrics to check,
+    ideally covering alerts, node status, and pod health without being
+    told exactly what to look for.
+spec:
+  verify:
+    - llmJudge:
+        contains: "Watchdog"
+        reason: "Verify the agent checked alerts and reported specific alert names such as Watchdog"
+    - llmJudge:
+        contains: "kube_"
+        reason: "Verify the agent queried Kubernetes metrics (e.g. kube_node_status_condition) for health assessment"
+  prompt:
+    inline: |
+      Is the cluster healthy? Give me an overview of any issues.
+      Check alerts, node status, and pod health.
diff --git a/evals/tasks/observability/queries/high-cardinality-rejection.yaml b/evals/tasks/observability/queries/high-cardinality-rejection.yaml
new file mode 100644
index 000000000..dbb5b3bba
--- /dev/null
+++ b/evals/tasks/observability/queries/high-cardinality-rejection.yaml
@@ -0,0 +1,24 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "high-cardinality-rejection"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: error-handling
+  description: |
+    Tests agent behavior when a query is rejected by obs-mcp guardrails
+    due to high cardinality. The agent should explain the guardrail
+    rejection and suggest a scoped alternative rather than giving up.
+spec:
+  verify:
+    - llmJudge:
+        contains: "guardrail"
+        reason: "Verify the agent explains the query was rejected by a guardrail and suggests a scoped alternative"
+  prompt:
+    inline: |
+      Show me the CPU, memory, network, and disk usage for every single container
+      across all namespaces over the last 24 hours with 1-second resolution.
diff --git a/evals/tasks/observability/queries/memory-usage.yaml b/evals/tasks/observability/queries/memory-usage.yaml
new file mode 100644
index 000000000..58da3fce8
--- /dev/null
+++ b/evals/tasks/observability/queries/memory-usage.yaml
@@ -0,0 +1,25 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "memory-usage"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can find the appropriate memory usage metric and query it
+    to determine which pods are consuming the most memory resources.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_memory"
+        reason: "Verify the agent queried a container memory metric (working_set_bytes or usage_bytes)"
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the response identifies specific pods"
+  prompt:
+    inline: |
+      Which pods are using the most memory? Show me the top 5.
diff --git a/evals/tasks/observability/queries/namespace-pod-count.yaml b/evals/tasks/observability/queries/namespace-pod-count.yaml
new file mode 100644
index 000000000..b180a917d
--- /dev/null
+++ b/evals/tasks/observability/queries/namespace-pod-count.yaml
@@ -0,0 +1,27 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "namespace-pod-count"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: multi-step
+  description: |
+    Tests multi-step reasoning: the agent must discover a suitable metric
+    via list_metrics, explore label values to find namespaces, then query
+    to determine which namespaces have the most pods. Requires chaining
+    discovery, label exploration, and a query.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube-system"
+        reason: "Verify the response lists actual namespace names from the cluster such as kube-system"
+    - llmJudge:
+        contains: "kube_pod"
+        reason: "Verify the agent used a kube_pod metric to count pods"
+  prompt:
+    inline: |
+      Which namespaces have the most running pods? Show me the top 5.
diff --git a/evals/tasks/observability/queries/namespace-resource-usage.yaml b/evals/tasks/observability/queries/namespace-resource-usage.yaml
new file mode 100644
index 000000000..4434ca52c
--- /dev/null
+++ b/evals/tasks/observability/queries/namespace-resource-usage.yaml
@@ -0,0 +1,28 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "namespace-resource-usage"
+  difficulty: hard
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: multi-step
+  description: |
+    Tests multi-step reasoning: the agent must discover CPU and memory metrics,
+    then query them with namespace-level aggregation to identify the top
+    resource-consuming namespaces. Requires chaining list_metrics, label
+    exploration, and multiple instant queries.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_cpu_usage_seconds_total"
+        reason: "Verify the agent discovered and used the correct CPU metric"
+    - llmJudge:
+        contains: "container_memory"
+        reason: "Verify the agent discovered and used a container memory metric (working_set_bytes or usage_bytes)"
+  prompt:
+    inline: |
+      Which namespace is consuming the most CPU and memory?
+      Show me the top namespace for each.
diff --git a/evals/tasks/observability/queries/network-traffic.yaml b/evals/tasks/observability/queries/network-traffic.yaml
new file mode 100644
index 000000000..9b1e06fd3
--- /dev/null
+++ b/evals/tasks/observability/queries/network-traffic.yaml
@@ -0,0 +1,25 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "network-traffic"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can discover network-related metrics and query them to find
+    which pods are receiving the most network traffic.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_network_receive_bytes_total"
+        reason: "Verify the agent queried the correct network metric"
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the response identifies specific pods"
+  prompt:
+    inline: |
+      Which pods are receiving the most network traffic?
diff --git a/evals/tasks/observability/queries/nonexistent-metric.yaml b/evals/tasks/observability/queries/nonexistent-metric.yaml
new file mode 100644
index 000000000..ed76ee25a
--- /dev/null
+++ b/evals/tasks/observability/queries/nonexistent-metric.yaml
@@ -0,0 +1,23 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "nonexistent-metric"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: error-handling
+  description: |
+    Tests agent recovery when querying a metric that does not exist.
+    The agent should discover that the metric is missing via list_metrics
+    and inform the user rather than fabricating results.
+spec:
+  verify:
+    - llmJudge:
+        contains: "not found"
+        reason: "Verify the agent communicates that the metric does not exist or was not found"
+  prompt:
+    inline: |
+      What is the current value of the metric fake_nonexistent_metric_total?
diff --git a/evals/tasks/observability/queries/nonexistent-namespace.yaml b/evals/tasks/observability/queries/nonexistent-namespace.yaml
new file mode 100644
index 000000000..78090c0c1
--- /dev/null
+++ b/evals/tasks/observability/queries/nonexistent-namespace.yaml
@@ -0,0 +1,23 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "nonexistent-namespace"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: error-handling
+  description: |
+    Tests agent behavior when querying for resources in a namespace that
+    does not exist. The agent should query and report empty results
+    gracefully rather than hallucinating data.
+spec:
+  verify:
+    - llmJudge:
+        contains: "no data"
+        reason: "Verify the agent reports no data, no results, or no pods found in the nonexistent namespace"
+  prompt:
+    inline: |
+      Show me the memory usage for all pods in the namespace called totally-fake-namespace-12345.
diff --git a/evals/tasks/observability/queries/pending-pods.yaml b/evals/tasks/observability/queries/pending-pods.yaml
new file mode 100644
index 000000000..f5b47e387
--- /dev/null
+++ b/evals/tasks/observability/queries/pending-pods.yaml
@@ -0,0 +1,23 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "pending-pods"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can identify pods stuck in pending state by first discovering
+    the kube_pod_status_phase metric and then running an instant query to find
+    pods with phase=Pending.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube_pod_status_phase"
+        reason: "Verify the agent used the correct metric for pod phase status"
+  prompt:
+    inline: |
+      Which pods are stuck in pending state?
diff --git a/evals/tasks/observability/queries/pods-created.yaml b/evals/tasks/observability/queries/pods-created.yaml
new file mode 100644
index 000000000..097a57e24
--- /dev/null
+++ b/evals/tasks/observability/queries/pods-created.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "pods-created"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: range-query
+  description: |
+    Tests if the agent can use a range query to find recently created pods by
+    discovering the kube_pod_created metric and querying it over a 5-minute window.
+spec:
+  verify:
+    - llmJudge:
+        contains: "kube_pod_created"
+        reason: "Verify the agent discovered and used the kube_pod_created metric"
+  prompt:
+    inline: |
+      How many pods were created in the last 5 minutes?
diff --git a/evals/tasks/observability/queries/prometheus-head-series.yaml b/evals/tasks/observability/queries/prometheus-head-series.yaml
new file mode 100644
index 000000000..9f62ffdf3
--- /dev/null
+++ b/evals/tasks/observability/queries/prometheus-head-series.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "prometheus-head-series"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can query Prometheus internal metrics to report the current
+    number of head series using prometheus_tsdb_head_series.
+spec:
+  verify:
+    - llmJudge:
+        contains: "prometheus_tsdb_head_series"
+        reason: "Verify the agent used the correct Prometheus TSDB metric"
+  prompt:
+    inline: |
+      How many head series does Prometheus have?
diff --git a/evals/tasks/observability/queries/prometheus-requests.yaml b/evals/tasks/observability/queries/prometheus-requests.yaml
new file mode 100644
index 000000000..f436130b4
--- /dev/null
+++ b/evals/tasks/observability/queries/prometheus-requests.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "prometheus-requests"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can calculate the request rate to Prometheus by discovering
+    and querying the prometheus_http_requests_total metric.
+spec:
+  verify:
+    - llmJudge:
+        contains: "prometheus_http_requests_total"
+        reason: "Verify the agent used the correct HTTP requests metric"
+  prompt:
+    inline: |
+      How many requests per second are being made to Prometheus?
diff --git a/evals/tasks/observability/queries/prometheus-wal-size.yaml b/evals/tasks/observability/queries/prometheus-wal-size.yaml
new file mode 100644
index 000000000..aa19ea451
--- /dev/null
+++ b/evals/tasks/observability/queries/prometheus-wal-size.yaml
@@ -0,0 +1,22 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "prometheus-wal-size"
+  difficulty: easy
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: query
+  description: |
+    Tests if the agent can query the current Prometheus WAL storage size using
+    the prometheus_tsdb_wal_storage_size_bytes metric.
+spec:
+  verify:
+    - llmJudge:
+        contains: "prometheus_tsdb_wal_storage_size_bytes"
+        reason: "Verify the agent used the correct WAL storage metric"
+  prompt:
+    inline: |
+      What is the current storage size of the Prometheus WAL?
diff --git a/evals/tasks/observability/queries/time-range-query.yaml b/evals/tasks/observability/queries/time-range-query.yaml
new file mode 100644
index 000000000..80a144be2
--- /dev/null
+++ b/evals/tasks/observability/queries/time-range-query.yaml
@@ -0,0 +1,26 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "time-range-query"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: multi-step
+  description: |
+    Tests whether the agent correctly uses execute_range_query with
+    appropriate start/end/step parameters when asked for data over
+    a specific time window.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_cpu_usage_seconds_total"
+        reason: "Verify the agent discovered and used the correct CPU metric"
+    - llmJudge:
+        contains: "pod"
+        reason: "Verify the response includes specific pod names from the query results"
+  prompt:
+    inline: |
+      Show me the CPU usage trend for pods in the default namespace over the last 30 minutes.
diff --git a/evals/tasks/observability/queries/visualize-cpu-usage.yaml b/evals/tasks/observability/queries/visualize-cpu-usage.yaml
new file mode 100644
index 000000000..74d88ac85
--- /dev/null
+++ b/evals/tasks/observability/queries/visualize-cpu-usage.yaml
@@ -0,0 +1,23 @@
+kind: Task
+apiVersion: mcpchecker/v1alpha2
+metadata:
+  name: "visualize-cpu-usage"
+  difficulty: medium
+  parallel: true
+  runs: 1
+  labels:
+    category: queries
+    suite: observability
+    toolType: visualization
+  description: |
+    Tests if the agent uses the show_timeseries tool to visualize CPU usage
+    as a chart. The agent should discover the metric, then use show_timeseries
+    to render a time-series visualization.
+spec:
+  verify:
+    - llmJudge:
+        contains: "container_cpu_usage_seconds_total"
+        reason: "Verify the agent discovered and visualized the correct CPU metric"
+  prompt:
+    inline: |
+      Visualize the CPU usage for pods in the default namespace over the last 30 minutes.