Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,19 @@ jobs:
env:
DATABASE_URL: ${{ steps.postgres.outputs.connection-uri }}

contract:
name: Canopy contract
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- name: Configure toolchain
run: |
rustup toolchain install --profile minimal --no-self-update stable
rustup default stable
- uses: Swatinem/rust-cache@v2
- name: Contract tests against live canopy
run: cargo test -p bestool --lib canopy_contract -- --ignored

clippy:
name: Clippy
runs-on: ubuntu-latest
Expand Down Expand Up @@ -112,7 +125,7 @@ jobs:
tests-pass:
if: always()
name: Tests pass
needs: [test, check-docs]
needs: [test, contract, check-docs]
runs-on: ubuntu-latest
steps:
- uses: re-actors/alls-green@release/v1
Expand Down
161 changes: 94 additions & 67 deletions crates/alertd/src/doctor/check.rs
Original file line number Diff line number Diff line change
@@ -1,31 +1,41 @@
use serde_json::{Map, Value, json};

/// Outcome of a single healthcheck.
///
/// Serialised on the wire as the per-check `result` field, a proper sum type
/// exhaustively matchable on both sides:
/// `passed | warning | failed | broken | skipped`.
#[derive(Debug, Clone)]
pub enum CheckStatus {
/// All good.
/// Check ran, system OK. `result: "passed"`.
Pass,
/// The check couldn't be run β€” either the platform doesn't support it, the
/// caller lacked the privilege to query the underlying source, or the
/// upstream (e.g. caddy `/metrics`) wasn't reachable. Reported as
/// `healthy: true` on the wire (we have no evidence of unhealth) but does
/// NOT count as "passing" in human-readable output.
/// Precondition not met; the check didn't run β€” either the platform
/// doesn't support it, the caller lacked the privilege to query the
/// underlying source, or the check doesn't apply to this server kind.
/// Says nothing about the system. `result: "skipped"`.
Skip(String),
/// Non-fatal degradation. Reported as `healthy: false` per-check on the
/// canopy wire format, but does NOT flip the top-level `healthy` flag.
/// Check ran, system degraded but not fatally. `result: "warning"`.
Warning(String),
/// Fatal failure. Sets `healthy: false` per-check AND flips top-level to
/// `healthy: false`.
/// Check ran, system under test is unhealthy. `result: "failed"`.
Fail(String),
/// The check itself errored or is misconfigured (e.g. its SQL no longer
/// matches the schema); says nothing about the system. `result: "broken"`.
Broken(String),
}

impl CheckStatus {
/// Whether this status maps to `healthy: true` in the per-check wire format.
pub fn is_healthy_on_wire(&self) -> bool {
matches!(self, CheckStatus::Pass | CheckStatus::Skip(_))
/// The `result` value for this status in the per-check wire format.
pub fn wire_result(&self) -> &'static str {
match self {
CheckStatus::Pass => "passed",
CheckStatus::Skip(_) => "skipped",
CheckStatus::Warning(_) => "warning",
CheckStatus::Fail(_) => "failed",
CheckStatus::Broken(_) => "broken",
}
}

/// Whether this status is fatal (flips top-level `healthy` to false).
/// Whether this status is fatal (the system under test is unhealthy).
pub fn is_fatal(&self) -> bool {
matches!(self, CheckStatus::Fail(_))
}
Expand Down Expand Up @@ -66,17 +76,15 @@ impl Check {
}
}

/// Build a Skip result. The `reason` is recorded in `details.reason` (or
/// kept on the status) so the operator sees *why* the check couldn't be
/// run; the summary is the short headline shown alongside `SKIP`.
/// Build a Skip result. The `reason` is kept on the status so the operator
/// sees *why* the check couldn't be run; the summary is the short headline
/// shown alongside `SKIP`.
pub fn skip(name: &'static str, summary: impl Into<String>, reason: impl Into<String>) -> Self {
let mut details = Map::new();
details.insert("skipped".into(), Value::Bool(true));
Self {
name,
status: CheckStatus::Skip(reason.into()),
summary: summary.into(),
details,
details: Map::new(),
payload_extras: Map::new(),
}
}
Expand Down Expand Up @@ -105,6 +113,22 @@ impl Check {
}
}

/// Build a Broken result: the check itself errored or is misconfigured,
/// which says nothing about the system under test.
pub fn broken(
name: &'static str,
summary: impl Into<String>,
reason: impl Into<String>,
) -> Self {
Self {
name,
status: CheckStatus::Broken(reason.into()),
summary: summary.into(),
details: Map::new(),
payload_extras: Map::new(),
}
}

pub fn with_detail(mut self, key: &str, value: impl Into<Value>) -> Self {
self.details.insert(key.to_string(), value.into());
self
Expand All @@ -127,7 +151,7 @@ impl Check {
pub fn to_wire(&self) -> Value {
let mut obj = Map::new();
obj.insert("check".into(), self.name.into());
obj.insert("healthy".into(), self.status.is_healthy_on_wire().into());
obj.insert("result".into(), self.status.wire_result().into());
for (k, v) in &self.details {
obj.insert(k.clone(), v.clone());
}
Expand All @@ -137,15 +161,16 @@ impl Check {
/// Encode this Check for streaming over the daemon's task endpoint.
///
/// Distinct from [`Self::to_wire`]: that one is the canopy-bound payload
/// (which collapses Warning/Fail status to a bare `healthy: false`); this
/// one preserves the full `CheckStatus` enum so consumers can render the
/// same colours and reason lines as a local sweep.
/// (which drops the reason); this one preserves the full `CheckStatus`
/// enum including reasons so consumers can render the same colours and
/// reason lines as a local sweep.
pub fn to_streaming_json(&self) -> Value {
let (status, reason) = match &self.status {
CheckStatus::Pass => ("pass", None),
CheckStatus::Skip(r) => ("skip", Some(r.as_str())),
CheckStatus::Warning(r) => ("warning", Some(r.as_str())),
CheckStatus::Fail(r) => ("fail", Some(r.as_str())),
CheckStatus::Broken(r) => ("broken", Some(r.as_str())),
};
let mut obj = json!({
"name": self.name,
Expand Down Expand Up @@ -181,6 +206,7 @@ impl Check {
("skip", Some(r)) => CheckStatus::Skip(r),
("warning", Some(r)) => CheckStatus::Warning(r),
("fail", Some(r)) => CheckStatus::Fail(r),
("broken", Some(r)) => CheckStatus::Broken(r),
_ => return None,
};
let summary = value.get("summary")?.as_str()?.to_string();
Expand Down Expand Up @@ -213,19 +239,14 @@ impl OverallResult {
OverallResult::Failing
} else if checks
.iter()
.any(|c| matches!(c.status, CheckStatus::Warning(_)))
.any(|c| matches!(c.status, CheckStatus::Warning(_) | CheckStatus::Broken(_)))
{
OverallResult::Degraded
} else {
OverallResult::Healthy
}
}

/// Whether the top-level `healthy` flag on the wire is `true`.
pub fn is_healthy_top_level(self) -> bool {
!matches!(self, OverallResult::Failing)
}

pub fn label(self) -> &'static str {
match self {
OverallResult::Healthy => "HEALTHY",
Expand All @@ -240,35 +261,40 @@ mod tests {
use super::*;

#[test]
fn pass_is_healthy_on_wire() {
assert!(Check::pass("x", "ok").status.is_healthy_on_wire());
fn wire_results() {
assert_eq!(CheckStatus::Pass.wire_result(), "passed");
assert_eq!(CheckStatus::Skip("r".into()).wire_result(), "skipped");
assert_eq!(CheckStatus::Warning("r".into()).wire_result(), "warning");
assert_eq!(CheckStatus::Fail("r".into()).wire_result(), "failed");
assert_eq!(CheckStatus::Broken("r".into()).wire_result(), "broken");
}

#[test]
fn warning_is_unhealthy_on_wire_but_not_fatal() {
let s = CheckStatus::Warning("w".into());
assert!(!s.is_healthy_on_wire());
assert!(!s.is_fatal());
fn warning_is_not_fatal() {
assert!(!CheckStatus::Warning("w".into()).is_fatal());
}

#[test]
fn fail_is_unhealthy_and_fatal() {
let s = CheckStatus::Fail("f".into());
assert!(!s.is_healthy_on_wire());
assert!(s.is_fatal());
fn fail_is_fatal() {
assert!(CheckStatus::Fail("f".into()).is_fatal());
}

#[test]
fn skip_is_healthy_on_wire_and_not_fatal() {
// Skip means "we didn't run this check" β€” it must not fire alerts
// or flip the top-level healthy flag, since we have no evidence of
// unhealth either way.
fn skip_is_not_fatal() {
// Skip means "we didn't run this check" β€” it must not fire alerts,
// since we have no evidence of unhealth either way.
let s = CheckStatus::Skip("reason".into());
assert!(s.is_healthy_on_wire());
assert!(!s.is_fatal());
assert!(s.is_skip());
}

#[test]
fn broken_is_not_fatal() {
// Broken means the check itself errored β€” it says nothing about the
// system under test, so it must not flag the deployment as failing.
assert!(!CheckStatus::Broken("reason".into()).is_fatal());
}

#[test]
fn skip_does_not_change_overall_result() {
let with_skip = vec![Check::pass("a", ""), Check::skip("b", "", "r")];
Expand All @@ -278,16 +304,6 @@ mod tests {
);
}

#[test]
fn skip_constructor_marks_skipped_detail() {
let c = Check::skip("memory", "not available", "platform mismatch");
assert_eq!(
c.details.get("skipped").and_then(Value::as_bool),
Some(true)
);
assert!(matches!(c.status, CheckStatus::Skip(_)));
}

#[test]
fn overall_from_checks() {
let healthy = vec![Check::pass("a", "")];
Expand All @@ -299,30 +315,41 @@ mod tests {
OverallResult::Degraded
);

let broken = vec![Check::pass("a", ""), Check::broken("b", "", "x")];
assert_eq!(OverallResult::from_checks(&broken), OverallResult::Degraded);

let failing = vec![Check::warning("a", "", "x"), Check::fail("b", "", "y")];
assert_eq!(OverallResult::from_checks(&failing), OverallResult::Failing);
}

#[test]
fn overall_top_level_healthy_only_true_when_not_failing() {
assert!(OverallResult::Healthy.is_healthy_top_level());
assert!(OverallResult::Degraded.is_healthy_top_level());
assert!(!OverallResult::Failing.is_healthy_top_level());
}

#[test]
fn check_to_wire_pass() {
let c = Check::pass("db_connect", "ok").with_detail("latency_ms", 3);
let v = c.to_wire();
assert_eq!(v["check"], "db_connect");
assert_eq!(v["healthy"], true);
assert_eq!(v["result"], "passed");
assert_eq!(v["latency_ms"], 3);
}

#[test]
fn check_to_wire_warning_marks_unhealthy() {
let c = Check::warning("disk_free", "20% used", "below threshold");
let v = c.to_wire();
assert_eq!(v["healthy"], false);
fn check_to_wire_statuses() {
let warn = Check::warning("disk_free", "20% used", "below threshold");
assert_eq!(warn.to_wire()["result"], "warning");
let fail = Check::fail("disk_free", "1% free", "out of space");
assert_eq!(fail.to_wire()["result"], "failed");
let broken = Check::broken("x", "query broken", "no such column");
assert_eq!(broken.to_wire()["result"], "broken");
let skip = Check::skip("x", "n/a", "central-only");
assert_eq!(skip.to_wire()["result"], "skipped");
}

#[test]
fn broken_round_trips_through_streaming_json() {
let c = Check::broken("x", "query broken", "no such column");
let v = c.to_streaming_json();
assert_eq!(v["status"], "broken");
assert_eq!(v["reason"], "no such column");
let back = Check::from_streaming_json(&v, |_| Some("x")).unwrap();
assert!(matches!(back.status, CheckStatus::Broken(r) if r == "no such column"));
}
}
70 changes: 70 additions & 0 deletions crates/alertd/src/doctor/checks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,25 @@ pub fn fmt_db_error(err: &tokio_postgres::Error) -> String {
fmt_chain(err)
}

/// Build the Check for a query that errored, classified by SQLSTATE.
///
/// Class 42 ("syntax error or access rule violation": dropped or renamed
/// columns, json/jsonb drift, missing functions) means the check's own SQL no
/// longer matches the schema β€” a fault in the healthcheck, not the deployment
/// β€” so it reports as BROKEN rather than flagging the server as failing.
/// Everything else stays FAIL.
pub fn query_error_check(name: &'static str, err: &tokio_postgres::Error) -> Check {
let reason = fmt_db_error(err);
if err
.as_db_error()
.is_some_and(|db| db.code().code().starts_with("42"))
{
Check::broken(name, "healthcheck query broken", reason)
} else {
Check::fail(name, "query failed", reason)
}
}

/// Walk a `std::error::Error`'s source chain and join all the messages.
///
/// `reqwest::Error`'s Display is just "error sending request for url (...)";
Expand Down Expand Up @@ -243,3 +262,54 @@ pub mod test_support {
}
}
}

#[cfg(test)]
mod tests {
use serde_json::Value;

use super::{query_error_check, test_support::central_ctx};
use crate::doctor::check::CheckStatus;

async fn query_err(sql: &str) -> Option<tokio_postgres::Error> {
let ctx = central_ctx().await?;
let client = ctx.db.expect("central_ctx always has a client");
Some(
client
.query(sql, &[])
.await
.expect_err("query should error"),
)
}

#[tokio::test]
async fn schema_drift_is_broken() {
// 42P01 undefined_table β€” the shape a dropped/renamed relation takes.
let Some(err) = query_err("SELECT nope FROM no_such_table_bestool_test").await else {
return;
};
let check = query_error_check("x", &err);
assert!(matches!(check.status, CheckStatus::Broken(_)));
assert_eq!(check.to_wire()["result"], Value::from("broken"));
}

#[tokio::test]
async fn syntax_error_is_broken() {
// 42601 syntax_error.
let Some(err) = query_err("SELECT FROM WHERE").await else {
return;
};
let check = query_error_check("x", &err);
assert!(matches!(check.status, CheckStatus::Broken(_)));
}

#[tokio::test]
async fn runtime_db_error_still_fails() {
// 22012 division_by_zero β€” not class 42, so the deployment is blamed.
let Some(err) = query_err("SELECT 1/0").await else {
return;
};
let check = query_error_check("x", &err);
assert!(check.status.is_fatal());
assert_eq!(check.to_wire()["result"], Value::from("failed"));
}
}
Loading