diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dc872be1..1ee537f0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -73,6 +73,19 @@ jobs: env: DATABASE_URL: ${{ steps.postgres.outputs.connection-uri }} + contract: + name: Canopy contract + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Configure toolchain + run: | + rustup toolchain install --profile minimal --no-self-update stable + rustup default stable + - uses: Swatinem/rust-cache@v2 + - name: Contract tests against live canopy + run: cargo test -p bestool --lib canopy_contract -- --ignored + clippy: name: Clippy runs-on: ubuntu-latest @@ -112,7 +125,7 @@ jobs: tests-pass: if: always() name: Tests pass - needs: [test, check-docs] + needs: [test, contract, check-docs] runs-on: ubuntu-latest steps: - uses: re-actors/alls-green@release/v1 diff --git a/crates/alertd/src/doctor/check.rs b/crates/alertd/src/doctor/check.rs index 24751a38..32a1920d 100644 --- a/crates/alertd/src/doctor/check.rs +++ b/crates/alertd/src/doctor/check.rs @@ -1,31 +1,41 @@ use serde_json::{Map, Value, json}; /// Outcome of a single healthcheck. +/// +/// Serialised on the wire as the per-check `result` field, a proper sum type +/// exhaustively matchable on both sides: +/// `passed | warning | failed | broken | skipped`. #[derive(Debug, Clone)] pub enum CheckStatus { - /// All good. + /// Check ran, system OK. `result: "passed"`. Pass, - /// The check couldn't be run — either the platform doesn't support it, the - /// caller lacked the privilege to query the underlying source, or the - /// upstream (e.g. caddy `/metrics`) wasn't reachable. Reported as - /// `healthy: true` on the wire (we have no evidence of unhealth) but does - /// NOT count as "passing" in human-readable output. + /// Precondition not met; the check didn't run — either the platform + /// doesn't support it, the caller lacked the privilege to query the + /// underlying source, or the check doesn't apply to this server kind. + /// Says nothing about the system. `result: "skipped"`. Skip(String), - /// Non-fatal degradation. Reported as `healthy: false` per-check on the - /// canopy wire format, but does NOT flip the top-level `healthy` flag. + /// Check ran, system degraded but not fatally. `result: "warning"`. Warning(String), - /// Fatal failure. Sets `healthy: false` per-check AND flips top-level to - /// `healthy: false`. + /// Check ran, system under test is unhealthy. `result: "failed"`. Fail(String), + /// The check itself errored or is misconfigured (e.g. its SQL no longer + /// matches the schema); says nothing about the system. `result: "broken"`. + Broken(String), } impl CheckStatus { - /// Whether this status maps to `healthy: true` in the per-check wire format. - pub fn is_healthy_on_wire(&self) -> bool { - matches!(self, CheckStatus::Pass | CheckStatus::Skip(_)) + /// The `result` value for this status in the per-check wire format. + pub fn wire_result(&self) -> &'static str { + match self { + CheckStatus::Pass => "passed", + CheckStatus::Skip(_) => "skipped", + CheckStatus::Warning(_) => "warning", + CheckStatus::Fail(_) => "failed", + CheckStatus::Broken(_) => "broken", + } } - /// Whether this status is fatal (flips top-level `healthy` to false). + /// Whether this status is fatal (the system under test is unhealthy). pub fn is_fatal(&self) -> bool { matches!(self, CheckStatus::Fail(_)) } @@ -66,17 +76,15 @@ impl Check { } } - /// Build a Skip result. The `reason` is recorded in `details.reason` (or - /// kept on the status) so the operator sees *why* the check couldn't be - /// run; the summary is the short headline shown alongside `SKIP`. + /// Build a Skip result. The `reason` is kept on the status so the operator + /// sees *why* the check couldn't be run; the summary is the short headline + /// shown alongside `SKIP`. pub fn skip(name: &'static str, summary: impl Into, reason: impl Into) -> Self { - let mut details = Map::new(); - details.insert("skipped".into(), Value::Bool(true)); Self { name, status: CheckStatus::Skip(reason.into()), summary: summary.into(), - details, + details: Map::new(), payload_extras: Map::new(), } } @@ -105,6 +113,22 @@ impl Check { } } + /// Build a Broken result: the check itself errored or is misconfigured, + /// which says nothing about the system under test. + pub fn broken( + name: &'static str, + summary: impl Into, + reason: impl Into, + ) -> Self { + Self { + name, + status: CheckStatus::Broken(reason.into()), + summary: summary.into(), + details: Map::new(), + payload_extras: Map::new(), + } + } + pub fn with_detail(mut self, key: &str, value: impl Into) -> Self { self.details.insert(key.to_string(), value.into()); self @@ -127,7 +151,7 @@ impl Check { pub fn to_wire(&self) -> Value { let mut obj = Map::new(); obj.insert("check".into(), self.name.into()); - obj.insert("healthy".into(), self.status.is_healthy_on_wire().into()); + obj.insert("result".into(), self.status.wire_result().into()); for (k, v) in &self.details { obj.insert(k.clone(), v.clone()); } @@ -137,15 +161,16 @@ impl Check { /// Encode this Check for streaming over the daemon's task endpoint. /// /// Distinct from [`Self::to_wire`]: that one is the canopy-bound payload - /// (which collapses Warning/Fail status to a bare `healthy: false`); this - /// one preserves the full `CheckStatus` enum so consumers can render the - /// same colours and reason lines as a local sweep. + /// (which drops the reason); this one preserves the full `CheckStatus` + /// enum including reasons so consumers can render the same colours and + /// reason lines as a local sweep. pub fn to_streaming_json(&self) -> Value { let (status, reason) = match &self.status { CheckStatus::Pass => ("pass", None), CheckStatus::Skip(r) => ("skip", Some(r.as_str())), CheckStatus::Warning(r) => ("warning", Some(r.as_str())), CheckStatus::Fail(r) => ("fail", Some(r.as_str())), + CheckStatus::Broken(r) => ("broken", Some(r.as_str())), }; let mut obj = json!({ "name": self.name, @@ -181,6 +206,7 @@ impl Check { ("skip", Some(r)) => CheckStatus::Skip(r), ("warning", Some(r)) => CheckStatus::Warning(r), ("fail", Some(r)) => CheckStatus::Fail(r), + ("broken", Some(r)) => CheckStatus::Broken(r), _ => return None, }; let summary = value.get("summary")?.as_str()?.to_string(); @@ -213,7 +239,7 @@ impl OverallResult { OverallResult::Failing } else if checks .iter() - .any(|c| matches!(c.status, CheckStatus::Warning(_))) + .any(|c| matches!(c.status, CheckStatus::Warning(_) | CheckStatus::Broken(_))) { OverallResult::Degraded } else { @@ -221,11 +247,6 @@ impl OverallResult { } } - /// Whether the top-level `healthy` flag on the wire is `true`. - pub fn is_healthy_top_level(self) -> bool { - !matches!(self, OverallResult::Failing) - } - pub fn label(self) -> &'static str { match self { OverallResult::Healthy => "HEALTHY", @@ -240,35 +261,40 @@ mod tests { use super::*; #[test] - fn pass_is_healthy_on_wire() { - assert!(Check::pass("x", "ok").status.is_healthy_on_wire()); + fn wire_results() { + assert_eq!(CheckStatus::Pass.wire_result(), "passed"); + assert_eq!(CheckStatus::Skip("r".into()).wire_result(), "skipped"); + assert_eq!(CheckStatus::Warning("r".into()).wire_result(), "warning"); + assert_eq!(CheckStatus::Fail("r".into()).wire_result(), "failed"); + assert_eq!(CheckStatus::Broken("r".into()).wire_result(), "broken"); } #[test] - fn warning_is_unhealthy_on_wire_but_not_fatal() { - let s = CheckStatus::Warning("w".into()); - assert!(!s.is_healthy_on_wire()); - assert!(!s.is_fatal()); + fn warning_is_not_fatal() { + assert!(!CheckStatus::Warning("w".into()).is_fatal()); } #[test] - fn fail_is_unhealthy_and_fatal() { - let s = CheckStatus::Fail("f".into()); - assert!(!s.is_healthy_on_wire()); - assert!(s.is_fatal()); + fn fail_is_fatal() { + assert!(CheckStatus::Fail("f".into()).is_fatal()); } #[test] - fn skip_is_healthy_on_wire_and_not_fatal() { - // Skip means "we didn't run this check" — it must not fire alerts - // or flip the top-level healthy flag, since we have no evidence of - // unhealth either way. + fn skip_is_not_fatal() { + // Skip means "we didn't run this check" — it must not fire alerts, + // since we have no evidence of unhealth either way. let s = CheckStatus::Skip("reason".into()); - assert!(s.is_healthy_on_wire()); assert!(!s.is_fatal()); assert!(s.is_skip()); } + #[test] + fn broken_is_not_fatal() { + // Broken means the check itself errored — it says nothing about the + // system under test, so it must not flag the deployment as failing. + assert!(!CheckStatus::Broken("reason".into()).is_fatal()); + } + #[test] fn skip_does_not_change_overall_result() { let with_skip = vec![Check::pass("a", ""), Check::skip("b", "", "r")]; @@ -278,16 +304,6 @@ mod tests { ); } - #[test] - fn skip_constructor_marks_skipped_detail() { - let c = Check::skip("memory", "not available", "platform mismatch"); - assert_eq!( - c.details.get("skipped").and_then(Value::as_bool), - Some(true) - ); - assert!(matches!(c.status, CheckStatus::Skip(_))); - } - #[test] fn overall_from_checks() { let healthy = vec![Check::pass("a", "")]; @@ -299,30 +315,41 @@ mod tests { OverallResult::Degraded ); + let broken = vec![Check::pass("a", ""), Check::broken("b", "", "x")]; + assert_eq!(OverallResult::from_checks(&broken), OverallResult::Degraded); + let failing = vec![Check::warning("a", "", "x"), Check::fail("b", "", "y")]; assert_eq!(OverallResult::from_checks(&failing), OverallResult::Failing); } - #[test] - fn overall_top_level_healthy_only_true_when_not_failing() { - assert!(OverallResult::Healthy.is_healthy_top_level()); - assert!(OverallResult::Degraded.is_healthy_top_level()); - assert!(!OverallResult::Failing.is_healthy_top_level()); - } - #[test] fn check_to_wire_pass() { let c = Check::pass("db_connect", "ok").with_detail("latency_ms", 3); let v = c.to_wire(); assert_eq!(v["check"], "db_connect"); - assert_eq!(v["healthy"], true); + assert_eq!(v["result"], "passed"); assert_eq!(v["latency_ms"], 3); } #[test] - fn check_to_wire_warning_marks_unhealthy() { - let c = Check::warning("disk_free", "20% used", "below threshold"); - let v = c.to_wire(); - assert_eq!(v["healthy"], false); + fn check_to_wire_statuses() { + let warn = Check::warning("disk_free", "20% used", "below threshold"); + assert_eq!(warn.to_wire()["result"], "warning"); + let fail = Check::fail("disk_free", "1% free", "out of space"); + assert_eq!(fail.to_wire()["result"], "failed"); + let broken = Check::broken("x", "query broken", "no such column"); + assert_eq!(broken.to_wire()["result"], "broken"); + let skip = Check::skip("x", "n/a", "central-only"); + assert_eq!(skip.to_wire()["result"], "skipped"); + } + + #[test] + fn broken_round_trips_through_streaming_json() { + let c = Check::broken("x", "query broken", "no such column"); + let v = c.to_streaming_json(); + assert_eq!(v["status"], "broken"); + assert_eq!(v["reason"], "no such column"); + let back = Check::from_streaming_json(&v, |_| Some("x")).unwrap(); + assert!(matches!(back.status, CheckStatus::Broken(r) if r == "no such column")); } } diff --git a/crates/alertd/src/doctor/checks.rs b/crates/alertd/src/doctor/checks.rs index 60d9b0a1..22b7bf51 100644 --- a/crates/alertd/src/doctor/checks.rs +++ b/crates/alertd/src/doctor/checks.rs @@ -85,6 +85,25 @@ pub fn fmt_db_error(err: &tokio_postgres::Error) -> String { fmt_chain(err) } +/// Build the Check for a query that errored, classified by SQLSTATE. +/// +/// Class 42 ("syntax error or access rule violation": dropped or renamed +/// columns, json/jsonb drift, missing functions) means the check's own SQL no +/// longer matches the schema — a fault in the healthcheck, not the deployment +/// — so it reports as BROKEN rather than flagging the server as failing. +/// Everything else stays FAIL. +pub fn query_error_check(name: &'static str, err: &tokio_postgres::Error) -> Check { + let reason = fmt_db_error(err); + if err + .as_db_error() + .is_some_and(|db| db.code().code().starts_with("42")) + { + Check::broken(name, "healthcheck query broken", reason) + } else { + Check::fail(name, "query failed", reason) + } +} + /// Walk a `std::error::Error`'s source chain and join all the messages. /// /// `reqwest::Error`'s Display is just "error sending request for url (...)"; @@ -243,3 +262,54 @@ pub mod test_support { } } } + +#[cfg(test)] +mod tests { + use serde_json::Value; + + use super::{query_error_check, test_support::central_ctx}; + use crate::doctor::check::CheckStatus; + + async fn query_err(sql: &str) -> Option { + let ctx = central_ctx().await?; + let client = ctx.db.expect("central_ctx always has a client"); + Some( + client + .query(sql, &[]) + .await + .expect_err("query should error"), + ) + } + + #[tokio::test] + async fn schema_drift_is_broken() { + // 42P01 undefined_table — the shape a dropped/renamed relation takes. + let Some(err) = query_err("SELECT nope FROM no_such_table_bestool_test").await else { + return; + }; + let check = query_error_check("x", &err); + assert!(matches!(check.status, CheckStatus::Broken(_))); + assert_eq!(check.to_wire()["result"], Value::from("broken")); + } + + #[tokio::test] + async fn syntax_error_is_broken() { + // 42601 syntax_error. + let Some(err) = query_err("SELECT FROM WHERE").await else { + return; + }; + let check = query_error_check("x", &err); + assert!(matches!(check.status, CheckStatus::Broken(_))); + } + + #[tokio::test] + async fn runtime_db_error_still_fails() { + // 22012 division_by_zero — not class 42, so the deployment is blamed. + let Some(err) = query_err("SELECT 1/0").await else { + return; + }; + let check = query_error_check("x", &err); + assert!(check.status.is_fatal()); + assert_eq!(check.to_wire()["result"], Value::from("failed")); + } +} diff --git a/crates/alertd/src/doctor/checks/caddy_version.rs b/crates/alertd/src/doctor/checks/caddy_version.rs index a251ff81..a479c59c 100644 --- a/crates/alertd/src/doctor/checks/caddy_version.rs +++ b/crates/alertd/src/doctor/checks/caddy_version.rs @@ -189,6 +189,7 @@ mod tests { Skip(_) => "skip", Warning(_) => "warning", Fail(_) => "fail", + Broken(_) => "broken", } } diff --git a/crates/alertd/src/doctor/checks/db_version.rs b/crates/alertd/src/doctor/checks/db_version.rs index 7b9b45b7..e8caedca 100644 --- a/crates/alertd/src/doctor/checks/db_version.rs +++ b/crates/alertd/src/doctor/checks/db_version.rs @@ -1,4 +1,4 @@ -use super::CheckContext; +use super::{CheckContext, query_error_check}; use crate::doctor::check::Check; pub async fn run(ctx: CheckContext) -> Check { @@ -11,6 +11,6 @@ pub async fn run(ctx: CheckContext) -> Check { Ok(v) => Check::pass("db_version", v.clone()).with_detail("pg_version", v), Err(err) => Check::fail("db_version", "row decode failed", err.to_string()), }, - Err(err) => Check::fail("db_version", "SELECT version() failed", err.to_string()), + Err(err) => query_error_check("db_version", &err), } } diff --git a/crates/alertd/src/doctor/checks/external_users.rs b/crates/alertd/src/doctor/checks/external_users.rs index 42c641c1..a0cc7d0f 100644 --- a/crates/alertd/src/doctor/checks/external_users.rs +++ b/crates/alertd/src/doctor/checks/external_users.rs @@ -16,8 +16,8 @@ //! "is this *person* connected" is what's measured, even if their Windows //! session ID changes across reconnects); fall back to user@line otherwise. //! -//! A single threshold: sessions of 12h+ produce a warning ("healthy: false" -//! on the wire for this check, but never flips the overall result to FAILING). +//! A single threshold: sessions of 12h+ produce a warning (`result: "warning"` +//! on the wire for this check, which never flips the overall result to FAILING). //! A long-lived session can only ever warn, not fail — so a forgotten RDP //! session won't take the whole doctor result down. //! diff --git a/crates/alertd/src/doctor/checks/fhir_jobs.rs b/crates/alertd/src/doctor/checks/fhir_jobs.rs index bea2455b..20735d62 100644 --- a/crates/alertd/src/doctor/checks/fhir_jobs.rs +++ b/crates/alertd/src/doctor/checks/fhir_jobs.rs @@ -9,7 +9,7 @@ use jiff::Timestamp; use serde_json::{Map, Value}; -use super::{CheckContext, fmt_db_error}; +use super::{CheckContext, query_error_check}; use crate::doctor::check::Check; const WARN_DEPTH: i64 = 200; @@ -19,8 +19,11 @@ const FAIL_OLDEST_SECS: i64 = 60 * 60; // 1h pub async fn run(ctx: CheckContext) -> Check { if ctx.config.is_facility() { - return Check::pass("fhir_jobs", "not applicable on facility server") - .with_detail("skipped", true); + return Check::skip( + "fhir_jobs", + "not applicable on facility server", + "central-only check", + ); } let Some(client) = ctx.db.as_deref() else { @@ -41,10 +44,9 @@ pub async fn run(ctx: CheckContext) -> Check { && (db.code() == &tokio_postgres::error::SqlState::UNDEFINED_TABLE || db.code() == &tokio_postgres::error::SqlState::INVALID_SCHEMA_NAME) { - return Check::pass("fhir_jobs", "fhir.jobs table not present") - .with_detail("skipped", true); + return Check::skip("fhir_jobs", "fhir.jobs table not present", "table absent"); } - return Check::fail("fhir_jobs", "query failed", fmt_db_error(&err)); + return query_error_check("fhir_jobs", &err); } }; diff --git a/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs b/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs index f18cdf7a..b6f7f8f6 100644 --- a/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs +++ b/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs @@ -4,7 +4,7 @@ //! unresolved for over an hour, tiering on the longest outstanding duration: //! WARN past 1h, FAIL past 6h. -use super::{CheckContext, fmt_db_error}; +use super::{CheckContext, query_error_check}; use crate::doctor::check::Check; use bestool_tamanu::ApiServerKind; use serde_json::{Value, json}; @@ -34,7 +34,7 @@ pub async fn run(ctx: CheckContext) -> Check { let rows = match client.query(SQL, &[]).await { Ok(r) => r, - Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)), + Err(err) => return query_error_check(NAME, &err), }; if rows.is_empty() { diff --git a/crates/alertd/src/doctor/checks/migrations.rs b/crates/alertd/src/doctor/checks/migrations.rs index c2068156..9151f1c4 100644 --- a/crates/alertd/src/doctor/checks/migrations.rs +++ b/crates/alertd/src/doctor/checks/migrations.rs @@ -1,4 +1,4 @@ -use super::CheckContext; +use super::{CheckContext, query_error_check}; use crate::doctor::check::Check; pub async fn run(ctx: CheckContext) -> Check { @@ -18,6 +18,6 @@ pub async fn run(ctx: CheckContext) -> Check { "no migrations applied", "SequelizeMeta is empty", ), - Err(err) => Check::fail("migrations", "query failed", err.to_string()), + Err(err) => query_error_check("migrations", &err), } } diff --git a/crates/alertd/src/doctor/checks/sync_facility_stale.rs b/crates/alertd/src/doctor/checks/sync_facility_stale.rs index 9daa36a1..c0576d41 100644 --- a/crates/alertd/src/doctor/checks/sync_facility_stale.rs +++ b/crates/alertd/src/doctor/checks/sync_facility_stale.rs @@ -7,7 +7,7 @@ use serde_json::{Value, json}; -use super::{CheckContext, fmt_db_error}; +use super::{CheckContext, query_error_check}; use crate::doctor::check::Check; use bestool_tamanu::ApiServerKind; @@ -48,7 +48,7 @@ pub async fn run(ctx: CheckContext) -> Check { let rows = match client.query(SQL, &[]).await { Ok(r) => r, - Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)), + Err(err) => return query_error_check(NAME, &err), }; let mut warn = Vec::new(); diff --git a/crates/alertd/src/doctor/checks/sync_lookup.rs b/crates/alertd/src/doctor/checks/sync_lookup.rs index 4c229c73..7d9dc275 100644 --- a/crates/alertd/src/doctor/checks/sync_lookup.rs +++ b/crates/alertd/src/doctor/checks/sync_lookup.rs @@ -4,7 +4,7 @@ //! staleness: WARN past 2 minutes, FAIL past 5. If the tracking row is absent, //! treat the lookup as not tracked and pass. -use super::{CheckContext, fmt_db_error}; +use super::{CheckContext, query_error_check}; use crate::doctor::check::{Check, CheckStatus}; use bestool_tamanu::ApiServerKind; @@ -31,7 +31,7 @@ pub async fn run(ctx: CheckContext) -> Check { let row = match client.query_opt(SQL, &[]).await { Ok(Some(r)) => r, Ok(None) => return Check::pass(NAME, "lookup table not tracked"), - Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)), + Err(err) => return query_error_check(NAME, &err), }; let last_sync_tick: Option = row.try_get("last_sync_tick").ok(); diff --git a/crates/alertd/src/doctor/checks/sync_restart_loop.rs b/crates/alertd/src/doctor/checks/sync_restart_loop.rs index cf5fb7a0..12afdde5 100644 --- a/crates/alertd/src/doctor/checks/sync_restart_loop.rs +++ b/crates/alertd/src/doctor/checks/sync_restart_loop.rs @@ -6,7 +6,7 @@ use serde_json::{Value, json}; -use super::{CheckContext, fmt_db_error}; +use super::{CheckContext, query_error_check}; use crate::doctor::check::Check; use bestool_tamanu::ApiServerKind; @@ -35,7 +35,7 @@ pub async fn run(ctx: CheckContext) -> Check { let rows = match client.query(SQL, &[]).await { Ok(r) => r, - Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)), + Err(err) => return query_error_check(NAME, &err), }; let mut warn = Vec::new(); diff --git a/crates/alertd/src/doctor/checks/sync_session_errors.rs b/crates/alertd/src/doctor/checks/sync_session_errors.rs index 8b15f0db..d986fe2b 100644 --- a/crates/alertd/src/doctor/checks/sync_session_errors.rs +++ b/crates/alertd/src/doctor/checks/sync_session_errors.rs @@ -50,11 +50,11 @@ pub async fn run(ctx: CheckContext) -> Check { let mobile = match fetch_rows(client, MOBILE_SQL, &[]).await { Ok(set) => set, - Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)), + Err(err) => return super::query_error_check(NAME, &err), }; let server = match fetch_rows(client, SERVER_SQL, &[]).await { Ok(set) => set, - Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)), + Err(err) => return super::query_error_check(NAME, &err), }; if mobile.is_empty() && server.is_empty() { diff --git a/crates/alertd/src/doctor/checks/sync_sessions.rs b/crates/alertd/src/doctor/checks/sync_sessions.rs index 34252dac..8e2c0dc6 100644 --- a/crates/alertd/src/doctor/checks/sync_sessions.rs +++ b/crates/alertd/src/doctor/checks/sync_sessions.rs @@ -1,6 +1,6 @@ use jiff::Timestamp; -use super::{CheckContext, fmt_db_error}; +use super::{CheckContext, query_error_check}; use crate::doctor::check::Check; pub async fn run(ctx: CheckContext) -> Check { @@ -30,10 +30,13 @@ pub async fn run(ctx: CheckContext) -> Check { if let Some(db) = err.as_db_error() && db.code() == &tokio_postgres::error::SqlState::UNDEFINED_TABLE { - return Check::pass("sync_sessions", "sync_sessions table not present") - .with_detail("skipped", true); + return Check::skip( + "sync_sessions", + "sync_sessions table not present", + "table absent", + ); } - return Check::fail("sync_sessions", "query failed", fmt_db_error(&err)); + return query_error_check("sync_sessions", &err); } }; diff --git a/crates/alertd/src/doctor/checks/tamanu_service.rs b/crates/alertd/src/doctor/checks/tamanu_service.rs index 21110809..288164eb 100644 --- a/crates/alertd/src/doctor/checks/tamanu_service.rs +++ b/crates/alertd/src/doctor/checks/tamanu_service.rs @@ -18,8 +18,11 @@ pub async fn run(ctx: CheckContext) -> Check { } else if cfg!(target_os = "windows") { Supervisor::Pm2 } else { - return Check::pass("tamanu_service", "service check skipped on this platform") - .with_detail("skipped", true); + return Check::skip( + "tamanu_service", + "service check skipped on this platform", + "no supervisor support on this platform", + ); }; // Patient-portal expectation is gated on Tamanu's own `features.patientPortal` diff --git a/crates/alertd/src/doctor/checks/util.rs b/crates/alertd/src/doctor/checks/util.rs index 6ab1cefe..e0f4fc75 100644 --- a/crates/alertd/src/doctor/checks/util.rs +++ b/crates/alertd/src/doctor/checks/util.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use serde_json::Value; use tokio_postgres::{Client as PgClient, types::ToSql}; -use super::fmt_db_error; +use super::query_error_check; use crate::doctor::check::Check; /// Rows reported in `details` are capped here; one extra row is fetched to @@ -112,7 +112,7 @@ pub async fn tiered_rows_check( .with_detail("truncated", set.truncated) .with_detail("count", count) } - Err(err) => Check::fail(name, "query failed", fmt_db_error(&err)), + Err(err) => query_error_check(name, &err), } } diff --git a/crates/alertd/src/doctor/sweep.rs b/crates/alertd/src/doctor/sweep.rs index 6896c5a0..2751c7dc 100644 --- a/crates/alertd/src/doctor/sweep.rs +++ b/crates/alertd/src/doctor/sweep.rs @@ -137,7 +137,7 @@ pub async fn perform_sweep( let overall = OverallResult::from_checks(&results.iter().map(|(c, _)| c.clone()).collect::>()); - let payload = build_payload(&info_value, &results, overall); + let payload = build_payload(&info_value, &results); Ok(SweepResult { server_id, @@ -193,34 +193,24 @@ async fn collect_server_facts( } pub fn overall_from_payload(payload: &Value) -> OverallResult { - let healthy = payload - .get("healthy") - .and_then(Value::as_bool) - .unwrap_or(true); - if !healthy { - return OverallResult::Failing; - } - // `healthy: true` covers both Healthy and Degraded — peek at the - // per-check entries to disambiguate. A `healthy: false` entry in a - // top-level-healthy payload means a warning was logged. - let degraded = payload - .get("health") - .and_then(Value::as_array) - .map(|arr| { - arr.iter().any(|c| { - c.get("healthy") == Some(&Value::Bool(false)) - && c.get("skipped") != Some(&Value::Bool(true)) - }) - }) - .unwrap_or(false); - if degraded { + let results = || { + payload + .get("health") + .and_then(Value::as_array) + .into_iter() + .flatten() + .filter_map(|c| c.get("result").and_then(Value::as_str)) + }; + if results().any(|r| r == "failed") { + OverallResult::Failing + } else if results().any(|r| r == "warning" || r == "broken") { OverallResult::Degraded } else { OverallResult::Healthy } } -fn build_payload(info: &Value, results: &[(Check, bool)], overall: OverallResult) -> Value { +fn build_payload(info: &Value, results: &[(Check, bool)]) -> Value { let mut payload: Map = match info { Value::Object(o) => o.clone(), _ => Map::new(), @@ -242,7 +232,6 @@ fn build_payload(info: &Value, results: &[(Check, bool)], overall: OverallResult .map(|(c, _)| c.to_wire()) .collect(); - payload.insert("healthy".into(), overall.is_healthy_top_level().into()); payload.insert("health".into(), Value::Array(health)); Value::Object(payload) @@ -266,33 +255,46 @@ mod tests { } #[test] - fn payload_all_pass_is_healthy() { + fn payload_all_pass() { let results = vec![pass("a"), pass("b")]; - let overall = - OverallResult::from_checks(&results.iter().map(|(c, _)| c.clone()).collect::>()); - let payload = build_payload(&Value::Object(Default::default()), &results, overall); - assert_eq!(payload["healthy"], true); + let payload = build_payload(&Value::Object(Default::default()), &results); + assert!(payload.get("healthy").is_none()); assert_eq!(payload["health"].as_array().unwrap().len(), 2); - assert!(payload["health"][0]["healthy"].as_bool().unwrap()); + assert_eq!(payload["health"][0]["result"], "passed"); } #[test] - fn payload_warning_keeps_top_healthy_but_check_unhealthy() { - let results = vec![pass("a"), warn("b")]; - let overall = - OverallResult::from_checks(&results.iter().map(|(c, _)| c.clone()).collect::>()); - let payload = build_payload(&Value::Object(Default::default()), &results, overall); - assert_eq!(payload["healthy"], true); - assert_eq!(payload["health"][1]["healthy"], false); + fn payload_per_check_results() { + let results = vec![pass("a"), warn("b"), fail("c")]; + let payload = build_payload(&Value::Object(Default::default()), &results); + assert_eq!(payload["health"][0]["result"], "passed"); + assert_eq!(payload["health"][1]["result"], "warning"); + assert_eq!(payload["health"][2]["result"], "failed"); } #[test] - fn payload_fail_flips_top_level() { - let results = vec![pass("a"), warn("b"), fail("c")]; - let overall = - OverallResult::from_checks(&results.iter().map(|(c, _)| c.clone()).collect::>()); - let payload = build_payload(&Value::Object(Default::default()), &results, overall); - assert_eq!(payload["healthy"], false); + fn overall_from_payload_tiers_on_results() { + let mk = |results: &[&str]| { + serde_json::json!({ + "health": results.iter().map(|r| serde_json::json!({"check": "x", "result": r})).collect::>(), + }) + }; + assert_eq!( + overall_from_payload(&mk(&["passed", "skipped"])), + OverallResult::Healthy + ); + assert_eq!( + overall_from_payload(&mk(&["passed", "warning"])), + OverallResult::Degraded + ); + assert_eq!( + overall_from_payload(&mk(&["passed", "broken"])), + OverallResult::Degraded + ); + assert_eq!( + overall_from_payload(&mk(&["warning", "failed"])), + OverallResult::Failing + ); } #[test] @@ -311,9 +313,7 @@ mod tests { serde_json::json!({"supervisor": "systemd", "expectations": []}), ); let results = vec![(check, true)]; - let overall = - OverallResult::from_checks(&results.iter().map(|(c, _)| c.clone()).collect::>()); - let payload = build_payload(&info_value, &results, overall); + let payload = build_payload(&info_value, &results); assert_eq!(payload["osTimezone"], "Pacific/Auckland"); // Lifted into the top level, alongside osTimezone. @@ -331,9 +331,7 @@ mod tests { (Check::pass("on", "ok"), true), (Check::pass("off", "ok"), false), ]; - let overall = - OverallResult::from_checks(&results.iter().map(|(c, _)| c.clone()).collect::>()); - let payload = build_payload(&Value::Object(Default::default()), &results, overall); + let payload = build_payload(&Value::Object(Default::default()), &results); let names: Vec<&str> = payload["health"] .as_array() .unwrap() @@ -344,15 +342,11 @@ mod tests { } #[test] - fn payload_skip_is_healthy_on_wire() { + fn payload_skip_result_on_wire() { // The whole point of distinguishing Skip from Fail/Warning is that // "we don't know" shouldn't fire alerts downstream of the wire format. let results = vec![pass("a"), skip("b")]; - let overall = - OverallResult::from_checks(&results.iter().map(|(c, _)| c.clone()).collect::>()); - let payload = build_payload(&Value::Object(Default::default()), &results, overall); - assert_eq!(payload["healthy"], true); - assert_eq!(payload["health"][1]["healthy"], true); - assert_eq!(payload["health"][1]["skipped"], true); + let payload = build_payload(&Value::Object(Default::default()), &results); + assert_eq!(payload["health"][1]["result"], "skipped"); } } diff --git a/crates/bestool/src/actions/tamanu/doctor.rs b/crates/bestool/src/actions/tamanu/doctor.rs index c01972b3..ca4c9972 100644 --- a/crates/bestool/src/actions/tamanu/doctor.rs +++ b/crates/bestool/src/actions/tamanu/doctor.rs @@ -527,6 +527,7 @@ fn write_check_line( CheckStatus::Skip(_) => colour_skip(use_colours, "SKIP"), CheckStatus::Warning(_) => colour_warn(use_colours, "WARN"), CheckStatus::Fail(_) => colour_fail(use_colours, "FAIL"), + CheckStatus::Broken(_) => colour_broken(use_colours, "BRKN"), }; writeln!( out, @@ -535,7 +536,11 @@ fn write_check_line( width = name_width, summary = check.summary, )?; - if let CheckStatus::Skip(r) | CheckStatus::Warning(r) | CheckStatus::Fail(r) = &check.status { + if let CheckStatus::Skip(r) + | CheckStatus::Warning(r) + | CheckStatus::Fail(r) + | CheckStatus::Broken(r) = &check.status + { let dim = if use_colours { format!("{}", r.dimmed()) } else { @@ -557,13 +562,14 @@ fn write_result_line( overall: OverallResult, use_colours: bool, ) -> std::io::Result<()> { - let (mut warnings, mut fails, mut skips) = (0usize, 0usize, 0usize); + let (mut warnings, mut fails, mut skips, mut brokens) = (0usize, 0usize, 0usize, 0usize); for (check, _) in results { match &check.status { CheckStatus::Pass => {} CheckStatus::Skip(_) => skips += 1, CheckStatus::Warning(_) => warnings += 1, CheckStatus::Fail(_) => fails += 1, + CheckStatus::Broken(_) => brokens += 1, } } let label = overall.label(); @@ -572,6 +578,11 @@ fn write_result_line( OverallResult::Degraded => colour_warn(use_colours, label), OverallResult::Failing => colour_fail(use_colours, label), }; + let broken_suffix = if brokens > 0 { + format!(", {brokens} broken") + } else { + String::new() + }; let skip_suffix = if skips > 0 { format!(", {skips} skipped") } else { @@ -579,7 +590,7 @@ fn write_result_line( }; writeln!( out, - "Result: {label_coloured} ({fails} failed, {warnings} warning{plural}{skip_suffix})", + "Result: {label_coloured} ({fails} failed, {warnings} warning{plural}{broken_suffix}{skip_suffix})", plural = if warnings == 1 { "" } else { "s" }, ) } @@ -708,6 +719,14 @@ fn colour_fail(use_colours: bool, s: &str) -> String { } } +fn colour_broken(use_colours: bool, s: &str) -> String { + if use_colours { + format!("{}", s.magenta().bold()) + } else { + s.to_string() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/bestool/src/canopy_contract.rs b/crates/bestool/src/canopy_contract.rs index 313c8128..b072811b 100644 --- a/crates/bestool/src/canopy_contract.rs +++ b/crates/bestool/src/canopy_contract.rs @@ -6,6 +6,11 @@ //! samples decode into bestool's types. These tests need network access, and //! fail honestly when live canopy doesn't (yet) serve an endpoint bestool //! depends on. +//! +//! All tests here are `#[ignore]`d so plain `cargo test` skips them; CI runs +//! them in a dedicated job (`cargo test -p bestool --lib canopy_contract -- +//! --ignored`) so a contract failure is clearly drift against live canopy +//! rather than a fault in bestool's own test suite. use std::collections::BTreeMap; @@ -109,6 +114,7 @@ fn response_schema(path: &str, method: &str) -> String { } #[tokio::test] +#[ignore = "live canopy contract test; run by the dedicated CI job"] async fn events_request_matches_spec() { let spec = spec().await; assert_operation_exists(spec, "/events", "post"); @@ -137,6 +143,7 @@ async fn events_request_matches_spec() { } #[tokio::test] +#[ignore = "live canopy contract test; run by the dedicated CI job"] async fn severity_vocabulary_matches_spec() { let spec = spec().await; let spec_levels: Vec<&str> = resolve(spec, "/components/schemas/Severity") @@ -183,17 +190,20 @@ async fn severity_vocabulary_matches_spec() { } #[tokio::test] +#[ignore = "live canopy contract test; run by the dedicated CI job"] async fn status_request_matches_spec() { let spec = spec().await; assert_operation_exists(spec, "/status/{server_id}", "post"); - // Representative sweep payload: the reserved `healthy`/`health` keys plus - // free-form extras, as posted by alertd's doctor task. + // Representative sweep payload: the reserved `health` key plus free-form + // extras, as posted by alertd's doctor task. let instance = json!({ - "healthy": false, "health": [ - {"check": "uptime", "healthy": true, "uptime_secs": 12345}, - {"check": "disk", "healthy": false, "free_percent": 3}, + {"check": "uptime", "result": "passed", "uptime_secs": 12345}, + {"check": "disk", "result": "failed", "free_percent": 3}, + {"check": "sync_lookup", "result": "broken"}, + {"check": "fhir_jobs", "result": "skipped"}, + {"check": "load", "result": "warning"}, ], "hostname": "test-host", "pg_version": "16.4", @@ -206,6 +216,7 @@ async fn status_request_matches_spec() { } #[tokio::test] +#[ignore = "live canopy contract test; run by the dedicated CI job"] async fn servers_probe_path_exists() { // `GET /servers` is the no-auth probe `CanopyClient` uses to detect the // tailscale path. @@ -213,6 +224,7 @@ async fn servers_probe_path_exists() { } #[tokio::test] +#[ignore = "live canopy contract test; run by the dedicated CI job"] async fn register_begin_matches_spec() { let spec = spec().await; assert_operation_exists(spec, "/servers/register/begin", "post"); @@ -241,6 +253,7 @@ async fn register_begin_matches_spec() { } #[tokio::test] +#[ignore = "live canopy contract test; run by the dedicated CI job"] async fn register_complete_matches_spec() { let spec = spec().await; assert_operation_exists(spec, "/servers/register/complete", "post"); @@ -273,6 +286,7 @@ async fn register_complete_matches_spec() { } #[tokio::test] +#[ignore = "live canopy contract test; run by the dedicated CI job"] async fn tags_response_matches_decode() { let spec = spec().await; assert_operation_exists(spec, "/tags", "get"); @@ -293,6 +307,7 @@ async fn tags_response_matches_decode() { } #[tokio::test] +#[ignore = "live canopy contract test; run by the dedicated CI job"] async fn snippets_response_matches_decode() { let spec = spec().await; assert_operation_exists(spec, "/bestool/snippets", "get"); @@ -308,6 +323,7 @@ async fn snippets_response_matches_decode() { } #[tokio::test] +#[ignore = "live canopy contract test; run by the dedicated CI job"] async fn artifacts_response_matches_decode() { let spec = spec().await; assert_operation_exists(spec, "/versions/{version}/artifacts", "get"); diff --git a/crates/canopy/src/client.rs b/crates/canopy/src/client.rs index 9d60d23c..1c69f028 100644 --- a/crates/canopy/src/client.rs +++ b/crates/canopy/src/client.rs @@ -275,7 +275,8 @@ impl CanopyClient { /// URL is used. In mTLS mode, posts to `{base_url}/status/{server_id}`. /// /// The payload is free-form JSON; the canopy `/status` contract reserves the - /// top-level `healthy: bool` and `health: []` keys. The body is gzip-encoded + /// top-level `health: []` key, whose entries each carry a `result` of + /// `passed | warning | failed | broken | skipped`. The body is gzip-encoded /// with `Content-Encoding: gzip`. pub async fn post_status( &self, @@ -634,7 +635,7 @@ fXLgamTYOa/w9n/Ta64fiYWmN54kEd0DgnflJDLtID321Zz6xswvK/VN use flate2::read::GzDecoder; use std::io::Read; - let original = br#"{"healthy":true,"health":[{"check":"x","healthy":true}]}"#; + let original = br#"{"health":[{"check":"x","result":"passed"}]}"#; let compressed = gzip_bytes(original).expect("gzip should succeed"); assert!( compressed.starts_with(&[0x1f, 0x8b]),