diff --git a/crates/core/src/bin/commands/auto_update.rs b/crates/core/src/bin/commands/auto_update.rs index f2bfea9ab8..8e8845af65 100644 --- a/crates/core/src/bin/commands/auto_update.rs +++ b/crates/core/src/bin/commands/auto_update.rs @@ -142,12 +142,41 @@ impl std::fmt::Display for UpdateNeededError { impl std::error::Error for UpdateNeededError {} +/// Sentinel error returned by [`get_latest_version`] when the global GitHub-poll +/// token bucket is empty. Distinct from a network/parse error so the caller can +/// tell "we deliberately skipped polling to bound load" apart from "GitHub was +/// unreachable" — the two must drive different behaviour (the latter may fall +/// back to a gateway-trust exit 42; the former must NOT, or local rate-limiting +/// would itself trigger the restart loop the limiter exists to prevent). +#[derive(Debug)] +struct RateLimitedError; + +impl std::fmt::Display for RateLimitedError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "GitHub update poll rate-limited (token bucket empty)") + } +} + +impl std::error::Error for RateLimitedError {} + /// Result of an update check attempt. #[derive(Debug, PartialEq)] pub enum UpdateCheckResult { /// Rate limited, too many failures, or no update available yet - will retry later. /// The caller should NOT clear the version mismatch flag (preserve it for retry). Skipped, + /// The global GitHub-poll rate limiter (#4073) denied this check: we did NOT + /// reach GitHub at all. Distinct from [`Skipped`] because the caller must NOT + /// treat it as a "GitHub says no update / unreachable" signal — in + /// particular it must NOT feed the legacy "max-backoff + 0 connections -> + /// exit 42" gateway-trust fallback, since the supervisor-side `freenet + /// update` shares the same empty bucket and would just exit "already up to + /// date", turning local rate-limiting into a restart loop. The caller should + /// do nothing and retry once the bucket refills (the version-mismatch flag is + /// preserved). + /// + /// [`Skipped`]: UpdateCheckResult::Skipped + RateLimited, /// Checked GitHub, newer version confirmed. /// The caller should clear the version mismatch flag. UpdateAvailable(String), @@ -221,12 +250,21 @@ pub async fn check_if_update_available(current_version: &str) -> UpdateCheckResu return UpdateCheckResult::Skipped; } - // Record that we're checking now - record_check_time(); + // NOTE: the last-check timestamp is recorded only when a GitHub poll is + // actually attempted (see the arms below), NOT here. A token-denied + // (rate-limited) poll must NOT advance `last_update_check`: if it did, the + // backoff gate above would short-circuit the next tick to a plain `Skipped` + // before reaching the `RateLimited` arm — and a `Skipped` at max backoff with + // 0 connections can still trigger the gateway-trust exit 42 (the loop the + // limiter exists to prevent), and the stagger path could wrongly enter its + // 24h cooldown. // Fetch latest version from GitHub match get_latest_version().await { Ok(latest) => { + // A real GitHub poll happened: record the check time so the backoff + // gate spaces out subsequent polls. + record_check_time(); let current = match Version::parse(current_version) { Ok(v) => v, Err(e) => { @@ -252,17 +290,21 @@ pub async fn check_if_update_available(current_version: &str) -> UpdateCheckResu }; if latest_ver > current { - // #4073 crash-loop auto-rollback: never trigger an exit-42 - // update to a version pinned known-bad on this node by a prior - // rollback. The installer would refuse it anyway; skipping here - // also avoids a pointless restart cycle. The mismatch flag is - // kept (Skipped) so a later, strictly-newer fixed release is - // still picked up. - if super::rollback::is_version_pinned_bad(&latest) { + // #4073: never trigger an exit-42 update to a version that is + // locally BLOCKED — either pinned known-bad by a prior crash-loop + // rollback, OR gated after repeatedly failing to install (checksum + // / signature / download / extract). In both cases the installer + // would refuse it anyway, so emitting exit 42 only produces a + // pointless restart cycle. The mismatch flag is kept (handled like + // the pin) so a later, strictly-newer fixed release is still + // picked up. + if super::rollback::is_version_pinned_bad(&latest) + || super::rollback::is_version_install_gated(&latest) + { tracing::warn!( version = %latest, - "Newer version is pinned known-bad after a crash-loop rollback; not \ - triggering auto-update to it (#4073)" + "Newer version is locally blocked (crash-loop known-bad pin or repeated \ + install failures); not triggering auto-update to it (#4073)" ); // Distinct result (NOT Skipped) so the caller clears the // driving signal and does not fall through to the legacy @@ -303,12 +345,28 @@ pub async fn check_if_update_available(current_version: &str) -> UpdateCheckResu UpdateCheckResult::Skipped } } + Err(e) if e.downcast_ref::().is_some() => { + // Our OWN rate limiter denied the poll — NOT a GitHub failure, and no + // network call was made. Deliberately do NOT record a check time or + // grow the backoff: recording would let the backoff gate mask this as + // a plain `Skipped` on the next tick (which can still trigger the + // gateway-trust exit 42 / the stagger cooldown). Return the distinct + // `RateLimited` so the caller does nothing and retries once the bucket + // refills. + tracing::debug!( + "Update check skipped: GitHub poll rate-limited; will retry when the token bucket refills" + ); + UpdateCheckResult::RateLimited + } Err(e) => { + // A real GitHub poll was attempted (token consumed) but failed + // (network/parse). Record the check time + grow backoff so we retry + // later rather than hammering on every tick. + record_check_time(); tracing::warn!( "Failed to check GitHub for updates: {}. Will retry with increased backoff.", e ); - // Network error - increase backoff and retry later increase_backoff(); UpdateCheckResult::Skipped } @@ -317,6 +375,17 @@ pub async fn check_if_update_available(current_version: &str) -> UpdateCheckResu /// Fetch the latest version string from GitHub releases API. async fn get_latest_version() -> Result { + // Global persistent rate limit (#4073): refuse to hit GitHub when the shared + // token bucket is empty. This is the in-node choke point (the node's startup + // check, the #4589 re-poll, and the peer-signal loop all reach GitHub through + // here); the supervisor-side `freenet update` is bounded by the same bucket + // at `get_latest_release`. A denied poll returns Err so the caller + // (`check_if_update_available`) treats it as `Skipped` and retries later — + // the node keeps running, it just does not poll GitHub this tick. + if !try_consume_node_poll() { + return Err(RateLimitedError.into()); + } + let client = reqwest::Client::builder() .user_agent("freenet-updater") .timeout(Duration::from_secs(10)) @@ -458,6 +527,228 @@ pub(crate) fn clear_update_failures_at(dir: &std::path::Path) { let _rm = fs::remove_file(dir.join("update_failures")); } +// ── Persistent GitHub-poll rate limit (token buckets) ────────────────────── +// +// Issue #4073 aggregate-load bounding: every GitHub release poll is gated by an +// on-disk token bucket so that no combination of restarts, peer signals, or +// repeated failed installs can make one node hammer the GitHub REST API. The +// buckets are persisted in `state_dir()` (not in memory) precisely so the fresh, +// short-lived `freenet update` process honours the limit across restarts: an +// in-memory limiter would reset to full on every relaunch and not bound a +// restart loop at all. +// +// There are TWO independent buckets, one per fetch path: +// * the NODE bucket gates the in-process `get_latest_version` (startup check, +// #4589 re-poll, peer-signal loop); +// * the INSTALL bucket gates the supervisor-invoked `get_latest_release` +// (`freenet update`). +// +// They are SEPARATE on purpose. The node always runs before the supervisor's +// `freenet update` in a restart cycle (it detects, exits 42, THEN the installer +// runs), so a single shared bucket would let the node win every token race and +// starve the installer — a low/refilling shared bucket could leave a legitimate +// update unable to actually fetch+install while the node keeps spending the lone +// refill token to re-confirm it. Two buckets bound each path independently and +// remove that ordering bias. +// +// Capacity / refill are tuned so NORMAL operation never trips a limit while a +// runaway loop is firmly capped: +// * Normal load is tiny: ~1 boot poll + the staggered re-poll (a few times a +// day) + the occasional real install — far below capacity, and each consumed +// token refills within ~10 min. +// * A runaway loop is bounded to ~1 token per [`GITHUB_POLL_REFILL_SECS`] once +// the initial capacity drains, i.e. ~6 GitHub calls/hour/node PER PATH +// regardless of restart rate. (The per-target-version install-failure gate +// normally stops the failed-install loop well before then.) + +/// Maximum number of GitHub release polls (per path) in a burst before the +/// refill rate takes over. Comfortably above a real detect→install burst and any +/// normal daily activity, yet far below any rate that would matter to GitHub. +pub(crate) const GITHUB_POLL_BUCKET_CAPACITY: f64 = 8.0; + +/// Refill interval: one token returns every ~10 minutes, so the sustained poll +/// rate of any loop is capped at ~6 GitHub REST calls/hour/node per path. +pub(crate) const GITHUB_POLL_REFILL_SECS: f64 = 600.0; + +/// On-disk bucket file for the in-process node poll path (`get_latest_version`). +const NODE_POLL_BUCKET_FILE: &str = "github_poll_bucket_node"; + +/// On-disk bucket file for the supervisor-side installer (`get_latest_release`). +/// Independent from the node bucket so the node cannot starve the installer. +const INSTALL_POLL_BUCKET_FILE: &str = "github_poll_bucket_install"; + +/// Token-bucket state persisted across processes. +#[derive(Debug, Clone, Copy, PartialEq)] +pub(crate) struct GithubPollBucket { + pub tokens: f64, + pub updated_unix: u64, +} + +/// How an on-disk bucket read resolved. Distinguishes a legitimately-absent +/// bucket (first boot) from a corrupt/torn one so they can be handled +/// differently: missing initialises to a full bucket (so the first real poll is +/// never blocked), whereas corrupt is treated conservatively (deny) so a torn +/// write cannot be used to reset the limiter to full. +#[derive(Debug, Clone, Copy, PartialEq)] +enum BucketRead { + Missing, + Corrupt, + Present(GithubPollBucket), +} + +/// Seconds since the Unix epoch (wall clock). The bucket only needs elapsed +/// real time between polls; tests inject `now_unix` directly into the pure +/// helpers below rather than relying on this. +fn now_unix() -> u64 { + SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0) +} + +/// Pure token-bucket step. Refills `prev` for the elapsed time, then attempts to +/// spend one token. `prev == None` means "no prior state" and starts from a full +/// bucket so a fresh node's first poll is always allowed. Returns the new state +/// and whether a token was spent (the poll is allowed). +fn github_poll_bucket_step( + prev: Option, + now_unix: u64, + capacity: f64, + refill_secs: f64, +) -> (GithubPollBucket, bool) { + // The stored timestamp must never move BACKWARDS. On a backwards clock + // (suspend/resume, NTP step) `saturating_sub` already prevents a negative + // refill THIS step — but if we then persisted the earlier `now_unix`, a later + // forward step back to the original time would measure elapsed from the + // rewound timestamp and grant refill credit for time that already elapsed + // before the rewind. Anchoring on `max(now, prev)` makes the bucket measure + // elapsed only from the highest timestamp ever seen, so a clock that dips and + // recovers yields zero net credit. + let stored_unix = match prev { + Some(s) => now_unix.max(s.updated_unix), + None => now_unix, + }; + let mut tokens = match prev { + Some(s) => { + let elapsed = now_unix.saturating_sub(s.updated_unix) as f64; + (s.tokens + elapsed / refill_secs).min(capacity) + } + None => capacity, + }; + let allowed = tokens >= 1.0; + if allowed { + tokens -= 1.0; + } + ( + GithubPollBucket { + tokens, + updated_unix: stored_unix, + }, + allowed, + ) +} + +fn read_github_poll_bucket_at(dir: &std::path::Path, file: &str) -> BucketRead { + match fs::read_to_string(dir.join(file)) { + Ok(raw) => { + let mut it = raw.split_whitespace(); + match ( + it.next().and_then(|t| t.parse::().ok()), + it.next().and_then(|t| t.parse::().ok()), + ) { + // Reject non-finite / negative token counts as corrupt: a NaN + // would make every comparison false and could be abused to + // bypass the limiter. + (Some(tokens), Some(updated_unix)) if tokens.is_finite() && tokens >= 0.0 => { + BucketRead::Present(GithubPollBucket { + tokens, + updated_unix, + }) + } + _ => BucketRead::Corrupt, + } + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => BucketRead::Missing, + // Any other read error (permissions, etc.) is treated as corrupt => + // deny, conservatively, rather than granting a free poll. + Err(_) => BucketRead::Corrupt, + } +} + +fn write_github_poll_bucket_at( + dir: &std::path::Path, + file: &str, + bucket: &GithubPollBucket, +) -> std::io::Result<()> { + fs::create_dir_all(dir)?; + fs::write( + dir.join(file), + format!("{} {}", bucket.tokens, bucket.updated_unix), + ) +} + +/// Try to consume one token from the named on-disk bucket in `dir`. +/// Returns `true` if a poll is permitted (token spent), `false` if rate-limited. +pub(crate) fn try_consume_github_poll_at(dir: &std::path::Path, file: &str, now_unix: u64) -> bool { + let prev = match read_github_poll_bucket_at(dir, file) { + BucketRead::Present(s) => Some(s), + BucketRead::Missing => None, + BucketRead::Corrupt => { + // Deny this poll, and (best-effort) rewrite an empty bucket dated now + // so the limiter self-heals (a token trickles back after the refill + // interval) without ever granting a free token from the corrupt + // state. This makes a corrupt/torn bucket fail closed; if the rewrite + // itself fails the next read stays corrupt and keeps denying. + if write_github_poll_bucket_at( + dir, + file, + &GithubPollBucket { + tokens: 0.0, + updated_unix: now_unix, + }, + ) + .is_err() + { + tracing::debug!("Could not reset corrupt GitHub-poll bucket; staying denied"); + } + return false; + } + }; + let (next, allowed) = github_poll_bucket_step( + prev, + now_unix, + GITHUB_POLL_BUCKET_CAPACITY, + GITHUB_POLL_REFILL_SECS, + ); + // FAIL CLOSED: only allow the poll if we BOTH had a token AND persisted the + // post-consume state. If the write fails (read-only / full state dir), a + // missing bucket would otherwise read as full on every restart and grant an + // unbounded burst — so a non-persistable consume must deny instead. + let persisted = write_github_poll_bucket_at(dir, file, &next).is_ok(); + allowed && persisted +} + +fn try_consume_poll_bucket(file: &str) -> bool { + // Deny when the state directory cannot be resolved — with nowhere to persist + // the bucket we cannot bound a loop, so denying is the safe choice (mirrors + // [`should_attempt_update`]'s `unwrap_or(false)`). + match state_dir() { + Some(dir) => try_consume_github_poll_at(&dir, file, now_unix()), + None => false, + } +} + +/// Try to consume one token from the NODE poll bucket (`get_latest_version`). +pub(crate) fn try_consume_node_poll() -> bool { + try_consume_poll_bucket(NODE_POLL_BUCKET_FILE) +} + +/// Try to consume one token from the INSTALL poll bucket (`get_latest_release`). +/// Independent of the node bucket so the node can never starve the installer. +pub(crate) fn try_consume_install_poll() -> bool { + try_consume_poll_bucket(INSTALL_POLL_BUCKET_FILE) +} + /// Check if we should attempt an update based on failure history. /// /// If the state directory cannot be resolved (e.g. Windows service @@ -995,6 +1286,363 @@ mod tests { ); } + #[test] + fn test_github_poll_bucket_allows_initial_burst_then_caps() { + // Fresh (missing) bucket starts full: the first CAPACITY polls at the + // same instant are allowed, the next is denied (token-bucket cap). + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + let now = 1_000_000u64; + + let cap = GITHUB_POLL_BUCKET_CAPACITY as u64; + for i in 0..cap { + assert!( + try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, now), + "poll {i} within capacity must be allowed" + ); + } + assert!( + !try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, now), + "poll beyond capacity at the same instant must be denied" + ); + } + + #[test] + fn test_github_poll_bucket_refills_over_time() { + // After draining, one token returns per refill interval (and no more). + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + let mut now = 2_000_000u64; + + let cap = GITHUB_POLL_BUCKET_CAPACITY as u64; + for _ in 0..cap { + assert!(try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, now)); + } + assert!( + !try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, now), + "drained" + ); + + // Half a refill interval: still not enough for a whole token. + now += (GITHUB_POLL_REFILL_SECS as u64) / 2; + assert!( + !try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, now), + "half a refill interval is < 1 token" + ); + + // A full refill interval from the last write: exactly one token back. + now += GITHUB_POLL_REFILL_SECS as u64; + assert!( + try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, now), + "one refill interval should grant exactly one token" + ); + assert!( + !try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, now), + "only one token should have refilled" + ); + } + + #[test] + fn test_github_poll_bucket_refill_is_capped_at_capacity() { + // A long idle period cannot accumulate more than CAPACITY tokens (no + // unbounded credit that would let a later burst exceed the cap). + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + let now = 3_000_000u64; + + // Seed an empty bucket, then jump far into the future. + write_github_poll_bucket_at( + dir, + NODE_POLL_BUCKET_FILE, + &GithubPollBucket { + tokens: 0.0, + updated_unix: now, + }, + ) + .unwrap(); + let far_future = now + (GITHUB_POLL_REFILL_SECS as u64) * 10_000; + + let cap = GITHUB_POLL_BUCKET_CAPACITY as u64; + for _ in 0..cap { + assert!(try_consume_github_poll_at( + dir, + NODE_POLL_BUCKET_FILE, + far_future + )); + } + assert!( + !try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, far_future), + "refill must be capped at CAPACITY regardless of idle time" + ); + } + + #[test] + fn test_github_poll_bucket_denies_on_corrupt_file() { + // A corrupt/unparseable bucket must FAIL CLOSED (deny) so a torn write + // cannot be used to reset the limiter to full and bypass it. + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + std::fs::write(dir.join(NODE_POLL_BUCKET_FILE), "not-a-bucket").unwrap(); + + assert!( + !try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, 4_000_000), + "corrupt bucket must deny the poll" + ); + + // And it self-heals to an empty-but-valid bucket: a poll one refill + // interval later is allowed again (never a free token from the corrupt + // state). + assert!(matches!( + read_github_poll_bucket_at(dir, NODE_POLL_BUCKET_FILE), + BucketRead::Present(_) + )); + assert!(try_consume_github_poll_at( + dir, + NODE_POLL_BUCKET_FILE, + 4_000_000 + GITHUB_POLL_REFILL_SECS as u64 + )); + } + + #[test] + fn test_github_poll_bucket_nan_and_negative_are_corrupt() { + // Non-finite / negative token counts must be rejected as corrupt rather + // than trusted (a NaN compares false everywhere and could bypass the cap). + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + for bad in ["NaN 100", "inf 100", "-1 100", "5", "5 notanint"] { + std::fs::write(dir.join(NODE_POLL_BUCKET_FILE), bad).unwrap(); + assert_eq!( + read_github_poll_bucket_at(dir, NODE_POLL_BUCKET_FILE), + BucketRead::Corrupt, + "{bad:?} must read as corrupt" + ); + } + } + + #[test] + fn test_github_poll_bucket_backwards_clock_gives_no_credit() { + // A backwards clock (suspend/resume, NTP step) must never credit tokens. + let prev = GithubPollBucket { + tokens: 0.0, + updated_unix: 5_000_000, + }; + let (next, allowed) = github_poll_bucket_step( + Some(prev), + 4_000_000, // earlier than updated_unix + GITHUB_POLL_BUCKET_CAPACITY, + GITHUB_POLL_REFILL_SECS, + ); + assert!( + !allowed, + "no token should be available after a backwards clock" + ); + assert_eq!(next.tokens, 0.0, "no negative-time credit"); + // Critically: the stored timestamp must NOT rewind, or a later forward + // step would grant credit for already-elapsed time (Codex finding). + assert_eq!( + next.updated_unix, 5_000_000, + "stored timestamp must not move backwards" + ); + } + + #[test] + fn test_github_poll_bucket_dip_then_recover_grants_no_credit() { + // End-to-end: drain the bucket at T, dip the clock back by a full refill + // interval (denied, no credit), then return to T. The recovery must NOT + // grant a refill token for the window that elapsed before the dip. + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + let t = 10_000_000u64; + + let cap = GITHUB_POLL_BUCKET_CAPACITY as u64; + for _ in 0..cap { + assert!(try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, t)); + } + assert!( + !try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, t), + "drained at T" + ); + + // Clock dips back a full refill interval. + assert!( + !try_consume_github_poll_at( + dir, + NODE_POLL_BUCKET_FILE, + t - GITHUB_POLL_REFILL_SECS as u64 + ), + "still empty during the backwards dip" + ); + // Clock returns to T: must NOT have been credited a token by the dip. + assert!( + !try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, t), + "returning to T must not grant credit for pre-dip time" + ); + // A genuine refill interval PAST the high-water mark does grant one token. + assert!(try_consume_github_poll_at( + dir, + NODE_POLL_BUCKET_FILE, + t + GITHUB_POLL_REFILL_SECS as u64 + )); + } + + #[test] + fn test_github_poll_bucket_missing_is_full_not_denied() { + // Regression guard: a legitimately-absent bucket (first boot) must NOT be + // treated like a corrupt one — the first real poll has to go through, or + // a fresh node could never detect an update. + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + assert_eq!( + read_github_poll_bucket_at(dir, NODE_POLL_BUCKET_FILE), + BucketRead::Missing + ); + assert!( + try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, 6_000_000), + "first poll on a fresh node must be allowed" + ); + } + + #[cfg(unix)] + #[test] + fn test_github_poll_bucket_fails_closed_when_unpersistable() { + // Codex P2: if the post-consume state cannot be persisted (read-only / + // full state dir), the poll must be DENIED — otherwise a missing bucket + // would read as full on every restart and grant an unbounded burst, + // failing open. Make the dir read-only so the write fails. + use std::os::unix::fs::PermissionsExt; + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + let orig = std::fs::metadata(dir).unwrap().permissions(); + std::fs::set_permissions(dir, std::fs::Permissions::from_mode(0o555)).unwrap(); + + let allowed = try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, 8_000_000); + + // Restore perms before asserting so tempdir cleanup always works. + std::fs::set_permissions(dir, orig).unwrap(); + assert!( + !allowed, + "an unpersistable consume must deny (fail closed), not allow" + ); + } + + #[test] + fn test_rate_limited_poll_does_not_record_check_time_or_grow_backoff() { + // Source pin (#4073 Codex): a token-denied poll must not advance the + // last-check timestamp or grow the backoff, or the backoff gate would + // mask `RateLimited` as a plain `Skipped` on the next tick (which can + // still trigger the gateway-trust exit 42). So: (a) record_check_time() + // must NOT be called unconditionally before the get_latest_version match, + // and (b) the RateLimited arm must call neither record_check_time nor + // increase_backoff. + let src = include_str!("auto_update.rs"); + let (_, body) = src + .split_once("pub async fn check_if_update_available(") + .expect("check_if_update_available not found"); + + // (a) Between the backoff gate and the match there must be no + // unconditional record_check_time(). + let gate = body + .find("if !should_check_for_update(current_backoff) {") + .expect("backoff gate not found"); + let match_pos = body + .find("match get_latest_version().await {") + .expect("get_latest_version match not found"); + let between = &body[gate..match_pos]; + // strip line comments so the explanatory NOTE mentioning the function name + // doesn't trip the check + let between_code: String = between + .lines() + .map(|l| l.split_once("//").map(|(c, _)| c).unwrap_or(l)) + .collect::>() + .join("\n"); + assert!( + !between_code.contains("record_check_time()"), + "record_check_time() must NOT run unconditionally before the poll — a \ + rate-limited poll would then advance the backoff gate and mask RateLimited" + ); + + // (b) The RateLimited arm must not record a check time or grow backoff. + let (_, rl_arm) = body + .split_once("e.downcast_ref::().is_some() =>") + .expect("RateLimited arm not found"); + let (rl_body, _) = rl_arm + .split_once("UpdateCheckResult::RateLimited") + .expect("RateLimited arm body not found"); + let rl_code: String = rl_body + .lines() + .map(|l| l.split_once("//").map(|(c, _)| c).unwrap_or(l)) + .collect::>() + .join("\n"); + assert!( + !rl_code.contains("record_check_time(") && !rl_code.contains("increase_backoff("), + "the RateLimited arm must not record a check time or grow backoff" + ); + } + + #[test] + fn test_get_latest_version_consults_rate_limit_bucket() { + // Source pin: the in-node GitHub fetch MUST gate on the persistent token + // bucket at its top, or the loop's GitHub spam is unbounded again (#4073). + let src = include_str!("auto_update.rs"); + let (_, body) = src + .split_once("async fn get_latest_version() -> Result {") + .expect("get_latest_version definition not found"); + let (head, _) = body + .split_once("reqwest::Client::builder()") + .expect("client builder not found"); + assert!( + head.contains("try_consume_node_poll()"), + "get_latest_version must consume a rate-limit token before hitting GitHub" + ); + } + + #[test] + fn test_node_and_install_buckets_are_independent() { + // Codex P1: the node and installer must NOT share a bucket. Draining the + // node bucket completely must leave the install bucket untouched, so the + // node (which always polls first in a restart cycle) can never starve the + // supervisor-side installer of the token it needs to actually update. + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + let now = 7_500_000u64; + + // Drain the node bucket to empty. + let cap = GITHUB_POLL_BUCKET_CAPACITY as u64; + for _ in 0..cap { + assert!(try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, now)); + } + assert!( + !try_consume_github_poll_at(dir, NODE_POLL_BUCKET_FILE, now), + "node bucket drained" + ); + + // The install bucket is still full — the installer is not starved. + for _ in 0..cap { + assert!( + try_consume_github_poll_at(dir, INSTALL_POLL_BUCKET_FILE, now), + "install bucket must be unaffected by node-bucket drain" + ); + } + } + + #[test] + fn test_get_latest_release_consults_install_bucket() { + // Source pin: the supervisor-side installer fetch must gate on the + // SEPARATE install bucket (not the node bucket), so the node cannot + // starve it (#4073 Codex P1). + let src = include_str!("update.rs"); + let (_, body) = src + .split_once("async fn get_latest_release(") + .expect("get_latest_release definition not found"); + let (head, _) = body + .split_once("reqwest::Client::builder()") + .expect("client builder not found"); + assert!( + head.contains("try_consume_install_poll()"), + "get_latest_release must consume an INSTALL-bucket token before hitting GitHub" + ); + } + #[test] fn test_backoff_constants() { // Verify backoff progression: 1m -> 2m -> 4m -> 8m -> 16m -> 32m -> 64m (capped to 60m) diff --git a/crates/core/src/bin/commands/rollback.rs b/crates/core/src/bin/commands/rollback.rs index 747dca327c..ac91444ee6 100644 --- a/crates/core/src/bin/commands/rollback.rs +++ b/crates/core/src/bin/commands/rollback.rs @@ -564,6 +564,122 @@ pub(crate) fn is_version_pinned_bad_at(dir: &Path, version: &str) -> bool { read_known_bad_at(dir).as_deref() == Some(version) } +// ── Per-target-version install-failure gate ──────────────────────────────── +// +// Distinct from the crash-loop known-bad pin above: the pin fires when an +// already-INSTALLED version crash-loops, whereas this gate fires when a version +// repeatedly FAILS TO INSTALL (checksum / signature / download / extract). The +// #4586 fail-closed checksum gate turned a bad-manifest install from a +// self-terminating failure into a NON-counting one (it is classified +// `OtherFailure` => `NoChange`, so it never trips the #3934 lockout), which left +// the node free to loop: detect newer X → exit 42 → `freenet update` → install +// fails the gate → no install → re-detect → exit 42 → … forever. +// +// This gate breaks that loop. Each failed install of a target version increments +// a persisted per-version counter; once the SAME version has failed +// [`INSTALL_FAILURE_GATE_THRESHOLD`] times it is gated, and both the node's +// update detection and the installer stop acting on it until a STRICTLY-NEWER +// version appears. Like the pin, the gate is keyed by exact version string, so a +// newer release (a fix) never matches and installs normally. +// +// Degrade-safe (NOT fail-closed like the rate-limit bucket): a missing or +// corrupt gate file reads as "not gated". Treating a corrupt file as "gate +// everything" could brick auto-update entirely (we would not know which version +// to exempt), so the conservative choice here is the opposite of the bucket's — +// the GitHub-spam dimension is already bounded by the rate-limit bucket, and +// atomic tmp+rename writes make corruption unlikely in the first place. + +/// Consecutive failed installs of the SAME target version before that version is +/// gated out of the node's update detection and the installer. Mirrors the spirit +/// of [`ROLLBACK_CRASH_THRESHOLD`] (three confirmations: one is noise, two could +/// be unlucky, by the third the version is demonstrably not installable here). +pub(crate) const INSTALL_FAILURE_GATE_THRESHOLD: u32 = 3; + +/// Per-target-version install-failure record (JSON: version + consecutive count). +const INSTALL_FAILURES_FILE: &str = "install_failures.json"; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) struct InstallFailureState { + /// The target version whose install has been failing. + pub version: String, + /// Consecutive failed installs of that version. + pub count: u32, +} + +pub(crate) fn read_install_failures_at(dir: &Path) -> Option { + let raw = std::fs::read_to_string(dir.join(INSTALL_FAILURES_FILE)).ok()?; + serde_json::from_str(&raw).ok() +} + +fn write_install_failures_at(dir: &Path, state: &InstallFailureState) -> Result<()> { + let raw = serde_json::to_vec(state).context("serialize install-failure state")?; + atomic_write(&dir.join(INSTALL_FAILURES_FILE), &raw) +} + +fn clear_install_failures_at(dir: &Path) { + let _rm = std::fs::remove_file(dir.join(INSTALL_FAILURES_FILE)); +} + +/// Record one failed install of `version`. If the record is for a different +/// version (e.g. a newer release became the target), it RESETS to track the new +/// version with a count of 1 — so an old gated version never blocks a new one, +/// and a transient failure of a new version starts its own count. +pub(crate) fn record_install_failure_at(dir: &Path, version: &str) { + let next = match read_install_failures_at(dir) { + Some(prev) if prev.version == version => InstallFailureState { + version: version.to_string(), + count: prev.count.saturating_add(1), + }, + _ => InstallFailureState { + version: version.to_string(), + count: 1, + }, + }; + if let Err(e) = write_install_failures_at(dir, &next) { + // Best-effort: if we cannot persist the counter the gate simply will not + // engage for this version, and the rate-limit bucket still bounds the + // GitHub load. Surface it for diagnosis rather than failing the update. + tracing::warn!( + version, + error = %e, + "Failed to persist per-version install-failure counter (#4073)" + ); + } +} + +/// Record one failed install of `version` against the shared state directory. +pub fn record_install_failure(version: &str) { + if let Some(dir) = state_dir() { + record_install_failure_at(&dir, version); + } +} + +/// Clear the install-failure counter (called after a successful install / when +/// the node is confirmed already up to date — we have moved forward). +pub fn clear_install_failures() { + if let Some(dir) = state_dir() { + clear_install_failures_at(&dir); + } +} + +/// Whether `version` is currently gated by repeated install failures: the stored +/// record is for this exact version AND has reached the threshold. A +/// strictly-newer version never matches (different string), so a fix is never +/// blocked. Degrade-safe: missing/corrupt record => not gated. +pub(crate) fn is_version_install_gated_at(dir: &Path, version: &str) -> bool { + match read_install_failures_at(dir) { + Some(state) => state.version == version && state.count >= INSTALL_FAILURE_GATE_THRESHOLD, + None => false, + } +} + +/// Whether `version` is install-gated against the shared state directory. +pub fn is_version_install_gated(version: &str) -> bool { + state_dir() + .map(|d| is_version_install_gated_at(d.as_path(), version)) + .unwrap_or(false) +} + // ── Post-stop crash classification / handling / rollback ─────────────────── /// Read the raw post-stop status string the supervisor forwarded, if any. May @@ -1306,6 +1422,144 @@ mod tests { ); } + #[test] + fn install_gate_engages_after_threshold_failures_of_same_version() { + // Core #4073 regression: repeated failed installs of the SAME version + // must, after the threshold, gate that version out of update detection — + // this is what bounds the detect → exit 42 → failed install → restart + // loop a bad manifest/checksum would otherwise sustain forever. + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + + for n in 1..INSTALL_FAILURE_GATE_THRESHOLD { + record_install_failure_at(dir, "0.2.90"); + assert!( + !is_version_install_gated_at(dir, "0.2.90"), + "below threshold ({n}) must not gate yet" + ); + } + record_install_failure_at(dir, "0.2.90"); + assert!( + is_version_install_gated_at(dir, "0.2.90"), + "threshold reached: version must be gated" + ); + } + + #[test] + fn install_gate_allows_strictly_newer_version() { + // A gated version must NOT block a different (newer) release — the fix. + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + for _ in 0..INSTALL_FAILURE_GATE_THRESHOLD { + record_install_failure_at(dir, "0.2.90"); + } + assert!(is_version_install_gated_at(dir, "0.2.90")); + assert!( + !is_version_install_gated_at(dir, "0.2.91"), + "a newer version must not be gated by an older version's failures" + ); + } + + #[test] + fn install_gate_resets_when_target_version_changes() { + // If a newer release becomes the failing target, the counter resets to + // track it (count 1), so the old version's accumulated failures don't + // instantly gate the new one. + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + for _ in 0..INSTALL_FAILURE_GATE_THRESHOLD { + record_install_failure_at(dir, "0.2.90"); + } + assert!(is_version_install_gated_at(dir, "0.2.90")); + + record_install_failure_at(dir, "0.2.91"); + let state = read_install_failures_at(dir).unwrap(); + assert_eq!(state.version, "0.2.91"); + assert_eq!(state.count, 1, "new target starts a fresh count"); + assert!(!is_version_install_gated_at(dir, "0.2.91")); + // The old version is no longer tracked, so it is no longer gated either. + assert!(!is_version_install_gated_at(dir, "0.2.90")); + } + + #[test] + fn install_gate_cleared_on_success() { + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + for _ in 0..INSTALL_FAILURE_GATE_THRESHOLD { + record_install_failure_at(dir, "0.2.90"); + } + assert!(is_version_install_gated_at(dir, "0.2.90")); + + clear_install_failures_at(dir); + assert!(read_install_failures_at(dir).is_none()); + assert!(!is_version_install_gated_at(dir, "0.2.90")); + // Clearing an already-clear counter is idempotent. + clear_install_failures_at(dir); + assert!(!is_version_install_gated_at(dir, "0.2.90")); + } + + #[test] + fn install_gate_degrades_safe_on_missing_or_corrupt() { + // Degrade-safe (NOT fail-closed): a missing or corrupt record reads as + // "not gated" so a torn file can never brick auto-update entirely. + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + + // Missing. + assert!(!is_version_install_gated_at(dir, "0.2.90")); + assert!(read_install_failures_at(dir).is_none()); + + // Corrupt. + std::fs::write(dir.join(INSTALL_FAILURES_FILE), "{not valid json").unwrap(); + assert!(read_install_failures_at(dir).is_none()); + assert!(!is_version_install_gated_at(dir, "0.2.90")); + } + + #[test] + fn install_gate_uses_atomic_write_no_temp_left_behind() { + // Writes go through atomic_write (tmp + rename); no stray temp remains. + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + record_install_failure_at(dir, "0.2.90"); + assert!(read_install_failures_at(dir).is_some()); + let leftover_tmp = dir.join(format!(".{INSTALL_FAILURES_FILE}.tmp")); + assert!( + !leftover_tmp.exists(), + "atomic_write must not leave a temp file behind" + ); + } + + #[test] + fn install_failure_loop_is_bounded_by_the_gate() { + // End-to-end bound: simulate the failed-install loop. Each cycle the node + // would detect X and the supervisor's `freenet update` would fail to + // install X (recording a failure). After at most + // INSTALL_FAILURE_GATE_THRESHOLD cycles the node's detection is gated and + // stops emitting exit 42 for X — so the loop cannot run unbounded. + let tmp = tempfile::tempdir().unwrap(); + let dir = tmp.path(); + + let mut emitted_exit_42 = 0u32; + for _cycle in 0..1000 { + // Node detection: would it emit exit 42 for X this cycle? + if is_version_install_gated_at(dir, "0.2.90") { + break; // gated -> node stays put, loop is broken + } + emitted_exit_42 += 1; + // Supervisor runs `freenet update`, install fails -> record. + record_install_failure_at(dir, "0.2.90"); + } + + assert!( + is_version_install_gated_at(dir, "0.2.90"), + "the loop must end with the version gated" + ); + assert_eq!( + emitted_exit_42, INSTALL_FAILURE_GATE_THRESHOLD, + "node must stop emitting exit 42 after exactly the threshold cycles" + ); + } + #[test] fn capture_known_good_records_real_hash() { let tmp = tempfile::tempdir().unwrap(); diff --git a/crates/core/src/bin/commands/service.rs b/crates/core/src/bin/commands/service.rs index 777d77e3b6..c57443d68a 100644 --- a/crates/core/src/bin/commands/service.rs +++ b/crates/core/src/bin/commands/service.rs @@ -1436,6 +1436,75 @@ mod tests { ); } + /// Regression for #4073 (aggregate-load bounding): the macOS wrapper's + /// `while true` loop must give up after a consecutive-failure cap, so a + /// committed version that crash-loops (or an update that never succeeds) does + /// not restart and poll GitHub forever. Mirrors the in-process run-wrapper's + /// WRAPPER_MAX_CONSECUTIVE_FAILURES cap. + #[test] + #[cfg(target_os = "macos")] + fn test_macos_wrapper_caps_consecutive_failures() { + let binary_path = PathBuf::from("/usr/local/bin/freenet"); + let script = generate_wrapper_script(&binary_path); + + assert!( + script.contains("MAX_CONSECUTIVE_FAILURES=50"), + "wrapper must define a consecutive-failure cap" + ); + assert!( + script.contains("give_up_if_failing"), + "wrapper must call the give-up helper to exit the loop on too many failures" + ); + // The helper must actually exit (terminate the loop), not just log. + // It MUST exit 0: the plist sets KeepAlive.SuccessfulExit=false, so a + // non-zero exit would be respawned by launchd (resetting the counter), + // defeating the cap. Only a clean exit 0 is an intentional terminal stop. + let helper_idx = script + .find("give_up_if_failing() {") + .expect("give_up_if_failing helper must be defined"); + let helper_body = &script[helper_idx..]; + let brace_end = helper_body.find("}").expect("helper body"); + let helper = &helper_body[..brace_end]; + assert!( + helper.contains("exit 0"), + "give_up_if_failing must exit 0 so launchd (SuccessfulExit=false) does not respawn" + ); + assert!( + !helper.contains("exit 1"), + "give_up_if_failing must NOT exit non-zero — launchd would respawn and reset the counter" + ); + // The cap must sit above the crash-loop rollback threshold so rollback + // always fires first. + assert!( + super::super::rollback::ROLLBACK_CRASH_THRESHOLD < 50, + "wrapper cap must exceed the rollback crash threshold" + ); + // The cap must target a tight crash LOOP, not occasional crashes: a child + // that ran healthily long enough resets the streak so it never + // accumulates to the cap over a long lifetime. + assert!( + script.contains("MIN_HEALTHY_RUNTIME=300"), + "wrapper must define a healthy-runtime threshold" + ); + assert!( + script.contains("CHILD_RUNTIME") && script.contains("CONSECUTIVE_FAILURES=0"), + "wrapper must reset the consecutive-failure streak after a healthy run" + ); + // A benign updater no-op (exit 2 = already up to date / rate-limited / + // pinned / install-gated) on the exit-42 path must NOT count toward the + // cap. The script captures UPDATE_RC and compares against the injected + // EXIT_CODE_ALREADY_UP_TO_DATE (2). + assert!( + script.contains("UPDATE_RC=$?") + && script.contains(&format!( + "UPDATE_RC\" -eq {}", + super::super::update::EXIT_CODE_ALREADY_UP_TO_DATE + )), + "exit-42 path must treat the updater's already-up-to-date/rate-limited \ + exit as a benign no-op, not a counted failure" + ); + } + /// Regression for issue #3967: on exit 43 the wrapper must self-heal a /// STALE ORPHAN holding the service port instead of unconditionally /// standing down. Standing down (`exit 0`) under launchd diff --git a/crates/core/src/bin/commands/service/linux.rs b/crates/core/src/bin/commands/service/linux.rs index 4dd6ca5c7e..4de245466b 100644 --- a/crates/core/src/bin/commands/service/linux.rs +++ b/crates/core/src/bin/commands/service/linux.rs @@ -332,6 +332,33 @@ Restart=always # Wait 10 seconds before restart to avoid rapid restart loops. The actual # crash-loop cap (StartLimit*) lives in the [Unit] section above (#4551). RestartSec=10 +# Restart backoff (#4073): grow the inter-restart delay from RestartSec up to +# RestartMaxDelaySec over RestartSteps restarts, so a residual crash/exit loop +# slows down (=> up to 300s between restarts) instead of reconnecting to the +# gateway every ~10s -- gentler on the gateways, and it keeps the node trying for +# a fix instead of hammering. +# +# DELIBERATE tradeoff with the #4551 crash-loop limiter (StartLimitBurst=5 / +# StartLimitIntervalSec=120, StartLimitAction=none in [Unit]). The growing delay +# changes WHICH crashes reach #4551's terminal "failed-state stop": +# * A TRUE fast crash (sub-second startup wedge) still clusters its first ~5 +# restarts well inside the 120s window while the delay is still small, so it +# STILL trips StartLimitBurst and the unit terminal-stops. The #4551 +# brick-loop guarantee is preserved for the case it was designed for. +# * A MEDIUM-runtime crash (node dies tens of seconds in) can, once the delay +# has grown toward 300s, be spaced PAST the 120s burst window, so it no +# longer trips StartLimitBurst and instead slow-flaps at <=300s rather than +# terminal-stopping. This is accepted: such a node is not a tight brick-loop, +# the slow cadence is easy on the gateways, and continuing to restart lets a +# later fixed release (via the ExecStopPost `freenet update` below) heal it. +# A bad UPDATE that crash-loops is still disarmed within its probation window +# by the #4073/#4591 auto-rollback regardless of this cadence. +# +# These directives require systemd >= 254; OLDER systemd silently IGNORES unknown +# directives (they are not parse errors), so the unit degrades gracefully to the +# fixed RestartSec=10 above (and #4551's limiter then behaves exactly as before). +RestartSteps=10 +RestartMaxDelaySec=300 # Allow 45 seconds for graceful shutdown before SIGKILL. # The node handles SIGTERM by (1) waiting up to `shutdown-drain-secs` # (default 30s) for in-flight client PUT/GET/UPDATE/SUBSCRIBE drivers @@ -463,6 +490,33 @@ Restart=always # Wait 10 seconds before restart to avoid rapid restart loops. The actual # crash-loop cap (StartLimit*) lives in the [Unit] section above (#4551). RestartSec=10 +# Restart backoff (#4073): grow the inter-restart delay from RestartSec up to +# RestartMaxDelaySec over RestartSteps restarts, so a residual crash/exit loop +# slows down (=> up to 300s between restarts) instead of reconnecting to the +# gateway every ~10s -- gentler on the gateways, and it keeps the node trying for +# a fix instead of hammering. +# +# DELIBERATE tradeoff with the #4551 crash-loop limiter (StartLimitBurst=5 / +# StartLimitIntervalSec=120, StartLimitAction=none in [Unit]). The growing delay +# changes WHICH crashes reach #4551's terminal "failed-state stop": +# * A TRUE fast crash (sub-second startup wedge) still clusters its first ~5 +# restarts well inside the 120s window while the delay is still small, so it +# STILL trips StartLimitBurst and the unit terminal-stops. The #4551 +# brick-loop guarantee is preserved for the case it was designed for. +# * A MEDIUM-runtime crash (node dies tens of seconds in) can, once the delay +# has grown toward 300s, be spaced PAST the 120s burst window, so it no +# longer trips StartLimitBurst and instead slow-flaps at <=300s rather than +# terminal-stopping. This is accepted: such a node is not a tight brick-loop, +# the slow cadence is easy on the gateways, and continuing to restart lets a +# later fixed release (via the ExecStopPost `freenet update` below) heal it. +# A bad UPDATE that crash-loops is still disarmed within its probation window +# by the #4073/#4591 auto-rollback regardless of this cadence. +# +# These directives require systemd >= 254; OLDER systemd silently IGNORES unknown +# directives (they are not parse errors), so the unit degrades gracefully to the +# fixed RestartSec=10 above (and #4551's limiter then behaves exactly as before). +RestartSteps=10 +RestartMaxDelaySec=300 # Allow 45 seconds for graceful shutdown before SIGKILL. # The node handles SIGTERM by (1) waiting up to `shutdown-drain-secs` # (default 30s) for in-flight client PUT/GET/UPDATE/SUBSCRIBE drivers @@ -747,6 +801,42 @@ mod tests { assert_start_limit_directives_are_in_unit_section("system", &unit); } + #[test] + fn systemd_units_have_restart_backoff() { + // #4073: both units must add escalating restart backoff + // (RestartSteps + RestartMaxDelaySec) on top of RestartSec so a residual + // loop that doesn't trip StartLimitBurst still slows down (fewer gateway + // reconnects). These are systemd >= 254 directives that older systemd + // ignores; we only assert they are emitted in [Service]. + let user_unit = generate_user_service_file( + Path::new("/usr/local/bin/freenet"), + Path::new("/home/test/.local/state/freenet"), + ); + let system_unit = generate_system_service_file( + Path::new("/usr/local/bin/freenet"), + Path::new("/home/test/.local/state/freenet"), + "testuser", + Path::new("/home/test"), + ); + for (name, unit) in [("user", &user_unit), ("system", &system_unit)] { + let service = section(unit, "Service"); + assert!( + service.lines().any(|l| l.trim() == "RestartSec=10"), + "{name} unit must keep the base RestartSec" + ); + assert!( + service.lines().any(|l| l.trim() == "RestartSteps=10"), + "{name} unit must emit RestartSteps in [Service]" + ); + assert!( + service + .lines() + .any(|l| l.trim() == "RestartMaxDelaySec=300"), + "{name} unit must emit RestartMaxDelaySec in [Service]" + ); + } + } + #[test] fn systemd_units_keep_auto_update_success_exit_status() { let user_unit = generate_user_service_file( diff --git a/crates/core/src/bin/commands/service/macos.rs b/crates/core/src/bin/commands/service/macos.rs index bc48270393..9556929c6c 100644 --- a/crates/core/src/bin/commands/service/macos.rs +++ b/crates/core/src/bin/commands/service/macos.rs @@ -142,6 +142,24 @@ export {supervised_env}=1 BACKOFF=10 # Initial backoff in seconds MAX_BACKOFF=300 # Maximum backoff (5 minutes) CONSECUTIVE_FAILURES=0 +# Give up (exit the wrapper) after this many consecutive failures, so a +# persistently-failing node — a committed version that crash-loops, or an update +# that never succeeds — does not thrash and poll GitHub forever. Mirrors the +# in-process run-wrapper's WRAPPER_MAX_CONSECUTIVE_FAILURES cap (service.rs) and +# stays well above the #4073 crash-loop rollback threshold (3) so rollback always +# fires first. The give-up path exits 0 (NOT non-zero): the plist sets +# KeepAlive.SuccessfulExit=false, so launchd RESPAWNS on a non-zero exit and only +# treats a clean exit 0 as an intentional terminal stop (same reasoning as the +# exit-0/exit-43 paths below). A non-zero exit here would just relaunch a fresh +# wrapper with CONSECUTIVE_FAILURES reset to 0, defeating the cap entirely. +MAX_CONSECUTIVE_FAILURES=50 +# A child that ran healthily for at least this long before exiting is treated as +# "made progress": its failure does NOT count toward the consecutive cap. This +# keeps the cap aimed at a tight CRASH LOOP (repeated fast failures) rather than +# at a node that runs fine for a long time and then crashes occasionally — the +# latter must keep being restarted, not eventually give up. Comfortably past the +# #4073 60s commit window and the 300s max backoff. +MIN_HEALTHY_RUNTIME=300 PORT_CONFLICT_KILLS=0 MAX_PORT_CONFLICT_KILLS=3 # Give up after this many kill attempts @@ -157,6 +175,18 @@ log_event() {{ logger -t freenet "$1" }} +# Exit the wrapper loop once consecutive failures hit the cap, so a node that +# never comes up healthy stops restarting (and stops polling GitHub) instead of +# looping forever. Called right after each failure increment. Exits 0 so launchd +# (KeepAlive.SuccessfulExit=false) treats it as an intentional stop and does NOT +# respawn — a non-zero exit would be respawned and reset the counter. +give_up_if_failing() {{ + if [ "$CONSECUTIVE_FAILURES" -ge "$MAX_CONSECUTIVE_FAILURES" ]; then + log_event "Giving up after $CONSECUTIVE_FAILURES consecutive failures; stopping wrapper for operator intervention (clean exit so launchd does not respawn)." + exit 0 + fi +}} + # Print a binary's full version identity line, e.g. # Freenet version: 0.2.71 (abc1234) # Bounded by a 5s timeout so a wedged binary can't stall the whole self-heal @@ -285,6 +315,7 @@ while true; do # exit-43 self-heal path avoid mistaking our just-exited child for a # stale orphan still holding the port, and lets the TERM trap above forward # launchd's stop signal to the node for a graceful drain. + CHILD_START=$(date +%s) "{binary}" network 2>"$HOME/Library/Logs/freenet/freenet.error.log.last" & WRAPPER_CHILD_PID=$! # `wait` is interrupted by the trapped TERM; re-wait so we collect the @@ -295,6 +326,13 @@ while true; do wait $WRAPPER_CHILD_PID EXIT_CODE=$? done + # How long the child ran this cycle. A long healthy run before a later + # non-zero exit clears the consecutive-failure streak (see give_up_if_failing + # / MIN_HEALTHY_RUNTIME) so only a tight crash loop trips the cap. + CHILD_RUNTIME=$(( $(date +%s) - CHILD_START )) + if [ "$CHILD_RUNTIME" -ge "$MIN_HEALTHY_RUNTIME" ]; then + CONSECUTIVE_FAILURES=0 + fi if [ $EXIT_CODE -eq 42 ]; then log_event "Update needed, running freenet update..." @@ -308,7 +346,21 @@ while true; do BACKOFF=10 sleep 2 else + UPDATE_RC=$? + if [ "$UPDATE_RC" -eq {already_up_to_date} ]; then + # Benign no-op: `freenet update` exited "already up to date" — no + # update was performed because we are already current, the target + # is rate-limited, pinned known-bad, or install-gated (#4073). + # This is NOT a failure, so it must NOT count toward the give-up + # cap; just restart the same binary after a short pause. + log_event "Update reported nothing to do (exit $UPDATE_RC); restarting without counting a failure." + PORT_CONFLICT_KILLS=0 + BACKOFF=10 + sleep 2 + continue + fi CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1)) + give_up_if_failing log_event "Update failed (attempt $CONSECUTIVE_FAILURES), backing off $BACKOFF seconds..." sleep $BACKOFF BACKOFF=$((BACKOFF * 2)) @@ -364,6 +416,7 @@ while true; do fi fi CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1)) + give_up_if_failing PORT_CONFLICT_KILLS=0 log_event "Exited with code $EXIT_CODE, restarting after backoff..." sleep $BACKOFF @@ -375,6 +428,7 @@ done binary = binary_path.display(), supervised_env = super::super::auto_update::SUPERVISED_ENV_VAR, post_stop_env = super::super::rollback::POST_STOP_EXIT_CODE_ENV_VAR, + already_up_to_date = super::super::update::EXIT_CODE_ALREADY_UP_TO_DATE, ) } diff --git a/crates/core/src/bin/commands/update.rs b/crates/core/src/bin/commands/update.rs index fb1c337138..566762c494 100644 --- a/crates/core/src/bin/commands/update.rs +++ b/crates/core/src/bin/commands/update.rs @@ -174,6 +174,58 @@ pub fn macos_dmg_asset_name(tag_name: &str) -> String { format!("Freenet-{}.dmg", version) } +/// Deterministic "this release's artifacts do not verify" error: a checksum +/// MISMATCH or an invalid/wrong-length release signature. +/// +/// Distinct from TRANSIENT failures (GitHub/download outage, a checksum manifest +/// that is still propagating, extraction/disk errors) so the per-target-version +/// install-failure gate (#4073) engages ONLY for a genuinely bad / un-installable +/// release — never for a flaky network that will succeed once it recovers. +/// Gating on transient failures would permanently suppress a perfectly good +/// release after a brief outage (Codex finding). +#[derive(Debug)] +struct ReleaseVerificationError(String); + +impl std::fmt::Display for ReleaseVerificationError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::error::Error for ReleaseVerificationError {} + +/// Whether `err` is a deterministic release-verification failure (checksum / +/// signature), i.e. a "bad release" signal that should count toward the +/// install-failure gate. Checks the top-level error type; the verification call +/// sites deliberately propagate it without wrapping `.context()` so this +/// downcast stays reliable. +fn is_release_verification_failure(err: &anyhow::Error) -> bool { + err.downcast_ref::().is_some() +} + +/// Outcome of [`UpdateCommand::download_and_install`] that the caller needs to +/// drive the per-target-version install-failure gate (#4073). A plain `Ok(())` +/// is ambiguous on macOS: the bundle path returns `Ok(())` BOTH on a real +/// install (handled by exiting the process) and on a DMG-swap *failure* that is +/// deliberately swallowed to avoid corrupting the signed bundle. The caller must +/// not treat the latter as a success (which would clear the gate and let a +/// failing DMG retry forever), so we distinguish them explicitly. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum InstallOutcome { + /// A new binary/bundle was actually installed: clear the install-failure gate. + Installed, + /// The macOS DMG-swap failed and was swallowed (return `Ok` to protect the + /// signed bundle) — NO update was installed. `verification_failure` is true + /// when the cause was a deterministic DMG checksum/signature failure (counts + /// toward the gate) and false for a transient failure (leaves the gate + /// unchanged so a flaky network does not gate a good release). + /// + /// Only constructed on macOS; the variant exists on all targets so the + /// caller's match is uniform. + #[cfg_attr(not(target_os = "macos"), allow(dead_code))] + BundleSkipped { verification_failure: bool }, +} + #[derive(Args, Debug, Clone)] pub struct UpdateCommand { /// Only check if an update is available without installing @@ -265,7 +317,7 @@ impl UpdateCommand { println!("Checking for updates..."); } - let latest = get_latest_release().await?; + let latest = get_latest_release(self.force, self.quiet).await?; let latest_version = latest.tag_name.trim_start_matches('v'); if !self.quiet { @@ -292,6 +344,10 @@ impl UpdateCommand { // take this `AlreadyUpToDate` branch without ever touching // the counter (skeptical-review H1 on PR #3941). super::auto_update::clear_update_failures(); + // We are at (or ahead of) the latest release, so there is no failing + // target install to gate anymore — clear the per-version + // install-failure counter too (#4073). + super::rollback::clear_install_failures(); // Exit with a distinct code so the service wrapper knows no update // was performed and can skip the unnecessary restart. std::process::exit(EXIT_CODE_ALREADY_UP_TO_DATE); @@ -307,21 +363,29 @@ impl UpdateCommand { return Ok(()); } - // #4073 crash-loop auto-rollback: refuse to (re-)install a version that - // a prior crash-loop rollback pinned as known-bad on THIS node, unless - // the operator explicitly forces it. This is what stops the - // update → crash → revert → re-update loop. A strictly-newer release (a - // fix) is not pinned and installs normally. - if !self.force && rollback::is_version_pinned_bad(latest_version) { + // #4073: refuse to (re-)install a version that is locally BLOCKED, unless + // the operator explicitly forces it. Two independent reasons: + // * crash-loop known-bad pin — the version installed and then + // crash-looped; this stops update → crash → revert → re-update. + // * install-failure gate — the version has failed to install + // (checksum / signature / download / extract) too many times; this + // stops detect → exit 42 → failed install → restart → re-detect. + // A strictly-newer release (a fix) matches neither and installs normally. + if !self.force + && (rollback::is_version_pinned_bad(latest_version) + || rollback::is_version_install_gated(latest_version)) + { if !self.quiet { println!( - "Version {latest_version} is pinned as known-bad on this node after a previous \ - crash-loop; refusing to re-install it. Run `freenet update --force` to override." + "Version {latest_version} is locally blocked on this node (crash-loop known-bad \ + pin or repeated install failures); refusing to (re-)install it. Run \ + `freenet update --force` to override." ); } tracing::warn!( version = latest_version, - "Refusing to install a version pinned known-bad after crash-loop rollback (#4073)" + "Refusing to install a locally-blocked version (known-bad pin or install-failure \ + gate) (#4073)" ); // Behave like 'already up to date' so a supervisor does not // restart-loop trying to apply the bad version: there is nothing @@ -333,10 +397,61 @@ impl UpdateCommand { if !self.quiet { println!("Downloading update..."); } - self.download_and_install(&latest, current_version).await + // #4073 per-target-version install-failure gate. Count a DETERMINISTIC + // install failure of this version — a checksum mismatch or invalid + // release signature (`ReleaseVerificationError`) — toward the gate. After + // INSTALL_FAILURE_GATE_THRESHOLD such failures of the SAME version, the + // node's update detection and the installer stop acting on it (see + // `rollback::is_version_install_gated`), breaking the + // detect → exit 42 → failed install → restart loop a bad + // manifest/checksum/signature would otherwise sustain indefinitely. + // + // TRANSIENT failures (GitHub/download outage, a still-propagating + // checksum manifest, extraction/disk errors) are deliberately NOT + // counted: they self-resolve, and gating on them would permanently + // suppress a perfectly good release after a brief outage. A successful + // install clears the counter (we have moved forward). + // + // On the macOS DMG-swap path `download_and_install` exits the process on + // success and never returns here; a swallowed bundle FAILURE returns + // `BundleSkipped { verification_failure }`, which carries the same + // deterministic-vs-transient distinction. + let install_result = self.download_and_install(&latest, current_version).await; + match &install_result { + Ok(InstallOutcome::Installed) => super::rollback::clear_install_failures(), + Ok(InstallOutcome::BundleSkipped { + verification_failure, + }) => { + if *verification_failure { + super::rollback::record_install_failure(latest_version); + } + // transient bundle failure: leave the gate unchanged. + } + Err(e) if is_release_verification_failure(e) => { + super::rollback::record_install_failure(latest_version) + } + // Transient install error: neither record nor clear — retry later. + // + // DELIBERATELY includes the fail-closed "missing artifact" refusals + // (absent SHA256SUMS.txt, no manifest entry for our asset, absent + // .sig when required) as well as download/network/extraction errors. + // A MISSING manifest/entry is the canonical release-PROPAGATION race + // (assets upload after the tag is live), so it self-resolves — gating + // it would risk permanently suppressing a good release that the node + // happened to poll seconds early, which is a worse regression than the + // already rate-limit-bounded retry cadence. Only a PRESENT-but-WRONG + // artifact (checksum mismatch / invalid signature) is an unambiguous + // bad-release signal and is gated above via ReleaseVerificationError. + Err(_) => {} + } + install_result.map(|_| ()) } - async fn download_and_install(&self, release: &Release, current_version: &str) -> Result<()> { + async fn download_and_install( + &self, + release: &Release, + current_version: &str, + ) -> Result { // macOS DMG-swap path: when running from inside a .app bundle, // in-place binary replacement would invalidate the bundle's code // signature and Gatekeeper would refuse to launch the result on @@ -400,7 +515,15 @@ impl UpdateCommand { "Bundle update failed: {e}. Skipping update to avoid corrupting the signed bundle. Next attempt will retry." ); } - return Ok(()); + // No update was installed. Surface as a bundle-skip carrying + // whether the cause was a deterministic DMG checksum/signature + // failure (counts toward the gate, #4073) vs a transient one + // (leaves the gate unchanged), even though we exit 0 to + // protect the signed bundle. + let verification_failure = is_release_verification_failure(&e); + return Ok(InstallOutcome::BundleSkipped { + verification_failure, + }); } Err(e) => { // Not in a bundle: safe to fall through to binary- @@ -680,7 +803,7 @@ impl UpdateCommand { } } - Ok(()) + Ok(InstallOutcome::Installed) } /// Download `SHA256SUMS.txt` and, when the release published it, @@ -1013,7 +1136,33 @@ impl Checksums { } } -async fn get_latest_release() -> Result { +async fn get_latest_release(force: bool, quiet: bool) -> Result { + // Persistent rate limit (#4073): this is the supervisor-side choke point — + // the on-crash / on-update `freenet update` one-shot reaches GitHub through + // here. It uses its OWN on-disk token bucket (the INSTALL bucket), separate + // from the node's `get_latest_version` (NODE bucket), so the node — which + // always runs first in a restart cycle — can never spend the installer's + // tokens and starve a legitimate update. Because each `freenet update` is a + // fresh process, an in-memory limiter would reset on every restart and not + // bound a loop; the on-disk bucket holds the limit across restarts. + // + // `--force` (an explicit operator action) bypasses the limiter. An automated, + // token-denied poll exits EXIT_CODE_ALREADY_UP_TO_DATE so the supervisor + // treats it as "nothing to do" and backs off, rather than as a failure that + // would count toward any lockout. + if !force && !super::auto_update::try_consume_install_poll() { + if !quiet { + eprintln!( + "Update check rate-limited (too many recent GitHub polls); skipping this cycle. \ + Run `freenet update --force` to override." + ); + } + tracing::warn!( + "GitHub update poll rate-limited (token bucket empty); exiting as up-to-date (#4073)" + ); + std::process::exit(EXIT_CODE_ALREADY_UP_TO_DATE); + } + let client = reqwest::Client::builder() .user_agent("freenet-updater") // Bound the request (parity with auto_update::get_latest_version). The @@ -1149,12 +1298,14 @@ fn verify_manifest_signature_with( }; // ed25519 signatures are exactly 64 bytes. A wrong length means the asset - // is truncated or not a raw signature — refuse rather than guess. + // is truncated or not a raw signature — refuse rather than guess. This is a + // deterministic bad-artifact signal (ReleaseVerificationError) so it counts + // toward the install-failure gate (#4073). let sig_array: [u8; 64] = sig_bytes.try_into().map_err(|_| { - anyhow::anyhow!( + ReleaseVerificationError(format!( "Release signature has wrong length ({} bytes, expected 64); refusing to install.", sig_bytes.len() - ) + )) })?; let parsed_signature = Signature::from_bytes(&sig_array); @@ -1163,14 +1314,15 @@ fn verify_manifest_signature_with( // `verify_strict` rejects non-canonical / small-order points (the same // hardening the website contract uses). A failure here means the manifest - // was tampered with or signed by a key we don't trust: fail closed. + // was tampered with or signed by a key we don't trust: fail closed. Typed as + // a ReleaseVerificationError so it counts toward the install-failure gate. verifying_key .verify_strict(manifest_bytes, &parsed_signature) .map_err(|e| { - anyhow::anyhow!( + ReleaseVerificationError(format!( "Release signature verification failed: {e}. The checksum manifest may be \ corrupted or tampered with; refusing to install." - ) + )) })?; if !quiet { @@ -1195,6 +1347,14 @@ fn verify_manifest_signature_with( /// transiently-missing manifest therefore retries under the existing /// exponential backoff instead of permanently disabling auto-update. /// +/// DELIBERATELY a plain error, NOT a [`ReleaseVerificationError`]: a missing +/// manifest/entry is the canonical release-PROPAGATION race (the tag goes live +/// before the assets finish uploading), so it must stay transient and NOT count +/// toward the per-version install-failure gate (#4073) — gating it would risk +/// permanently suppressing a good release polled seconds early. Only a +/// PRESENT-but-WRONG artifact (hash mismatch / invalid signature) is the +/// deterministic bad-release signal that gates. +/// /// Pure (no I/O) so the fail-closed contract is unit-testable on every CI /// runner regardless of target OS. fn required_checksum<'a>(checksums: Option<&'a Checksums>, asset_name: &str) -> Result<&'a str> { @@ -1231,12 +1391,15 @@ fn verify_checksum(file_path: &Path, expected_hash: &str) -> Result<()> { }); if actual_hash != expected_hash { - anyhow::bail!( - "Checksum verification failed!\nExpected: {}\nGot: {}\n\ - The download may be corrupted or tampered with.", - expected_hash, - actual_hash - ); + // Deterministic "bad release" signal: the downloaded artifact does not + // match the (signature-authenticated) manifest. Typed so the caller + // counts it toward the per-version install-failure gate (#4073), unlike a + // transient download error. + return Err(ReleaseVerificationError(format!( + "Checksum verification failed!\nExpected: {expected_hash}\nGot: {actual_hash}\n\ + The download may be corrupted or tampered with." + )) + .into()); } Ok(()) @@ -3577,6 +3740,99 @@ done ); } + #[test] + fn verification_failures_are_classified_transient_are_not() { + // The gate keys off is_release_verification_failure. A checksum mismatch + // and an invalid/wrong-length signature must classify as deterministic + // verification failures (gate-worthy); a generic/transient error (e.g. a + // download outage) must NOT. + let tmp = tempfile::tempdir().unwrap(); + let f = tmp.path().join("artifact"); + std::fs::write(&f, b"the actual bytes").unwrap(); + // Wrong expected hash -> checksum mismatch. + let mismatch = verify_checksum(&f, &"0".repeat(64)).unwrap_err(); + assert!( + is_release_verification_failure(&mismatch), + "checksum mismatch must be a verification failure" + ); + + // Invalid signature (random 64 bytes against the real pubkey). + let bad_sig = [7u8; 64]; + let sig_err = verify_manifest_signature_with( + b"some manifest", + Some(&bad_sig), + &FREENET_RELEASE_PUBKEY, + false, + true, + ) + .unwrap_err(); + assert!( + is_release_verification_failure(&sig_err), + "invalid signature must be a verification failure" + ); + + // Wrong-length signature. + let short_sig_err = verify_manifest_signature_with( + b"some manifest", + Some(&[1u8, 2, 3]), + &FREENET_RELEASE_PUBKEY, + false, + true, + ) + .unwrap_err(); + assert!( + is_release_verification_failure(&short_sig_err), + "wrong-length signature must be a verification failure" + ); + + // A generic/transient error must NOT classify as a verification failure. + let transient = anyhow::anyhow!("Failed to download file: connection reset"); + assert!( + !is_release_verification_failure(&transient), + "a transient download error must NOT count toward the gate" + ); + } + + #[test] + fn install_failure_gate_only_counts_verification_failures() { + // #4073 (Codex): the per-version install-failure gate must engage only on + // DETERMINISTIC verification failures (checksum/signature), NOT on + // transient ones (download/network/extraction) — gating a good release + // after a brief outage is a regression. Source-scrape pins on run_async's + // match (the macOS path isn't exercised on Linux CI). + const SRC: &str = include_str!("update.rs"); + let (_, run_async) = SRC + .split_once("let install_result = self.download_and_install(") + .expect("download_and_install call site not found"); + let (matchbody, _) = run_async + .split_once("install_result.map(|_| ())") + .expect("end of install-result match not found"); + + // The Installed outcome clears the gate. + assert!( + matchbody.contains( + "Ok(InstallOutcome::Installed) => super::rollback::clear_install_failures()" + ), + "the Installed outcome must clear the install-failure gate" + ); + // The deterministic-verification Err arm records a failure. + assert!( + matchbody.contains("Err(e) if is_release_verification_failure(e) =>") + && matchbody.contains("record_install_failure"), + "a verification-failure Err must record toward the gate" + ); + // The catch-all transient Err arm must NOT record (it is a no-op). + let (_, after_catch) = matchbody + .split_once("// Transient install error: neither record nor clear") + .expect("transient Err arm must be present and documented"); + let catch_body = &after_catch[..after_catch.find('}').unwrap_or(after_catch.len())]; + assert!( + !catch_body.contains("record_install_failure") + && !catch_body.contains("clear_install_failures"), + "the transient Err arm must neither record nor clear the gate" + ); + } + // ---- Ed25519 release-manifest signing + verification (auto-update). ---- // // The release workflow signs the raw bytes of SHA256SUMS.txt with the diff --git a/crates/core/src/bin/freenet.rs b/crates/core/src/bin/freenet.rs index 0229eeb249..cb51da9316 100644 --- a/crates/core/src/bin/freenet.rs +++ b/crates/core/src/bin/freenet.rs @@ -397,14 +397,18 @@ async fn run_network_node_with_signals( "Startup update check against GitHub" ); if let Some(new_version) = startup_update_check(build_info::VERSION).await { - // #4073: don't auto-update to a version pinned known-bad on this - // node by a prior crash-loop rollback (the installer would refuse it - // anyway; this avoids a needless restart). - if commands::rollback::is_version_pinned_bad(&new_version) { + // #4073: don't auto-update to a version that is locally BLOCKED — a + // crash-loop known-bad pin OR a version that has repeatedly failed to + // install (checksum / signature / download / extract). The installer + // would refuse it anyway; gating here avoids a needless exit-42 + // restart cycle and is what stops the failed-install loop. + if commands::rollback::is_version_pinned_bad(&new_version) + || commands::rollback::is_version_install_gated(&new_version) + { tracing::warn!( new_version = %new_version, - "Startup check: newer version is pinned known-bad after a crash-loop rollback; \ - not triggering auto-update (#4073)" + "Startup check: newer version is locally blocked (crash-loop known-bad pin or \ + repeated install failures); not triggering auto-update (#4073)" ); } else { tracing::info!( @@ -468,6 +472,12 @@ async fn run_network_node_with_signals( UpdateCheckResult::Skipped => { // GitHub not reachable or no update yet; will retry next tick } + UpdateCheckResult::RateLimited => { + // #4073: our own GitHub-poll rate limiter denied the + // check. Do nothing and retry when the bucket refills — + // do NOT exit 42 (the supervisor's `freenet update` + // shares the empty bucket and would no-op). + } UpdateCheckResult::PinnedKnownBad => { // #4073: the only newer release is pinned known-bad on // this node after a crash-loop rollback. Stop trying to @@ -528,6 +538,12 @@ async fn run_network_node_with_signals( stagger_cooldown_until = Some(Instant::now() + STAGGER_COOLDOWN); } + UpdateCheckResult::RateLimited => { + // #4073: rate-limited by our own bucket — keep the + // stagger deadline armed (do NOT enter the long + // cooldown) so we re-check promptly once the bucket + // refills, without hitting GitHub meanwhile. + } UpdateCheckResult::PinnedKnownBad => { // #4073: discovered version is pinned known-bad // after a crash-loop rollback. Drop the stagger @@ -615,6 +631,16 @@ async fn run_network_node_with_signals( clear_version_mismatch(); } UpdateCheckResult::Skipped => {} + UpdateCheckResult::RateLimited => { + // #4073: our own GitHub-poll rate limiter denied the + // check. Crucially do NOT fall through to the max-backoff + // "trust the gateway, exit 42" fallback above (this is a + // separate match arm, so a RateLimited result never + // reaches it): the supervisor's `freenet update` shares + // the same empty bucket and would just exit "already up to + // date", so exiting here would be a pointless restart loop. + // Keep the mismatch flag and retry once the bucket refills. + } UpdateCheckResult::PinnedKnownBad => { // #4073: the gateway-advertised newer release is pinned // known-bad after a crash-loop rollback. Clear the @@ -1415,6 +1441,38 @@ mod tests { ); } + /// #4073: a `RateLimited` update-check result (our own GitHub-poll bucket was + /// empty) must NEVER drive an exit-42 update. Routing it through the + /// `Skipped` path would let the legacy "max-backoff + 0 connections -> exit + /// 42" gateway-trust fallback fire on a poll we deliberately skipped, and + /// since the supervisor-side `freenet update` shares the same empty bucket it + /// would just no-op — a pointless restart loop. So every `RateLimited` arm + /// must be its own arm that does NOT call `update_tx.send`. + #[test] + fn rate_limited_never_triggers_exit_42() { + let src = strip_line_comments(include_str!("freenet.rs")); + let needle = concat!("UpdateCheckResult::", "RateLimited =>"); + let arms: Vec<&str> = src.split(needle).skip(1).collect(); + assert_eq!( + arms.len(), + 3, + "expected a RateLimited arm in each of the 3 update-trigger priorities \ + (urgent / stagger / legacy mismatch) (#4073)" + ); + for (i, arm) in arms.iter().enumerate() { + let end = arm + .find("UpdateCheckResult::") + .unwrap_or(arm.len()) + .min(400); + let body = &arm[..end]; + assert!( + !body.contains("update_tx.send"), + "RateLimited arm #{i} must NOT send an update — exiting 42 while \ + rate-limited would loop (#4073)" + ); + } + } + /// Source-scrape pin for #4580: each Freenet supervisor must set the /// `FREENET_SUPERVISED` marker on the `freenet network` child it spawns, so /// the node can tell it is supervised and log calmly instead of erroring on