From 397000cf3140b3638e71898594eedba4277d3a7e Mon Sep 17 00:00:00 2001
From: "laul.pogan"
Date: Sun, 14 Jun 2026 16:42:35 -0700
Subject: [PATCH] feat(status): bounded `--wait-daemon-running` for SessionStart loops (#284.2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue #284 part 2 (from Willard's Windows report). The SessionStart bringup ran a bash loop:

    until wire status 2>/dev/null | grep -q 'daemon_running":true'; do
      sleep 3
    done

With a never-healthy daemon (because #284.1's wedged probes kept `wire status` from ever returning `daemon_running:true`) the loop ran forever, spawning a fresh hanging `wire status` every 3 seconds. Over many sessions/days it piled up 254 `wire.exe` processes, each wedged on a PowerShell `Get-CimInstance` probe.

#284.1 fixes the probe; this fix removes the need for the external loop entirely by giving operators a single bounded in-process primitive.

New flag: `wire status --wait-daemon-running [--timeout <SECS>]`.

- Polls the daemon-liveness snapshot every 200ms IN-PROCESS — no re-spawn pressure, no PowerShell-per-cycle cost.
- Default `--timeout 30`, overridable per invocation.
- Healthy path: print the full status surface (same JSON / human output `wire status` produces), exit 0.
- Timeout path: emit the last-seen `pidfile_pid` and `pgrep_pids` on stderr (so the operator can see what state was lingering), return a non-zero `anyhow::Error` so shell wrappers can branch.

Pure-logic `WaitDecision` + `wait_step(pidfile_alive, now, deadline)` extracted so the policy is unit-testable without a real daemon. Tests lock down the three branches plus the "deadline elapsed AND daemon just came alive on the same tick → success wins" case (we don't punish the operator for the missed-by-a-millisecond race).

Tests: 4 new pure-logic `wait_step_*`. Full lib suite: 490 passed; 0 failed; 7 ignored on `x86_64-pc-windows-msvc` (rustc 1.96.0).
if the operator keeps using it (because status returns fast now), AND there's a built-in alternative that avoids the spawn churn entirely.

Stacks on top of #294 (Windows test/clippy hygiene); rebase onto main once #294 lands.

Co-Authored-By: Claude Opus 4.7 (1M context)
Signed-off-by: laul.pogan
---
 CHANGELOG.md      |   1 +
 src/cli/mod.rs    |  20 ++++++++-
 src/cli/status.rs | 145 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 165 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6d1057d..726c4e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ the PR description linked in each section.
 ### Added
 
 - **`wire unclaim` + relay `DELETE /v1/handle/claim/:nick` — release a claimed handle** (#247 finding 1): a handle claim was FCFS-**permanent** (no expiry, no unclaim), so an abandoned/rotated nick squatted the directory forever. You can now release your persona: `wire unclaim` (owner-gated by your slot token) frees the nick so it stops resolving via `.well-known/wire/agent` and can be re-claimed. (Operator-TTL auto-expiry — the other half of #247.1 — needs persisted slot-activity to avoid evicting quiet-but-live agents on relay restart, and stays tracked.)
+- **`wire status --wait-daemon-running [--timeout <SECS>]`** (#284.2): a bounded, in-process replacement for fragile external shell loops like `until wire status … | grep -q 'daemon_running":true'; do sleep 3; done`. The external pattern + a never-healthy daemon piled up 254 stale `wire.exe` processes on Willard's box (each `wire status` invocation hanging on a wedged probe, the loop spawning a fresh one every 3s). The new flag polls the daemon-liveness snapshot every 200ms in-process, exits 0 with the full status when `daemon_running:true`, or bails after `--timeout` (default 30s) with the last-seen `pidfile_pid` / `pgrep_pids` on stderr so the operator knows what wasn't healthy. Replaces the loop with one bounded subcommand call; no spawn pressure, no orphan accumulation. Pure-logic `wait_step` decision is split out and unit-tested.
 
 ### Fixed
 
diff --git a/src/cli/mod.rs b/src/cli/mod.rs
index 76e7700..430bf4a 100644
--- a/src/cli/mod.rs
+++ b/src/cli/mod.rs
@@ -435,6 +435,17 @@ pub enum Command {
         peer: Option<String>,
         #[arg(long)]
         json: bool,
+        /// Block until `daemon_running:true`, then exit 0. Polls
+        /// internally every 200ms up to `--timeout` seconds. Exit 1
+        /// on timeout (with the last seen status to stderr). Replaces
+        /// fragile external `until wire status … | grep daemon_running:true`
+        /// shell loops that piled up hundreds of `wire status`
+        /// invocations on a never-healthy host (#284.2).
+        #[arg(long)]
+        wait_daemon_running: bool,
+        /// Bound for `--wait-daemon-running`. Default 30s.
+        #[arg(long, default_value_t = 30)]
+        timeout: u64,
     },
     /// Publish or inspect auto-responder health for this slot.
     Responder {
@@ -1733,9 +1744,16 @@ pub fn run() -> Result<()> {
             offline,
             json,
         } => cmd_init(relay.as_deref(), offline, json),
-        Command::Status { peer, json } => {
+        Command::Status {
+            peer,
+            json,
+            wait_daemon_running,
+            timeout,
+        } => {
             if let Some(peer) = peer {
                 status::cmd_status_peer(&peer, json)
+            } else if wait_daemon_running {
+                status::cmd_status_wait_daemon_running(json, timeout)
             } else {
                 status::cmd_status(json)
             }
diff --git a/src/cli/status.rs b/src/cli/status.rs
index 0c785a7..d13b6e8 100644
--- a/src/cli/status.rs
+++ b/src/cli/status.rs
@@ -5,6 +5,103 @@ use crate::config;
 
 // ---------- status ----------
 
+/// Interval between in-process daemon-liveness polls while waiting.
+const WAIT_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_millis(200);
+
+/// Pure decision for `cmd_status_wait_daemon_running`'s polling loop.
+/// Extracted so the policy is unit-testable without spinning a real
+/// `wire daemon`. Given a liveness snapshot's `pidfile_alive` flag,
+/// the current instant, and the deadline, decide: success now, keep
+/// waiting, or time out.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum WaitDecision {
+    /// `pidfile_alive == true` — break out of the poll loop and
+    /// print the full status.
+    Healthy,
+    /// Deadline already elapsed — bail with a timeout error.
+    TimedOut,
+    /// Deadline still ahead, daemon still down — sleep + retry.
+    Continue,
+}
+
+/// Decide the wait loop's next action from one liveness sample.
+///
+/// A live daemon always wins, even when `now` is already at or past
+/// `deadline`; otherwise the wait times out once `now >= deadline`.
+pub(crate) fn wait_step(
+    pidfile_alive: bool,
+    now: std::time::Instant,
+    deadline: std::time::Instant,
+) -> WaitDecision {
+    if pidfile_alive {
+        WaitDecision::Healthy
+    } else if now >= deadline {
+        WaitDecision::TimedOut
+    } else {
+        WaitDecision::Continue
+    }
+}
+
+/// How long to sleep before the next poll: one `WAIT_POLL_INTERVAL`,
+/// shortened to the time left so the final sleep never overshoots
+/// `deadline`. Zero once `deadline` has passed.
+pub(crate) fn wait_sleep(
+    now: std::time::Instant,
+    deadline: std::time::Instant,
+) -> std::time::Duration {
+    WAIT_POLL_INTERVAL.min(deadline.saturating_duration_since(now))
+}
+
+/// `wire status --wait-daemon-running [--timeout <SECS>]`: poll the
+/// local daemon-liveness snapshot until `daemon_running:true` (the
+/// same `pidfile_alive` truth `cmd_status` surfaces), then exit 0.
+///
+/// Behavior:
+/// - Polls every 200ms; the last sleep is clipped to the deadline.
+/// - Bounded by `timeout_secs` (default 30s at the clap layer).
+/// - On healthy: prints the same status JSON / human surface
+///   `cmd_status` would have, then returns Ok(()).
+/// - On timeout: emits the last-seen status (so the operator knows
+///   what was wrong — daemon not started? pidfile corrupt?) and
+///   exits with a non-zero `anyhow::Error` so shell wrappers can
+///   branch.
+/// - A `timeout_secs` too large to form a deadline is rejected with
+///   an error instead of panicking in `Instant + Duration`.
+///
+/// #284.2: replaces fragile external loops like
+/// `until wire status … | grep daemon_running:true; do sleep 3; done`,
+/// which on a never-healthy host piled up hundreds of `wire status`
+/// invocations every few seconds (Willard's 254-`wire.exe` pile-up
+/// repro). Each poll cycle here is in-process, so there's no spawn
+/// pressure, and timeout guarantees the wrapper exits cleanly.
+pub(super) fn cmd_status_wait_daemon_running(as_json: bool, timeout_secs: u64) -> Result<()> {
+    let deadline = match std::time::Instant::now()
+        .checked_add(std::time::Duration::from_secs(timeout_secs))
+    {
+        Some(deadline) => deadline,
+        None => bail!("--timeout {timeout_secs}s is too large to represent as a deadline"),
+    };
+    loop {
+        let snap = crate::ensure_up::daemon_liveness();
+        let now = std::time::Instant::now();
+        match wait_step(snap.pidfile_alive, now, deadline) {
+            WaitDecision::Healthy => return cmd_status(as_json),
+            WaitDecision::TimedOut => {
+                if !as_json {
+                    eprintln!(
+                        "wire status: daemon not running after {timeout_secs}s. \
+                         Last seen: pidfile_pid={:?}, pgrep_pids={:?}. \
+                         Run `wire up` to start the daemon.",
+                        snap.pidfile_pid, snap.pgrep_pids
+                    );
+                }
+                bail!("daemon_running stayed false through {timeout_secs}s wait window");
+            }
+            WaitDecision::Continue => std::thread::sleep(wait_sleep(now, deadline)),
+        }
+    }
+}
+
 pub(super) fn cmd_status(as_json: bool) -> Result<()> {
     let initialized = config::is_initialized()?;
 
@@ -2132,4 +2229,52 @@ mod doctor_tests {
             );
         });
     }
+
+    // ---------- #284.2: wait-loop pure-logic policy ----------
+
+    #[test]
+    fn wait_step_returns_healthy_when_pidfile_alive() {
+        let now = std::time::Instant::now();
+        let deadline = now + std::time::Duration::from_secs(30);
+        assert_eq!(wait_step(true, now, deadline), WaitDecision::Healthy);
+    }
+
+    #[test]
+    fn wait_step_returns_timed_out_when_deadline_passed_and_dead() {
+        // Model "deadline 1s in the past" by moving `now` forward:
+        // `Instant - Duration` can panic when the monotonic clock is
+        // still close to its origin.
+        let deadline = std::time::Instant::now();
+        let now = deadline + std::time::Duration::from_secs(1);
+        assert_eq!(wait_step(false, now, deadline), WaitDecision::TimedOut);
+        // Boundary: reaching the deadline exactly already times out.
+        assert_eq!(wait_step(false, deadline, deadline), WaitDecision::TimedOut);
+    }
+
+    #[test]
+    fn wait_step_returns_continue_when_deadline_future_and_dead() {
+        let now = std::time::Instant::now();
+        let deadline = now + std::time::Duration::from_secs(5);
+        assert_eq!(wait_step(false, now, deadline), WaitDecision::Continue);
+    }
+
+    #[test]
+    fn wait_step_healthy_wins_over_timeout() {
+        // If both conditions hold (deadline elapsed AND daemon now
+        // alive), the success path takes precedence — we don't punish
+        // an operator who just barely missed the window.
+        let deadline = std::time::Instant::now();
+        let now = deadline + std::time::Duration::from_secs(1);
+        assert_eq!(wait_step(true, now, deadline), WaitDecision::Healthy);
+    }
+
+    #[test]
+    fn wait_sleep_is_capped_by_poll_interval_and_deadline() {
+        let now = std::time::Instant::now();
+        let near = std::time::Duration::from_millis(50);
+        let far = std::time::Duration::from_secs(5);
+        assert_eq!(wait_sleep(now, now + far), WAIT_POLL_INTERVAL);
+        assert_eq!(wait_sleep(now, now + near), near);
+        assert_eq!(wait_sleep(now + far, now), std::time::Duration::ZERO);
+    }
 }