diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6d1057d..726c4e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ the PR description linked in each section.
 ### Added
 
 - **`wire unclaim` + relay `DELETE /v1/handle/claim/:nick` — release a claimed handle** (#247 finding 1): a handle claim was FCFS-**permanent** (no expiry, no unclaim), so an abandoned/rotated nick squatted the directory forever. You can now release your persona: `wire unclaim` (owner-gated by your slot token) frees the nick so it stops resolving via `.well-known/wire/agent` and can be re-claimed. (Operator-TTL auto-expiry — the other half of #247.1 — needs persisted slot-activity to avoid evicting quiet-but-live agents on relay restart, and stays tracked.)
+- **`wire status --wait-daemon-running [--timeout <SECS>]`** (#284.2): a bounded, in-process replacement for fragile external shell loops like `until wire status … | grep -q 'daemon_running":true'; do sleep 3; done`. The external pattern + a never-healthy daemon piled up 254 stale `wire.exe` processes on Willard's box (each `wire status` invocation hanging on a wedged probe, the loop spawning a fresh one every 3s). The new flag polls the daemon-liveness snapshot every 200ms in-process, exits 0 with the full status when `daemon_running:true`, or bails after `--timeout` (default 30s) with the last-seen `pidfile_pid` / `pgrep_pids` on stderr so the operator knows what wasn't healthy. Replaces the loop with one bounded subcommand call; no spawn pressure, no orphan accumulation. Pure-logic `wait_step` decision is split out and unit-tested.
 
 ### Fixed
 
diff --git a/src/cli/mod.rs b/src/cli/mod.rs
index 76e7700..430bf4a 100644
--- a/src/cli/mod.rs
+++ b/src/cli/mod.rs
@@ -435,6 +435,17 @@ pub enum Command {
         peer: Option<String>,
         #[arg(long)]
         json: bool,
+        /// Block until `daemon_running:true`, then exit 0. Polls
+        /// internally every 200ms up to `--timeout` seconds. Exit 1
+        /// on timeout (with the last seen status to stderr). Replaces
+        /// fragile external `until wire status … | grep daemon_running:true`
+        /// shell loops that piled up hundreds of `wire status`
+        /// invocations on a never-healthy host (#284.2).
+        #[arg(long)]
+        wait_daemon_running: bool,
+        /// Bound in seconds for `--wait-daemon-running` (ignored without it). Default 30s.
+        #[arg(long, default_value_t = 30)]
+        timeout: u64,
     },
     /// Publish or inspect auto-responder health for this slot.
     Responder {
@@ -1733,9 +1744,16 @@ pub fn run() -> Result<()> {
             offline,
             json,
         } => cmd_init(relay.as_deref(), offline, json),
-        Command::Status { peer, json } => {
+        Command::Status {
+            peer,
+            json,
+            wait_daemon_running,
+            timeout,
+        } => {
             if let Some(peer) = peer {
                 status::cmd_status_peer(&peer, json)
+            } else if wait_daemon_running {
+                status::cmd_status_wait_daemon_running(json, timeout)
             } else {
                 status::cmd_status(json)
             }
diff --git a/src/cli/status.rs b/src/cli/status.rs
index 0c785a7..d13b6e8 100644
--- a/src/cli/status.rs
+++ b/src/cli/status.rs
@@ -5,6 +5,118 @@ use crate::config;
 
 // ---------- status ----------
 
+/// Outcome of one iteration of the `--wait-daemon-running` poll
+/// loop, as decided by [`wait_step`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum WaitDecision {
+    /// `pidfile_alive == true` — break out of the poll loop and
+    /// print the full status.
+    Healthy,
+    /// Deadline already elapsed — bail with a timeout error.
+    TimedOut,
+    /// Deadline still ahead, daemon still down — sleep + retry.
+    Continue,
+}
+
+/// Interval between liveness probes while waiting for the daemon.
+const WAIT_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_millis(200);
+
+/// Pure decision for `cmd_status_wait_daemon_running`'s polling loop.
+/// Extracted so the policy is unit-testable without spinning a real
+/// `wire daemon`. Given a liveness snapshot's `pidfile_alive` flag,
+/// the current instant, and the deadline, decide: success now, keep
+/// waiting, or time out. Liveness is checked first, so a daemon that
+/// is alive on the final probe counts as healthy even if the
+/// deadline has already passed.
+pub(crate) fn wait_step(
+    pidfile_alive: bool,
+    now: std::time::Instant,
+    deadline: std::time::Instant,
+) -> WaitDecision {
+    if pidfile_alive {
+        WaitDecision::Healthy
+    } else if now >= deadline {
+        WaitDecision::TimedOut
+    } else {
+        WaitDecision::Continue
+    }
+}
+
+/// How long to sleep before the next probe: the regular poll
+/// interval, shortened so the sleep never runs past `deadline`.
+/// `None` means there is no deadline to respect.
+pub(crate) fn next_poll_sleep(
+    now: std::time::Instant,
+    deadline: Option<std::time::Instant>,
+) -> std::time::Duration {
+    match deadline {
+        Some(deadline) => deadline
+            .saturating_duration_since(now)
+            .min(WAIT_POLL_INTERVAL),
+        None => WAIT_POLL_INTERVAL,
+    }
+}
+
+/// `wire status --wait-daemon-running [--timeout <SECS>]`: poll the
+/// local daemon-liveness snapshot until `daemon_running:true` (the
+/// same `pidfile_alive` truth `cmd_status` surfaces), then exit 0.
+///
+/// Behavior:
+/// - Polls every 200ms; the final sleep is shortened so the wait
+///   does not run past the deadline.
+/// - Bounded by `timeout_secs` (default 30s at the clap layer). A
+///   value too large to add to the current `Instant` is treated as
+///   "no deadline" rather than panicking on overflow.
+/// - On healthy: prints the same status JSON / human surface
+///   `cmd_status` would have, then returns Ok(()).
+/// - On timeout: returns a non-zero `anyhow::Error` carrying the
+///   last-seen `pidfile_pid` / `pgrep_pids` (so the operator knows
+///   what was wrong — daemon not started? pidfile corrupt?) so
+///   shell wrappers can branch; human mode additionally prints the
+///   same details plus a `wire up` hint to stderr.
+///
+/// #284.2: replaces fragile external loops like
+/// `until wire status … | grep daemon_running:true; do sleep 3; done`,
+/// which on a never-healthy host piled up hundreds of `wire status`
+/// invocations every few seconds (Willard's 254-`wire.exe` pile-up
+/// repro). Each poll cycle here is in-process, so there's no spawn
+/// pressure, and timeout guarantees the wrapper exits cleanly.
+pub(super) fn cmd_status_wait_daemon_running(as_json: bool, timeout_secs: u64) -> Result<()> {
+    // `Instant + Duration` panics on overflow, so an absurd `--timeout`
+    // must not be added blindly. `None` = no representable deadline.
+    let deadline =
+        std::time::Instant::now().checked_add(std::time::Duration::from_secs(timeout_secs));
+    loop {
+        let snap = crate::ensure_up::daemon_liveness();
+        let now = std::time::Instant::now();
+        let decision = match deadline {
+            Some(deadline) => wait_step(snap.pidfile_alive, now, deadline),
+            None if snap.pidfile_alive => WaitDecision::Healthy,
+            None => WaitDecision::Continue,
+        };
+        match decision {
+            WaitDecision::Healthy => return cmd_status(as_json),
+            WaitDecision::TimedOut => {
+                if !as_json {
+                    eprintln!(
+                        "wire status: daemon not running after {timeout_secs}s. \
+                         Last seen: pidfile_pid={:?}, pgrep_pids={:?}. \
+                         Run `wire up` to start the daemon.",
+                        snap.pidfile_pid, snap.pgrep_pids
+                    );
+                }
+                bail!(
+                    "daemon_running stayed false through {timeout_secs}s wait window \
+                     (last seen: pidfile_pid={:?}, pgrep_pids={:?})",
+                    snap.pidfile_pid,
+                    snap.pgrep_pids
+                );
+            }
+            WaitDecision::Continue => std::thread::sleep(next_poll_sleep(now, deadline)),
+        }
+    }
+}
+
 pub(super) fn cmd_status(as_json: bool) -> Result<()> {
     let initialized = config::is_initialized()?;
 
@@ -2132,4 +2244,58 @@ mod doctor_tests {
             );
         });
     }
+
+    // ---------- #284.2: wait_step pure-logic policy ----------
+
+    #[test]
+    fn wait_step_returns_healthy_when_pidfile_alive() {
+        let now = std::time::Instant::now();
+        let deadline = now + std::time::Duration::from_secs(30);
+        assert_eq!(wait_step(true, now, deadline), WaitDecision::Healthy);
+    }
+
+    #[test]
+    fn wait_step_returns_timed_out_when_deadline_passed_and_dead() {
+        // Deadline already 1s in the past. Built by addition because
+        // `Instant - Duration` panics on underflow.
+        let deadline = std::time::Instant::now();
+        let now = deadline + std::time::Duration::from_secs(1);
+        assert_eq!(wait_step(false, now, deadline), WaitDecision::TimedOut);
+    }
+
+    #[test]
+    fn wait_step_times_out_exactly_at_deadline() {
+        // The bound is inclusive: `now == deadline` is already out of
+        // time, so the loop never sleeps a zero-length interval.
+        let now = std::time::Instant::now();
+        assert_eq!(wait_step(false, now, now), WaitDecision::TimedOut);
+    }
+
+    #[test]
+    fn wait_step_returns_continue_when_deadline_future_and_dead() {
+        let now = std::time::Instant::now();
+        let deadline = now + std::time::Duration::from_secs(5);
+        assert_eq!(wait_step(false, now, deadline), WaitDecision::Continue);
+    }
+
+    #[test]
+    fn wait_step_healthy_wins_over_timeout() {
+        // If both conditions hold (deadline elapsed AND daemon now
+        // alive), the success path takes precedence — we don't punish
+        // an operator who just barely missed the window.
+        let deadline = std::time::Instant::now();
+        let now = deadline + std::time::Duration::from_secs(1);
+        assert_eq!(wait_step(true, now, deadline), WaitDecision::Healthy);
+    }
+
+    #[test]
+    fn next_poll_sleep_is_clamped_to_remaining_time() {
+        let now = std::time::Instant::now();
+        let interval = std::time::Duration::from_millis(200);
+        let near = now + std::time::Duration::from_millis(50);
+        let far = now + std::time::Duration::from_secs(5);
+        assert_eq!(next_poll_sleep(now, Some(near)), std::time::Duration::from_millis(50));
+        assert_eq!(next_poll_sleep(now, Some(far)), interval);
+        assert_eq!(next_poll_sleep(now, None), interval);
+    }
 }