From 397000cf3140b3638e71898594eedba4277d3a7e Mon Sep 17 00:00:00 2001
From: "laul.pogan" <paul@zaibatsuheavy.industries>
Date: Sun, 14 Jun 2026 16:42:35 -0700
Subject: [PATCH] feat(status): bounded `--wait-daemon-running` for
 SessionStart loops (#284.2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue #284 part 2 (from Willard's Windows report). The
SessionStart bringup ran a bash loop:

    until wire status 2>/dev/null | grep -q 'daemon_running":true'; do
        sleep 3
    done

With a never-healthy daemon (because #284.1's wedged probes kept
`wire status` from ever returning `daemon_running:true`) the loop ran
forever, spawning a fresh hanging `wire status` every 3 seconds. Over
many sessions/days it piled up 254 `wire.exe` processes, each wedged
on a PowerShell `Get-CimInstance` probe. #284.1 fixes the probe; this
fix removes the need for the external loop entirely by giving
operators a single bounded in-process primitive.

New flag: `wire status --wait-daemon-running [--timeout <secs>]`.

  - Polls the daemon-liveness snapshot every 200ms IN-PROCESS — no
    re-spawn pressure, no PowerShell-per-cycle cost.
  - Default `--timeout 30`, overridable per invocation.
  - Healthy path: print the full status surface (same JSON / human
    output `wire status` produces), exit 0.
  - Timeout path: emit the last-seen `pidfile_pid` and `pgrep_pids`
    on stderr (so the operator can see what state was lingering;
    skipped under `--json`), return a non-zero `anyhow::Error` so
    shell wrappers can branch.

Pure-logic `WaitDecision` + `wait_step(pidfile_alive, now, deadline)`
extracted so the policy is unit-testable without a real daemon.
Tests lock down the three branches plus the "deadline elapsed AND
daemon just came alive on the same tick → success wins" case (we
don't punish the operator for the missed-by-a-millisecond race).

Tests: 4 new pure-logic `wait_step_*`. Full lib suite: 490 passed;
0 failed; 7 ignored on `x86_64-pc-windows-msvc` (rustc 1.96.0).

Net effect together with #284.1: the external loop is survivable
if the operator keeps using it (because status returns fast now),
AND there's a built-in alternative that avoids the spawn churn
entirely.

Stacks on top of #294 (Windows test/clippy hygiene); rebase onto main
once #294 lands.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: laul.pogan <paul@zaibatsuheavy.industries>
---
 CHANGELOG.md      |   1 +
 src/cli/mod.rs    |  20 ++++++++-
 src/cli/status.rs | 108 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 128 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6d1057d..726c4e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ the PR description linked in each section.
 ### Added
 
 - **`wire unclaim` + relay `DELETE /v1/handle/claim/:nick` — release a claimed handle** (#247 finding 1): a handle claim was FCFS-**permanent** (no expiry, no unclaim), so an abandoned/rotated nick squatted the directory forever. You can now release your persona: `wire unclaim` (owner-gated by your slot token) frees the nick so it stops resolving via `.well-known/wire/agent` and can be re-claimed. (Operator-TTL auto-expiry — the other half of #247.1 — needs persisted slot-activity to avoid evicting quiet-but-live agents on relay restart, and stays tracked.)
+- **`wire status --wait-daemon-running [--timeout <secs>]`** (#284.2): a bounded, in-process replacement for fragile external shell loops like `until wire status … | grep -q 'daemon_running":true'; do sleep 3; done`. The external pattern + a never-healthy daemon piled up 254 stale `wire.exe` processes on Willard's box (each `wire status` invocation hanging on a wedged probe, the loop spawning a fresh one every 3s). The new flag polls the daemon-liveness snapshot every 200ms in-process, exits 0 with the full status when `daemon_running:true`, or bails after `--timeout` (default 30s) with the last-seen `pidfile_pid` / `pgrep_pids` on stderr so the operator knows what wasn't healthy. Replaces the loop with one bounded subcommand call; no spawn pressure, no orphan accumulation. Pure-logic `wait_step` decision is split out and unit-tested.
 
 ### Fixed
 
diff --git a/src/cli/mod.rs b/src/cli/mod.rs
index 76e7700..430bf4a 100644
--- a/src/cli/mod.rs
+++ b/src/cli/mod.rs
@@ -435,6 +435,17 @@ pub enum Command {
         peer: Option<String>,
         #[arg(long)]
         json: bool,
+        /// Block until `daemon_running:true`, then exit 0. Polls
+        /// internally every 200ms up to `--timeout` seconds. Exit 1
+        /// on timeout (with the last seen status to stderr). Replaces
+        /// fragile external `until wire status … | grep daemon_running:true`
+        /// shell loops that piled up hundreds of `wire status`
+        /// invocations on a never-healthy host (#284.2).
+        #[arg(long)]
+        wait_daemon_running: bool,
+        /// Bound for `--wait-daemon-running`. Default 30s.
+        #[arg(long, default_value_t = 30)]
+        timeout: u64,
     },
     /// Publish or inspect auto-responder health for this slot.
     Responder {
@@ -1733,9 +1744,16 @@ pub fn run() -> Result<()> {
             offline,
             json,
         } => cmd_init(relay.as_deref(), offline, json),
-        Command::Status { peer, json } => {
+        Command::Status {
+            peer,
+            json,
+            wait_daemon_running,
+            timeout,
+        } => {
             if let Some(peer) = peer {
                 status::cmd_status_peer(&peer, json)
+            } else if wait_daemon_running {
+                status::cmd_status_wait_daemon_running(json, timeout)
             } else {
                 status::cmd_status(json)
             }
diff --git a/src/cli/status.rs b/src/cli/status.rs
index 0c785a7..d13b6e8 100644
--- a/src/cli/status.rs
+++ b/src/cli/status.rs
@@ -5,6 +5,80 @@ use crate::config;
 
 // ---------- status ----------
 
+/// Outcome of the pure decision behind `cmd_status_wait_daemon_running`'s
+/// polling loop (see `wait_step`). Extracted so the policy is
+/// unit-testable without spinning a real `wire daemon`: given a
+/// liveness snapshot's `pidfile_alive` flag, the current instant, and
+/// the deadline, the step reports success, keep waiting, or timeout.
+#[derive(Debug, PartialEq, Eq)]
+pub(crate) enum WaitDecision {
+    /// `pidfile_alive == true` — break out of the poll loop and
+    /// print the full status. Takes precedence over an elapsed deadline.
+    Healthy,
+    /// Daemon not alive and deadline reached — bail with a timeout error.
+    TimedOut,
+    /// Deadline still ahead, daemon still down — sleep + retry.
+    Continue,
+}
+
+pub(crate) fn wait_step(
+    pidfile_alive: bool,
+    now: std::time::Instant,
+    deadline: std::time::Instant,
+) -> WaitDecision {
+    // Liveness is matched before the clock, so a daemon that comes up
+    // on the same tick the deadline passes still counts as healthy.
+    match (pidfile_alive, now >= deadline) {
+        (true, _) => WaitDecision::Healthy,
+        (false, true) => WaitDecision::TimedOut,
+        (false, false) => WaitDecision::Continue,
+    }
+}
+
+/// `wire status --wait-daemon-running [--timeout <secs>]`: poll the
+/// local daemon-liveness snapshot until `daemon_running:true` (the
+/// same `pidfile_alive` truth `cmd_status` surfaces), then exit 0.
+///
+/// Behavior:
+/// - Polls every 200ms in-process; bounded by `timeout_secs`
+///   (default 30s at the clap layer).
+/// - On healthy: delegates to `cmd_status(as_json)`, so the output
+///   is the same status JSON / human surface.
+/// - On timeout: prints the last-seen `pidfile_pid` / `pgrep_pids`
+///   to stderr (skipped under `--json`) and returns an error so
+///   shell wrappers can branch.
+/// - A `timeout_secs` too large to form a deadline is rejected with
+///   an error instead of panicking on `Instant` overflow.
+///
+/// #284.2: replaces external `until wire status … | grep …; do sleep 3;
+/// done` loops, which on a never-healthy host piled up hundreds of
+/// `wire status` invocations (Willard's 254-`wire.exe` repro).
+pub(super) fn cmd_status_wait_daemon_running(as_json: bool, timeout_secs: u64) -> Result<()> {
+    // `Instant + Duration` panics on overflow; reject an oversized `--timeout` instead.
+    let window = std::time::Duration::from_secs(timeout_secs);
+    let Some(deadline) = std::time::Instant::now().checked_add(window) else {
+        bail!("--timeout {timeout_secs}s is too large to represent as a deadline");
+    };
+    loop {
+        let snap = crate::ensure_up::daemon_liveness();
+        match wait_step(snap.pidfile_alive, std::time::Instant::now(), deadline) {
+            WaitDecision::Healthy => return cmd_status(as_json),
+            WaitDecision::TimedOut => {
+                if !as_json {
+                    eprintln!(
+                        "wire status: daemon not running after {timeout_secs}s. \
+                         Last seen: pidfile_pid={:?}, pgrep_pids={:?}. \
+                         Run `wire up` to start the daemon.",
+                        snap.pidfile_pid, snap.pgrep_pids
+                    );
+                }
+                bail!("daemon_running stayed false through {timeout_secs}s wait window");
+            }
+            WaitDecision::Continue => std::thread::sleep(std::time::Duration::from_millis(200)),
+        }
+    }
+}
+
 pub(super) fn cmd_status(as_json: bool) -> Result<()> {
     let initialized = config::is_initialized()?;
 
@@ -2132,4 +2206,38 @@ mod doctor_tests {
             );
         });
     }
+
+    // ---------- #284.2: wait_step pure-logic policy ----------
+
+    #[test]
+    fn wait_step_returns_healthy_when_pidfile_alive() {
+        let start = std::time::Instant::now();
+        let verdict = wait_step(true, start, start + std::time::Duration::from_secs(30));
+        assert_eq!(verdict, WaitDecision::Healthy);
+    }
+
+    #[test]
+    fn wait_step_returns_timed_out_when_deadline_passed_and_dead() {
+        let deadline = std::time::Instant::now();
+        // `now` is 1s past the deadline; adding avoids `Instant` underflow.
+        let now = deadline + std::time::Duration::from_secs(1);
+        assert_eq!(wait_step(false, now, deadline), WaitDecision::TimedOut);
+    }
+
+    #[test]
+    fn wait_step_returns_continue_when_deadline_future_and_dead() {
+        let start = std::time::Instant::now();
+        let verdict = wait_step(false, start, start + std::time::Duration::from_secs(5));
+        assert_eq!(verdict, WaitDecision::Continue);
+    }
+
+    #[test]
+    fn wait_step_healthy_wins_over_timeout() {
+        // If both conditions hold (deadline elapsed AND daemon now
+        // alive), the success path takes precedence. The elapsed
+        // deadline is built by addition to avoid `Instant` underflow.
+        let deadline = std::time::Instant::now();
+        let now = deadline + std::time::Duration::from_secs(1);
+        assert_eq!(wait_step(true, now, deadline), WaitDecision::Healthy);
+    }
 }