From 5e0f43289fd8cb1307a39679a40cda42779c1659 Mon Sep 17 00:00:00 2001 From: Robert M1 <50460704+githubrobbi@users.noreply.github.com> Date: Tue, 19 May 2026 18:16:32 -0700 Subject: [PATCH] fix(concurrency): snapshot lock state before awaits in status RPC + MCP dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 10b hand-audit found 2 of 36 lock-across-await candidate sites where the read guard was held across long-running inner awaits. Neither was a deadlock (no lock-acquisition cycle), but both caused unnecessary contention with concurrent writers on the same lock. ## Fix 1 — crates/uffs-daemon/src/index/stats.rs:78 `IndexManager::status()` held `self.status.read().await` across three inner awaits (`has_drives`, `total_records`, `snapshot`) and the allocator-mem snapshot call. The guard was only used to clone the `DaemonStatus` into the response at the very end. This blocked any concurrent `set_ready` / `set_loading_progress` / `refresh` writer on `self.status` for the full duration of the status RPC (tens of ms on many-drive boxes). Fix: snapshot the `DaemonStatus` upfront via `.read().await.clone()` and use the owned value in the response. `DaemonStatus` is already `Clone` (small enum: bare unit variants, plus Loading{usize,usize}, plus Refreshing{Vec} which is typically empty in steady state). Snapshot is microseconds. ## Fix 2 — crates/uffs-mcp/src/handler/mod.rs:256 + roots.rs:43 `UffsMcpServer::dispatch_tool` held `self.roots.read().await` across the entire tool dispatch chain — `tools::search::run(..., &roots_state).await` can run for seconds against the daemon RPC for large queries. Any concurrent `on_roots_list_changed` notification (which takes `self.roots.write().await`) blocked for the duration of every in-flight tool call across the whole MCP session. Fix: * Derive `Clone` on `RootsState` (cheap: typically <10 short-string `RootScope` entries). * Snapshot via `.read().await.clone()` and pass the owned value by reference to the tool runners. Tool signatures unchanged (they already took `&RootsState`). ## Verification * `cargo check -p uffs-daemon -p uffs-mcp` — clean. * `cargo clippy -p uffs-daemon -p uffs-mcp --all-targets -- -W clippy::await_holding_lock -D warnings` — 0 warnings. * `cargo test -p uffs-daemon --lib` — 298 passed / 0 failed. * `cargo test -p uffs-mcp` — 12 passed / 0 failed + 1 doctest passed. ## Audit verdict for the remaining 34 candidate sites All 34 are textbook-clean and require no changes. See the per-site verdict table in docs/dev/baseline/2026-05-19/phase_10_lock_across_await_audit.md (local-only; gitignored). The dominant patterns are: * Block-scoped extract-then-await: `let x: Vec<_> = { let g = lock.read().await; … .collect() };` (used in dispatch.rs Phase-1, drives.rs, forget_drive.rs, hibernate_shards Phase-1, cascade_demote_one_step Phase-1, etc). * Explicit `drop(guard);` immediately before any further `.await` (used in every write-guard swap path: add_drive, replace_drive, promote/demote, journal apply, etc). * Single-statement guard (no inner await possible): `*lock.write().await = …;` or `lock.read().await.is_empty()`. ## Rule-1 adherence Zero `#[allow(...)]` introductions. No suppression hacks, no disabled lints, no skipped tests. Both fixes are minimal extract- then-await snapshots with full rustdoc justification at each modified site. Refs #302. --- crates/uffs-daemon/src/index/stats.rs | 21 +++++++++++++++++++-- crates/uffs-mcp/src/handler/mod.rs | 19 ++++++++++++++++++- crates/uffs-mcp/src/roots.rs | 9 ++++++++- 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/crates/uffs-daemon/src/index/stats.rs b/crates/uffs-daemon/src/index/stats.rs index f270ee274..c5d204542 100644 --- a/crates/uffs-daemon/src/index/stats.rs +++ b/crates/uffs-daemon/src/index/stats.rs @@ -74,8 +74,25 @@ impl IndexManager { /// Get current daemon status. /// /// Includes `has_drives` and `total_records` for completeness. + /// + /// # Concurrency + /// + /// Snapshots the `DaemonStatus` upfront via `.read().await.clone()` + /// rather than holding the read guard across the inner awaits + /// below. Without the snapshot, the guard would be held across + /// [`Self::has_drives`], [`Self::total_records`], and + /// [`Self::snapshot`] — three independent `self.index.read().await` + /// acquisitions — blocking any concurrent + /// [`Self::set_ready`] / [`Self::set_loading_progress`] / + /// [`crate::index::refresh::IndexManager::refresh`] writer on + /// `self.status` for the duration of the status RPC (which on a + /// many-drive box with a slow snapshot path can be tens of + /// milliseconds). `DaemonStatus` is a small `Clone` enum + /// (`Loading { usize, usize }` or `Refreshing { Vec }` + /// in the worst case), so the snapshot is microseconds and never + /// crosses an inner `.await`. See Phase 10b audit findings. pub(crate) async fn status(&self, connections: usize) -> StatusResponse { - let status = self.status.read().await; + let status_snapshot = self.status.read().await.clone(); let loaded = self.has_drives().await; let records = self.total_records().await; tracing::trace!( @@ -115,7 +132,7 @@ impl IndexManager { }); StatusResponse { - status: status.clone(), + status: status_snapshot, uptime_secs: self.start_time.elapsed().as_secs(), connections, pid: std::process::id(), diff --git a/crates/uffs-mcp/src/handler/mod.rs b/crates/uffs-mcp/src/handler/mod.rs index 51ac87d57..46878f0d6 100644 --- a/crates/uffs-mcp/src/handler/mod.rs +++ b/crates/uffs-mcp/src/handler/mod.rs @@ -239,6 +239,23 @@ impl UffsMcpServer { /// /// Separated from `call_tool` so the retry-on-reconnect logic can /// call it a second time with the same arguments. + /// + /// # Concurrency + /// + /// Snapshots the [`RootsState`] upfront via `.read().await.clone()` + /// rather than holding the read guard across the daemon-RPC tool + /// dispatch awaits below. Without the snapshot, the guard would + /// be held for the duration of every tool call (potentially + /// seconds for a large `uffs_search` / `uffs_aggregate`), blocking + /// any concurrent [`ServerHandler::on_roots_list_changed`] writer + /// — which acquires `self.roots.write().await` to refresh the + /// state — for the duration of every in-flight tool call across + /// the whole MCP session. Cloning `RootsState` is cheap (small + /// `Vec` of typically `< 10` short-string entries; see + /// the type's rustdoc). See Phase 10b audit findings. + /// + /// [`RootsState`]: crate::roots::RootsState + /// [`ServerHandler::on_roots_list_changed`]: rmcp::handler::server::ServerHandler::on_roots_list_changed async fn dispatch_tool( &self, tool_name: &str, @@ -253,7 +270,7 @@ impl UffsMcpServer { Self::readiness_gate(&mut client).await?; } - let roots_state = self.roots.read().await; + let roots_state = self.roots.read().await.clone(); match tool_name { "uffs_search" => { diff --git a/crates/uffs-mcp/src/roots.rs b/crates/uffs-mcp/src/roots.rs index 19814ee14..ba85d719f 100644 --- a/crates/uffs-mcp/src/roots.rs +++ b/crates/uffs-mcp/src/roots.rs @@ -33,7 +33,14 @@ pub struct RootScope { } /// Shared roots state held by the MCP server. -#[derive(Debug, Default)] +/// +/// `Clone` is derived so the MCP handler can snapshot the current +/// state under a brief read guard and then release the lock before +/// dispatching tools that await on the daemon RPC. Cloning is cheap: +/// the `roots` and `warnings` `Vec`s are typically `< 10` short +/// `String`s each (one per workspace root). See Phase 10b audit +/// findings and `crate::handler::UffsMcpServer::call_tool_with_args`. +#[derive(Debug, Default, Clone)] pub struct RootsState { /// Whether the client has advertised roots at least once. pub advertised: bool,