From 8728919440f9f993278f08de300594ea35f991ed Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Thu, 30 Apr 2026 14:53:13 -0700 Subject: [PATCH 01/39] feat(tonic-xds): add OutlierDetector sweep engine (gRFC A50) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the core gRFC A50 outlier-detection algorithm: per-endpoint success/failure counters, the success-rate and failure-percentage ejection algorithms, the ejection-multiplier state machine, and a periodic sweep task that emits ejection/un-ejection decisions on a channel. `run_sweep` is pure (returns a Vec); the sweep loop spawned by `OutlierDetector::spawn` owns the channel sender and forwards decisions, so dropping the returned `AbortOnDrop` ends the loop and closes the receiver. Tests drive `run_sweep` directly without the channel or tokio time mechanics. Algorithm coverage matches the gRFC: - Success-rate ejection with configurable `stdev_factor`, `enforcing_success_rate`, `minimum_hosts`, `request_volume`. - Failure-percentage ejection with `threshold`, `enforcing_failure_percentage`, `minimum_hosts`, `request_volume`. - Ejection multiplier increments on each ejection, decays on healthy intervals; ejection duration is `base * multiplier` capped at `max(base, max_ejection_time)`. - `max_ejection_percent` caps total concurrent ejections. Probability rolls go through an injectable `Rng` trait (defaulting to `fastrand`) so tests can pin enforcement decisions. Standalone in this PR — no integration with the load balancer yet. That lands in a follow-up alongside the per-endpoint outcome interception layer. 
Refs: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md --- tonic-xds/src/client/loadbalance/mod.rs | 1 + .../client/loadbalance/outlier_detection.rs | 856 ++++++++++++++++++ 2 files changed, 857 insertions(+) create mode 100644 tonic-xds/src/client/loadbalance/outlier_detection.rs diff --git a/tonic-xds/src/client/loadbalance/mod.rs b/tonic-xds/src/client/loadbalance/mod.rs index 66ccb1772..1c4ffa395 100644 --- a/tonic-xds/src/client/loadbalance/mod.rs +++ b/tonic-xds/src/client/loadbalance/mod.rs @@ -3,4 +3,5 @@ pub(crate) mod channel_state; pub(crate) mod errors; pub(crate) mod keyed_futures; pub(crate) mod loadbalancer; +pub(crate) mod outlier_detection; pub(crate) mod pickers; diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs new file mode 100644 index 000000000..93e63ed46 --- /dev/null +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -0,0 +1,856 @@ +//! gRFC A50 outlier-detection sweep engine. +//! +//! Owns per-endpoint counters and an ejection state machine. Periodically +//! reads the counters, runs the success-rate and failure-percentage +//! ejection algorithms, and emits [`EjectionDecision`]s. Knows nothing +//! about the data path: callers feed it RPC outcomes via the lock-free +//! [`EndpointCounters`] handle returned by [`OutlierDetector::add_endpoint`], +//! and consume decisions from a channel returned by [`OutlierDetector::spawn`]. +//! +//! 
[gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md + +use std::collections::HashMap; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Instant; + +use tokio::sync::mpsc; + +use crate::client::endpoint::EndpointAddress; +use crate::common::async_util::AbortOnDrop; +use crate::xds::resource::outlier_detection::{ + FailurePercentageConfig, OutlierDetectionConfig, SuccessRateConfig, +}; + +/// Lock-free per-endpoint success/failure counter handle. +/// +/// Cloned freely. Callers (typically a request-outcome interceptor) +/// invoke [`record_success`] / [`record_failure`] from the data path. +/// The detector reads and resets the counters during each sweep. +/// +/// [`record_success`]: EndpointCounters::record_success +/// [`record_failure`]: EndpointCounters::record_failure +#[derive(Debug, Default)] +pub(crate) struct EndpointCounters { + success: AtomicU64, + failure: AtomicU64, +} + +impl EndpointCounters { + pub(crate) fn record_success(&self) { + self.success.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn record_failure(&self) { + self.failure.fetch_add(1, Ordering::Relaxed); + } + + /// Atomically read and zero both counters. Returns `(success, failure)`. + fn snapshot_and_reset(&self) -> (u64, u64) { + let s = self.success.swap(0, Ordering::Relaxed); + let f = self.failure.swap(0, Ordering::Relaxed); + (s, f) + } +} + +/// A decision emitted by an [`OutlierDetector`] sweep. +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) enum EjectionDecision { + /// Eject this endpoint from the load-balancing pool. The caller + /// should keep its underlying connection alive (A50 requires + /// preserving connections across ejection). + Eject(EndpointAddress), + /// Restore a previously-ejected endpoint to the pool. + Uneject(EndpointAddress), +} + +/// Probability source for `enforcing_*` rolls. Abstracted so tests can +/// inject deterministic outcomes. 
+pub(crate) trait Rng: Send + Sync + 'static { + /// Return a uniform random `u32` in `0..100`. + fn pct_roll(&self) -> u32; +} + +/// Default RNG backed by `fastrand` (already a workspace dep). +struct FastRandRng; + +impl Rng for FastRandRng { + fn pct_roll(&self) -> u32 { + fastrand::u32(0..100) + } +} + +/// Per-endpoint state held inside the detector. +struct EndpointState { + counters: Arc, + /// Number of times this endpoint has been ejected. Grows on each + /// re-ejection and decays on each healthy interval. + ejection_multiplier: u32, + /// `Some(at)` when currently ejected; `None` otherwise. + ejected_at: Option, +} + +impl EndpointState { + fn new() -> Self { + Self { + counters: Arc::new(EndpointCounters::default()), + ejection_multiplier: 0, + ejected_at: None, + } + } +} + +/// gRFC A50 outlier detector. +/// +/// `run_sweep` is pure — it returns a list of [`EjectionDecision`]s +/// rather than sending them. The sweep loop spawned by [`spawn`] owns +/// the channel sender and forwards decisions to the receiver, so +/// dropping the [`AbortOnDrop`] handle ends the loop and closes the +/// receiver. `OutlierDetector` itself holds no I/O resources, which +/// makes algorithm-level tests trivial to write. +/// +/// [`spawn`]: OutlierDetector::spawn +pub(crate) struct OutlierDetector { + config: OutlierDetectionConfig, + state: Mutex>, + rng: Box, +} + +impl OutlierDetector { + /// Build the detector and spawn its sweep task on the current Tokio + /// runtime. The sweep runs every `config.interval` until the returned + /// [`AbortOnDrop`] is dropped. + pub(crate) fn spawn( + config: OutlierDetectionConfig, + ) -> ( + Arc, + mpsc::UnboundedReceiver, + AbortOnDrop, + ) { + Self::spawn_with_rng(config, Box::new(FastRandRng)) + } + + /// Variant of [`spawn`] that accepts an injected [`Rng`]. 
+ pub(crate) fn spawn_with_rng( + config: OutlierDetectionConfig, + rng: Box, + ) -> ( + Arc, + mpsc::UnboundedReceiver, + AbortOnDrop, + ) { + let (tx, rx) = mpsc::unbounded_channel(); + let detector = Arc::new(Self { + config, + state: Mutex::new(HashMap::new()), + rng, + }); + let task = tokio::spawn(sweep_loop(detector.clone(), tx)); + (detector, rx, AbortOnDrop(task)) + } + + /// Register an endpoint and return its lock-free counter handle. + /// The caller wires this handle into the data-path RPC interceptor so + /// that completed calls increment success/failure atomics. + /// + /// Adding an already-registered address is a no-op and returns the + /// existing handle (so callers can re-add idempotently). + pub(crate) fn add_endpoint(&self, addr: EndpointAddress) -> Arc { + let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); + state + .entry(addr) + .or_insert_with(EndpointState::new) + .counters + .clone() + } + + /// Forget a previously-registered endpoint. Drops its counters and + /// any ejection state. If the endpoint was ejected, no `Uneject` + /// decision is emitted — the caller is expected to handle the removal + /// directly (e.g., by dropping its slot in the load balancer). + pub(crate) fn remove_endpoint(&self, addr: &EndpointAddress) { + let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); + state.remove(addr); + } + + /// Run a single sweep at logical time `now` and return the resulting + /// ejection/un-ejection decisions. Pure — does no I/O. The sweep loop + /// invokes this on each interval tick and forwards the decisions on + /// the channel; tests call it directly. + pub(crate) fn run_sweep(&self, now: Instant) -> Vec { + let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); + + // Snapshot per-endpoint stats and update ejection-time multiplier + // bookkeeping. 
A50: for each endpoint that received traffic and is + // not currently ejected, decrement the multiplier toward zero. + let mut snapshots: Vec<(EndpointAddress, u64, u64)> = Vec::with_capacity(state.len()); + for (addr, ep) in state.iter_mut() { + let (success, failure) = ep.counters.snapshot_and_reset(); + let total = success + failure; + if ep.ejected_at.is_none() && total > 0 { + ep.ejection_multiplier = ep.ejection_multiplier.saturating_sub(1); + } + snapshots.push((addr.clone(), success, failure)); + } + + // Un-eject endpoints whose backoff has elapsed. A50: + // actual_duration = min(base * multiplier, max(base, max_ejection_time)) + let cap = self + .config + .base_ejection_time + .max(self.config.max_ejection_time); + let mut to_uneject: Vec = Vec::new(); + for (addr, ep) in state.iter_mut() { + if let Some(at) = ep.ejected_at + && let Some(scaled) = self + .config + .base_ejection_time + .checked_mul(ep.ejection_multiplier) + && now.duration_since(at) >= scaled.min(cap) + { + ep.ejected_at = None; + to_uneject.push(addr.clone()); + } + } + + // Build candidate list (non-ejected endpoints) once for both + // algorithms. A50 wants both algorithms to share the snapshot. + // Note: we only build the rate slice; per-algorithm filters + // (request_volume, minimum_hosts) are applied below. + let candidates: Vec = snapshots + .iter() + .filter_map(|(addr, success, failure)| { + let total = success + failure; + let ep = state.get(addr)?; + if ep.ejected_at.is_some() { + return None; + } + Some(Candidate { + addr: addr.clone(), + success: *success, + failure: *failure, + total, + }) + }) + .collect(); + + // Compute the cap on currently-ejected endpoints. A50: + // if ejected_count >= max_ejection_percent of total, stop ejecting. + // We compute the cap once and decrement the available budget as + // each algorithm ejects. 
+ let total_endpoints = state.len(); + let max_ejections = (total_endpoints as u64 + * u64::from(self.config.max_ejection_percent.get()) + / 100) as usize; + let already_ejected = state.values().filter(|ep| ep.ejected_at.is_some()).count(); + let mut budget = max_ejections.saturating_sub(already_ejected); + + let mut to_eject: Vec = Vec::new(); + + if let Some(sr) = self.config.success_rate.as_ref() { + self.run_success_rate(sr, &candidates, &mut budget, &mut to_eject); + } + if let Some(fp) = self.config.failure_percentage.as_ref() { + self.run_failure_percentage(fp, &candidates, &mut budget, &mut to_eject); + } + + for addr in &to_eject { + if let Some(ep) = state.get_mut(addr) { + ep.ejected_at = Some(now); + ep.ejection_multiplier = ep.ejection_multiplier.saturating_add(1); + } + } + + drop(state); + + let mut decisions = Vec::with_capacity(to_uneject.len() + to_eject.len()); + for addr in to_uneject { + decisions.push(EjectionDecision::Uneject(addr)); + } + for addr in to_eject { + decisions.push(EjectionDecision::Eject(addr)); + } + decisions + } + + /// A50 success-rate algorithm. + fn run_success_rate( + &self, + cfg: &SuccessRateConfig, + all: &[Candidate], + budget: &mut usize, + out: &mut Vec, + ) { + // Filter to candidates with enough traffic. + let qualifying: Vec<&Candidate> = all + .iter() + .filter(|c| c.total >= u64::from(cfg.request_volume)) + .collect(); + if qualifying.len() < cfg.minimum_hosts as usize { + return; + } + + // success_rate = success / total (in [0.0, 1.0]). 
+ let rates: Vec = qualifying + .iter() + .map(|c| c.success as f64 / c.total as f64) + .collect(); + let n = rates.len() as f64; + let mean = rates.iter().sum::() / n; + let variance = rates.iter().map(|r| (r - mean).powi(2)).sum::() / n; + let stdev = variance.sqrt(); + + // threshold = mean - stdev * (stdev_factor / 1000) + let factor = f64::from(cfg.stdev_factor) / 1000.0; + let threshold = mean - stdev * factor; + + for (c, rate) in qualifying.iter().zip(rates.iter()) { + if *budget == 0 { + break; + } + if *rate < threshold && self.roll(cfg.enforcing_success_rate.get()) { + out.push(c.addr.clone()); + *budget -= 1; + } + } + } + + /// A50 failure-percentage algorithm. + fn run_failure_percentage( + &self, + cfg: &FailurePercentageConfig, + all: &[Candidate], + budget: &mut usize, + out: &mut Vec, + ) { + let qualifying: Vec<&Candidate> = all + .iter() + .filter(|c| c.total >= u64::from(cfg.request_volume)) + .filter(|c| !out.contains(&c.addr)) // skip endpoints already ejected this sweep + .collect(); + if qualifying.len() < cfg.minimum_hosts as usize { + return; + } + + let threshold = u64::from(cfg.threshold.get()); + for c in qualifying { + if *budget == 0 { + break; + } + // failure_pct = 100 * failure / total + let failure_pct = 100 * c.failure / c.total; + if failure_pct >= threshold && self.roll(cfg.enforcing_failure_percentage.get()) { + out.push(c.addr.clone()); + *budget -= 1; + } + } + } + + /// Return true with probability `pct / 100` (clamped at 100 ⇒ always). + fn roll(&self, pct: u8) -> bool { + if pct >= 100 { + return true; + } + if pct == 0 { + return false; + } + self.rng.pct_roll() < u32::from(pct) + } +} + +/// Cached per-endpoint snapshot used during a sweep. +struct Candidate { + addr: EndpointAddress, + success: u64, + failure: u64, + total: u64, +} + +/// Background task: runs `detector.run_sweep` on each interval tick and +/// forwards each decision on the channel. 
The task ends (and `tx` is +/// dropped, closing the receiver) when [`AbortOnDrop`] is dropped or +/// when the receiver itself is dropped. +async fn sweep_loop(detector: Arc, tx: mpsc::UnboundedSender) { + let mut ticker = tokio::time::interval(detector.config.interval); + // Skip missed ticks rather than burst-catching up — the goal is + // periodic observation, not making up for paused time. + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + // The first tick fires immediately; consume it so the first real + // sweep is `interval` after spawn (matches A50 semantics). + ticker.tick().await; + + loop { + ticker.tick().await; + for decision in detector.run_sweep(Instant::now()) { + if tx.send(decision).is_err() { + // Receiver gone — nobody is listening. + return; + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::xds::resource::outlier_detection::Percentage; + use std::sync::atomic::AtomicU32; + use std::time::Duration; + + // ----- Fixtures ----- + + fn addr(port: u16) -> EndpointAddress { + EndpointAddress::new("10.0.0.1", port) + } + + fn pct(v: u32) -> Percentage { + Percentage::new(v).unwrap() + } + + /// Base config with both algorithms disabled; tests opt in. + fn base_config() -> OutlierDetectionConfig { + OutlierDetectionConfig { + interval: Duration::from_secs(1), + base_ejection_time: Duration::from_secs(30), + max_ejection_time: Duration::from_secs(300), + max_ejection_percent: pct(100), + success_rate: None, + failure_percentage: None, + } + } + + /// Deterministic RNG: `pct_roll()` returns a fixed value, configurable. + struct FixedRng(AtomicU32); + + impl FixedRng { + fn new(value: u32) -> Self { + Self(AtomicU32::new(value)) + } + fn boxed(value: u32) -> Box { + Box::new(Self::new(value)) + } + } + + impl Rng for FixedRng { + fn pct_roll(&self) -> u32 { + self.0.load(Ordering::Relaxed) + } + } + + /// Build a detector with no sweep loop running. 
Tests drive + /// `run_sweep` directly and inspect the returned decisions. + fn detector_no_loop(config: OutlierDetectionConfig, rng: Box) -> Arc { + Arc::new(OutlierDetector { + config, + state: Mutex::new(HashMap::new()), + rng, + }) + } + + /// Sort a decision list deterministically so equality checks can rely + /// on a canonical order without coupling to `HashMap` iteration order. + fn sort(mut ds: Vec) -> Vec { + ds.sort_by(|a, b| format!("{a:?}").cmp(&format!("{b:?}"))); + ds + } + + // ----- EndpointCounters ----- + + #[test] + fn counters_record_and_reset() { + let c = EndpointCounters::default(); + c.record_success(); + c.record_success(); + c.record_failure(); + assert_eq!(c.snapshot_and_reset(), (2, 1)); + assert_eq!(c.snapshot_and_reset(), (0, 0)); + } + + // ----- add_endpoint / remove_endpoint ----- + + #[test] + fn add_endpoint_returns_shared_counter() { + let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let h1 = detector.add_endpoint(addr(8080)); + let h2 = detector.add_endpoint(addr(8080)); + assert!( + Arc::ptr_eq(&h1, &h2), + "second add should return same handle" + ); + h1.record_success(); + assert_eq!(h2.snapshot_and_reset(), (1, 0)); + } + + #[test] + fn remove_endpoint_drops_state() { + let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + detector.add_endpoint(addr(8080)); + detector.remove_endpoint(&addr(8080)); + assert!(detector.state.lock().unwrap().is_empty()); + } + + // ----- Failure-percentage algorithm ----- + + fn fp_config( + threshold: u32, + request_volume: u32, + minimum_hosts: u32, + ) -> OutlierDetectionConfig { + let mut c = base_config(); + c.failure_percentage = Some(FailurePercentageConfig { + threshold: pct(threshold), + enforcing_failure_percentage: pct(100), + minimum_hosts, + request_volume, + }); + c + } + + #[test] + fn failure_percentage_ejects_above_threshold() { + let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(99)); + // 4 healthy endpoints + 1 bad 
one. + for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_success(); + } + } + let bad = detector.add_endpoint(addr(8084)); + for _ in 0..90 { + bad.record_failure(); + } + for _ in 0..10 { + bad.record_success(); + } + + let decisions = detector.run_sweep(Instant::now()); + assert_eq!(decisions, vec![EjectionDecision::Eject(addr(8084))]); + } + + #[test] + fn failure_percentage_skips_below_threshold() { + let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(99)); + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + // 30% failure → below threshold of 50%. + for _ in 0..70 { + h.record_success(); + } + for _ in 0..30 { + h.record_failure(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + + #[test] + fn minimum_hosts_gates_failure_percentage() { + let detector = detector_no_loop(fp_config(50, 10, 5), FixedRng::boxed(99)); + // Only 2 hosts have request_volume ≥ 10; minimum_hosts is 5 ⇒ skip. + for port in 8080..=8081 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_failure(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + + #[test] + fn request_volume_filters_low_traffic_endpoints() { + let detector = detector_no_loop(fp_config(50, 100, 3), FixedRng::boxed(99)); + // Bad endpoint, but only 5 requests — below request_volume=100. 
+ let bad = detector.add_endpoint(addr(8080)); + for _ in 0..5 { + bad.record_failure(); + } + for port in 8081..=8084 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..200 { + h.record_success(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + + #[test] + fn enforcement_zero_percent_never_ejects() { + let mut config = fp_config(50, 10, 3); + config + .failure_percentage + .as_mut() + .unwrap() + .enforcing_failure_percentage = pct(0); + // Roll = 0 wouldn't trigger anyway since `roll(0)` short-circuits; + // pin the RNG to 0 just to be explicit. + let detector = detector_no_loop(config, FixedRng::boxed(0)); + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_failure(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + + // ----- Success-rate algorithm ----- + + fn sr_config( + stdev_factor: u32, + request_volume: u32, + minimum_hosts: u32, + ) -> OutlierDetectionConfig { + let mut c = base_config(); + c.success_rate = Some(SuccessRateConfig { + stdev_factor, + enforcing_success_rate: pct(100), + minimum_hosts, + request_volume, + }); + c + } + + #[test] + fn success_rate_ejects_outlier_below_threshold() { + let detector = detector_no_loop(sr_config(1900, 10, 5), FixedRng::boxed(99)); + // 4 endpoints at 99% success, 1 at 50% — outlier. 
+ for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..99 { + h.record_success(); + } + h.record_failure(); + } + let bad = detector.add_endpoint(addr(8084)); + for _ in 0..50 { + bad.record_success(); + } + for _ in 0..50 { + bad.record_failure(); + } + assert_eq!( + detector.run_sweep(Instant::now()), + vec![EjectionDecision::Eject(addr(8084))], + ); + } + + #[test] + fn success_rate_no_ejection_when_all_uniform() { + let detector = detector_no_loop(sr_config(1900, 10, 5), FixedRng::boxed(99)); + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..95 { + h.record_success(); + } + for _ in 0..5 { + h.record_failure(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + + // ----- Ejection multiplier / un-ejection ----- + + #[test] + fn unejects_after_base_time() { + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(60); + let detector = detector_no_loop(config, FixedRng::boxed(99)); + + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + if port == 8084 { + for _ in 0..100 { + h.record_failure(); + } + } else { + for _ in 0..100 { + h.record_success(); + } + } + } + + let t0 = Instant::now(); + assert_eq!( + detector.run_sweep(t0), + vec![EjectionDecision::Eject(addr(8084))], + ); + + // Still ejected just before base_ejection_time elapses. + assert!(detector.run_sweep(t0 + Duration::from_secs(9)).is_empty()); + + // Un-eject after `base * multiplier(=1)` = 10s. 
+ assert_eq!( + detector.run_sweep(t0 + Duration::from_secs(10)), + vec![EjectionDecision::Uneject(addr(8084))], + ); + } + + #[test] + fn re_ejection_doubles_duration() { + // The multiplier doubles only when un-ejection and re-ejection + // happen in the *same* sweep — at that point the multiplier- + // decrement step has skipped the (still-ejected-at-start) + // endpoint, so re-ejection increments it from 1 to 2. + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(60); + let detector = detector_no_loop(config, FixedRng::boxed(99)); + + let bad = addr(8084); + let bad_h = detector.add_endpoint(bad.clone()); + for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_success(); + } + } + for _ in 0..100 { + bad_h.record_failure(); + } + + // Sweep 1: eject. Multiplier 0 → 1. + let t0 = Instant::now(); + assert_eq!( + detector.run_sweep(t0), + vec![EjectionDecision::Eject(bad.clone())], + ); + + // Re-record stats so sweep 2's snapshot has volume to evaluate. + for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_success(); + } + } + for _ in 0..100 { + bad_h.record_failure(); + } + + // Sweep 2 at t0+10: same-sweep un-eject + re-eject. + // Multiplier stays 1 through un-eject, then 1 → 2 on re-eject. + assert_eq!( + detector.run_sweep(t0 + Duration::from_secs(10)), + vec![ + EjectionDecision::Uneject(bad.clone()), + EjectionDecision::Eject(bad.clone()), + ], + ); + + // Re-ejection started at t0+10 with multiplier=2 → duration 20s. + // Still ejected 19s later (29s after t0). + assert!(detector.run_sweep(t0 + Duration::from_secs(29)).is_empty()); + + // Un-ejects at the 20s mark (30s after t0). 
+ assert_eq!( + detector.run_sweep(t0 + Duration::from_secs(30)), + vec![EjectionDecision::Uneject(bad)], + ); + } + + #[test] + fn ejection_capped_by_max_ejection_time() { + // base=10s, max=15s, multiplier=10 → cap at 15s rather than 100s. + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(15); + let detector = detector_no_loop(config, FixedRng::boxed(99)); + + for port in 8080..=8084 { + detector.add_endpoint(addr(port)); + } + let t0 = Instant::now(); + // Force multiplier=10 directly. + { + let mut state = detector.state.lock().unwrap(); + let ep = state.get_mut(&addr(8084)).unwrap(); + ep.ejection_multiplier = 10; + ep.ejected_at = Some(t0); + } + // After base*multiplier (= 100s) the cap (= 15s) has long passed, + // so a sweep at 16s should un-eject. + let decisions = detector.run_sweep(t0 + Duration::from_secs(16)); + assert_eq!(decisions, vec![EjectionDecision::Uneject(addr(8084))]); + } + + #[test] + fn max_ejection_percent_caps_concurrent_ejections() { + // 5 hosts, all bad, but max_ejection_percent=20 ⇒ at most 1 ejected. + let mut config = fp_config(50, 10, 3); + config.max_ejection_percent = pct(20); + let detector = detector_no_loop(config, FixedRng::boxed(99)); + + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_failure(); + } + } + let decisions = sort(detector.run_sweep(Instant::now())); + let ejects = decisions + .iter() + .filter(|d| matches!(d, EjectionDecision::Eject(_))) + .count(); + assert_eq!(ejects, 1, "max_ejection_percent=20% of 5 hosts ⇒ 1"); + } + + #[test] + fn multiplier_decrements_on_healthy_interval() { + let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let h = detector.add_endpoint(addr(8080)); + // Force multiplier to 3 without ejecting. 
+ { + let mut state = detector.state.lock().unwrap(); + state.get_mut(&addr(8080)).unwrap().ejection_multiplier = 3; + } + // Healthy interval (some traffic, no ejection). + h.record_success(); + detector.run_sweep(Instant::now()); + let state = detector.state.lock().unwrap(); + assert_eq!(state.get(&addr(8080)).unwrap().ejection_multiplier, 2); + } + + // ----- Sweep loop ----- + + #[tokio::test(start_paused = true)] + async fn sweep_loop_emits_decisions_on_tick() { + let mut config = fp_config(50, 10, 3); + config.interval = Duration::from_millis(100); + let (detector, mut rx, _abort) = + OutlierDetector::spawn_with_rng(config, FixedRng::boxed(99)); + + for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_success(); + } + } + let bad = detector.add_endpoint(addr(8084)); + for _ in 0..100 { + bad.record_failure(); + } + + // Advance just past the first sweep tick. + tokio::time::sleep(Duration::from_millis(150)).await; + + let decision = rx.recv().await.expect("sweep should emit a decision"); + assert_eq!(decision, EjectionDecision::Eject(addr(8084))); + } + + #[tokio::test(start_paused = true)] + async fn dropping_abort_stops_sweep_loop() { + let mut config = base_config(); + config.interval = Duration::from_millis(50); + let (_detector, mut rx, abort) = OutlierDetector::spawn(config); + + // Drop the AbortOnDrop; the loop must terminate. + drop(abort); + tokio::time::sleep(Duration::from_millis(200)).await; + + // Sender should be dropped along with the task; recv returns None. 
+ assert!(rx.recv().await.is_none()); + } +} From b03fb6982d78ec16bfbfb07e634b6c6b510e90db Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Thu, 30 Apr 2026 14:58:19 -0700 Subject: [PATCH 02/39] docs(tonic-xds): clarify outlier-detection config docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address two follow-up review comments from #2604 (the merged config PR) by folding the doc updates into this PR: - Module docstring: describe the actual integration plan (an mpsc channel of EjectionDecisions polled by LoadBalancer, leveraging EjectedChannel) instead of the original "filter on the Discover stream" wording. Add intra-doc links to the relevant types. - enforcing_success_rate / enforcing_failure_percentage: clarify that each is the *enforcement probability* — distinct from the per-algorithm threshold (stdev_factor for success-rate, threshold for failure-percentage). Note that 0 disables enforcement while still computing statistics. Also fix an unresolved intra-doc link in the algorithm module. --- .../client/loadbalance/outlier_detection.rs | 2 +- .../src/xds/resource/outlier_detection.rs | 21 +++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 93e63ed46..5e53883c0 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -131,7 +131,7 @@ impl OutlierDetector { Self::spawn_with_rng(config, Box::new(FastRandRng)) } - /// Variant of [`spawn`] that accepts an injected [`Rng`]. + /// Variant of [`Self::spawn`] that accepts an injected [`Rng`]. 
pub(crate) fn spawn_with_rng( config: OutlierDetectionConfig, rng: Box, diff --git a/tonic-xds/src/xds/resource/outlier_detection.rs b/tonic-xds/src/xds/resource/outlier_detection.rs index a31fd6c60..159ff7735 100644 --- a/tonic-xds/src/xds/resource/outlier_detection.rs +++ b/tonic-xds/src/xds/resource/outlier_detection.rs @@ -4,12 +4,17 @@ //! algorithm. The two sub-configs gate which ejection algorithms run. //! //! Note: A50 specifies outlier detection as a load-balancing policy -//! wrapping a `child_policy`. `tonic-xds` currently runs P2C as its only -//! load balancer and integrates outlier detection as a filter on the -//! `Discover` stream feeding it, so there is no `child_policy` field -//! here yet. It will be added when more balancers are supported. +//! wrapping a `child_policy`. `tonic-xds` currently runs P2C as its +//! only load balancer, so there is no `child_policy` field here yet — +//! it will be added when more balancers are supported. Integration +//! with the data path is via an mpsc channel of ejection decisions +//! polled by the [`LoadBalancer`] tower service, which marks the +//! corresponding [`ReadyChannel`] as ejected via [`EjectedChannel`]. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md +//! [`LoadBalancer`]: crate::client::loadbalance::loadbalancer::LoadBalancer +//! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel +//! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel use std::time::Duration; @@ -68,7 +73,9 @@ pub(crate) struct SuccessRateConfig { /// An endpoint is a candidate for ejection when its success rate falls /// below `mean - stdev * (stdev_factor / 1000.0)`. pub stdev_factor: u32, - /// Probability that a candidate is actually ejected. + /// Probability that a flagged candidate is actually ejected — *not* + /// the success-rate threshold (which is derived from `stdev_factor`). 
+ /// Set to 0 to disable enforcement while still computing statistics. pub enforcing_success_rate: Percentage, /// Minimum number of candidate endpoints required to run the algorithm. pub minimum_hosts: u32, @@ -83,7 +90,9 @@ pub(crate) struct FailurePercentageConfig { /// Failure rate at or above which an endpoint is a candidate for /// ejection. pub threshold: Percentage, - /// Probability that a candidate is actually ejected. + /// Probability that a flagged candidate is actually ejected — *not* + /// the failure-rate threshold (that is `threshold` above). Set to 0 + /// to disable enforcement while still computing statistics. pub enforcing_failure_percentage: Percentage, /// Minimum number of candidate endpoints required to run the algorithm. pub minimum_hosts: u32, From 1da4063e55c4ab397addd483299dccd4f0e97880 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Thu, 30 Apr 2026 15:11:19 -0700 Subject: [PATCH 03/39] fix(tonic-xds): align outlier-detection algorithm with gRFC A50 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three spec-compliance fixes to `run_sweep` and the failure-percentage algorithm: 1. Reorder the sweep to match A50 step order: snapshot counters → run success-rate algorithm → run failure-percentage algorithm → step-5 housekeeping (decrement non-ejected multipliers, un-eject elapsed ejections). The previous order (un-eject before algorithms) caused spurious `Uneject` decisions whenever the same sweep also re-ejected the address. Per spec, re-ejection refreshes `ejected_at` to `now` before the un-eject check runs, so no transient un-eject is emitted. 2. Drop the `total > 0` traffic gate from the multiplier-decrement step. A50 says a non-ejected address with multiplier > 0 has its multiplier decremented every sweep, regardless of whether it received traffic that interval. 3. Failure-percentage now uses strict `>` against the threshold (was `>=`). 
Per A50: "If the address's failure percentage is greater than `failure_percentage_ejection.threshold`..." — an address sitting exactly at the threshold is not ejected. Also: drop the explicit "skip ejected hosts from candidate list" pre-filter. Per spec the algorithms iterate every address; ejected hosts naturally fail the `request_volume` gate since they receive no traffic in production. Behavior on real workloads is unchanged. Test changes: - `re_ejection_doubles_duration` now asserts a single `Eject` decision (no transient `Uneject`) under the corrected sweep order. - New `failure_percentage_at_threshold_does_not_eject` covers the strict-`>` boundary. - New `multiplier_decrements_even_without_traffic` covers the no-traffic-gate fix. --- .../client/loadbalance/outlier_detection.rs | 167 +++++++++++------- 1 file changed, 101 insertions(+), 66 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 5e53883c0..04ba1f734 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -178,67 +178,34 @@ impl OutlierDetector { /// ejection/un-ejection decisions. Pure — does no I/O. The sweep loop /// invokes this on each interval tick and forwards the decisions on /// the channel; tests call it directly. + /// + /// The order of operations follows gRFC A50: + /// 1. Record the timestamp. + /// 2. Swap each address's call-counter buckets. + /// 3. Run the success-rate algorithm if configured. + /// 4. Run the failure-percentage algorithm if configured. + /// 5. For each address: decrement the multiplier of non-ejected + /// addresses with multiplier > 0, and un-eject ejected addresses + /// whose backoff has elapsed. 
pub(crate) fn run_sweep(&self, now: Instant) -> Vec { let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); - // Snapshot per-endpoint stats and update ejection-time multiplier - // bookkeeping. A50: for each endpoint that received traffic and is - // not currently ejected, decrement the multiplier toward zero. - let mut snapshots: Vec<(EndpointAddress, u64, u64)> = Vec::with_capacity(state.len()); + // Step 2: snapshot every endpoint's counters. + let mut snapshots: Vec = Vec::with_capacity(state.len()); for (addr, ep) in state.iter_mut() { let (success, failure) = ep.counters.snapshot_and_reset(); - let total = success + failure; - if ep.ejected_at.is_none() && total > 0 { - ep.ejection_multiplier = ep.ejection_multiplier.saturating_sub(1); - } - snapshots.push((addr.clone(), success, failure)); - } - - // Un-eject endpoints whose backoff has elapsed. A50: - // actual_duration = min(base * multiplier, max(base, max_ejection_time)) - let cap = self - .config - .base_ejection_time - .max(self.config.max_ejection_time); - let mut to_uneject: Vec = Vec::new(); - for (addr, ep) in state.iter_mut() { - if let Some(at) = ep.ejected_at - && let Some(scaled) = self - .config - .base_ejection_time - .checked_mul(ep.ejection_multiplier) - && now.duration_since(at) >= scaled.min(cap) - { - ep.ejected_at = None; - to_uneject.push(addr.clone()); - } - } - - // Build candidate list (non-ejected endpoints) once for both - // algorithms. A50 wants both algorithms to share the snapshot. - // Note: we only build the rate slice; per-algorithm filters - // (request_volume, minimum_hosts) are applied below. 
- let candidates: Vec = snapshots - .iter() - .filter_map(|(addr, success, failure)| { - let total = success + failure; - let ep = state.get(addr)?; - if ep.ejected_at.is_some() { - return None; - } - Some(Candidate { - addr: addr.clone(), - success: *success, - failure: *failure, - total, - }) - }) - .collect(); - - // Compute the cap on currently-ejected endpoints. A50: - // if ejected_count >= max_ejection_percent of total, stop ejecting. - // We compute the cap once and decrement the available budget as - // each algorithm ejects. + snapshots.push(Candidate { + addr: addr.clone(), + success, + failure, + total: success + failure, + }); + } + + // Compute a cap on the number of new ejections this sweep so we + // don't exceed `max_ejection_percent` of the total. Per A50, the + // check is performed before each candidate ejection; we model that + // as a budget that algorithms decrement. let total_endpoints = state.len(); let max_ejections = (total_endpoints as u64 * u64::from(self.config.max_ejection_percent.get()) @@ -246,13 +213,18 @@ impl OutlierDetector { let already_ejected = state.values().filter(|ep| ep.ejected_at.is_some()).count(); let mut budget = max_ejections.saturating_sub(already_ejected); + // Steps 3 & 4: run the algorithms on the snapshot. Hosts that are + // currently ejected naturally fail the `request_volume` gate + // because they receive no traffic in production, so iterating + // every address (per spec) and ejected-only candidates produce + // the same outcome on real workloads. 
let mut to_eject: Vec = Vec::new(); if let Some(sr) = self.config.success_rate.as_ref() { - self.run_success_rate(sr, &candidates, &mut budget, &mut to_eject); + self.run_success_rate(sr, &snapshots, &mut budget, &mut to_eject); } if let Some(fp) = self.config.failure_percentage.as_ref() { - self.run_failure_percentage(fp, &candidates, &mut budget, &mut to_eject); + self.run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject); } for addr in &to_eject { @@ -262,6 +234,32 @@ impl OutlierDetector { } } + // Step 5: decrement multipliers for non-ejected addresses, and + // un-eject any ejected addresses whose backoff has elapsed. This + // runs *after* re-ejection, so a same-sweep re-ejection updates + // `ejected_at` to `now` and the un-eject check sees zero elapsed + // time — no spurious uneject decision is emitted. + let cap = self + .config + .base_ejection_time + .max(self.config.max_ejection_time); + let mut to_uneject: Vec = Vec::new(); + for (addr, ep) in state.iter_mut() { + if let Some(at) = ep.ejected_at { + if let Some(scaled) = self + .config + .base_ejection_time + .checked_mul(ep.ejection_multiplier) + && now.duration_since(at) >= scaled.min(cap) + { + ep.ejected_at = None; + to_uneject.push(addr.clone()); + } + } else if ep.ejection_multiplier > 0 { + ep.ejection_multiplier -= 1; + } + } + drop(state); let mut decisions = Vec::with_capacity(to_uneject.len() + to_eject.len()); @@ -338,9 +336,11 @@ impl OutlierDetector { if *budget == 0 { break; } - // failure_pct = 100 * failure / total + // failure_pct = 100 * failure / total. A50 specifies a strict + // "greater than" comparison: an address sitting exactly at + // the threshold is not ejected. 
let failure_pct = 100 * c.failure / c.total; - if failure_pct >= threshold && self.roll(cfg.enforcing_failure_percentage.get()) { + if failure_pct > threshold && self.roll(cfg.enforcing_failure_percentage.get()) { out.push(c.addr.clone()); *budget -= 1; } @@ -545,6 +545,24 @@ mod tests { assert!(detector.run_sweep(Instant::now()).is_empty()); } + #[test] + fn failure_percentage_at_threshold_does_not_eject() { + // A50 specifies a strict "greater than" comparison: an address + // sitting exactly at the threshold should *not* be ejected. + let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(0)); + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + // Exactly 50% failure rate — equal to the threshold. + for _ in 0..50 { + h.record_success(); + } + for _ in 0..50 { + h.record_failure(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + #[test] fn minimum_hosts_gates_failure_percentage() { let detector = detector_no_loop(fp_config(50, 10, 5), FixedRng::boxed(99)); @@ -730,14 +748,14 @@ mod tests { bad_h.record_failure(); } - // Sweep 2 at t0+10: same-sweep un-eject + re-eject. - // Multiplier stays 1 through un-eject, then 1 → 2 on re-eject. + // Sweep 2 at t0+10: re-ejection happens before the un-eject + // housekeeping step (per A50 ordering), so `ejected_at` is + // refreshed to `now` and the un-eject check sees zero elapsed + // time. Only an Eject decision is emitted; the multiplier moves + // 1 → 2. assert_eq!( detector.run_sweep(t0 + Duration::from_secs(10)), - vec![ - EjectionDecision::Uneject(bad.clone()), - EjectionDecision::Eject(bad.clone()), - ], + vec![EjectionDecision::Eject(bad.clone())], ); // Re-ejection started at t0+10 with multiplier=2 → duration 20s. 
@@ -813,6 +831,23 @@ mod tests { assert_eq!(state.get(&addr(8080)).unwrap().ejection_multiplier, 2); } + #[test] + fn multiplier_decrements_even_without_traffic() { + // A50: a non-ejected address with multiplier > 0 has its + // multiplier decremented every sweep, regardless of whether it + // received any RPCs that interval. + let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + detector.add_endpoint(addr(8080)); + { + let mut state = detector.state.lock().unwrap(); + state.get_mut(&addr(8080)).unwrap().ejection_multiplier = 3; + } + // No traffic recorded. + detector.run_sweep(Instant::now()); + let state = detector.state.lock().unwrap(); + assert_eq!(state.get(&addr(8080)).unwrap().ejection_multiplier, 2); + } + // ----- Sweep loop ----- #[tokio::test(start_paused = true)] From 1663b1c85ff99211ddb31cb1fe2ec2bf1622f3f1 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Thu, 30 Apr 2026 15:15:25 -0700 Subject: [PATCH 04/39] refactor(tonic-xds): defer success-rate algorithm to a follow-up PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the success-rate algorithm and its tests from this PR so the outlier-detection PR is minimal and stand-alone. The scaffolding (sweep loop, multiplier state, counters, max-ejection-percent budget) is unchanged and still exercised by the failure-percentage algorithm plus the multiplier / un-eject / cap tests. If `OutlierDetectionConfig.success_rate` is set on the cluster, it is currently ignored. Documented in the module docstring with a pointer to the follow-up PR. Removes: - `OutlierDetector::run_success_rate` (mean / variance / sqrt math). - `success_rate` dispatch in `run_sweep`. - `run_failure_percentage`'s `!out.contains` filter — dead now that only one algorithm runs per sweep. - `success_rate_ejects_outlier_below_threshold` test. - `success_rate_no_ejection_when_all_uniform` test. - The `sr_config` test helper. 
- Unused `SuccessRateConfig` import. --- .../client/loadbalance/outlier_detection.rs | 126 +++--------------- 1 file changed, 16 insertions(+), 110 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 04ba1f734..a30286c98 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,13 +1,20 @@ //! gRFC A50 outlier-detection sweep engine. //! //! Owns per-endpoint counters and an ejection state machine. Periodically -//! reads the counters, runs the success-rate and failure-percentage -//! ejection algorithms, and emits [`EjectionDecision`]s. Knows nothing -//! about the data path: callers feed it RPC outcomes via the lock-free -//! [`EndpointCounters`] handle returned by [`OutlierDetector::add_endpoint`], -//! and consume decisions from a channel returned by [`OutlierDetector::spawn`]. +//! reads the counters, runs the failure-percentage ejection algorithm, +//! and emits [`EjectionDecision`]s. Knows nothing about the data path: +//! callers feed it RPC outcomes via the lock-free [`EndpointCounters`] +//! handle returned by [`OutlierDetector::add_endpoint`], and consume +//! decisions from a channel returned by [`OutlierDetector::spawn`]. +//! +//! Only the **failure-percentage** algorithm is implemented in this +//! module. The success-rate algorithm — which adds float-math (mean +//! and standard deviation across the qualifying hosts) — lands in a +//! follow-up PR. If [`OutlierDetectionConfig::success_rate`] is set, +//! it is currently ignored. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md +//! 
[`OutlierDetectionConfig::success_rate`]: crate::xds::resource::outlier_detection::OutlierDetectionConfig::success_rate use std::collections::HashMap; use std::sync::Arc; @@ -19,9 +26,7 @@ use tokio::sync::mpsc; use crate::client::endpoint::EndpointAddress; use crate::common::async_util::AbortOnDrop; -use crate::xds::resource::outlier_detection::{ - FailurePercentageConfig, OutlierDetectionConfig, SuccessRateConfig, -}; +use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; /// Lock-free per-endpoint success/failure counter handle. /// @@ -218,11 +223,11 @@ impl OutlierDetector { // because they receive no traffic in production, so iterating // every address (per spec) and ejected-only candidates produce // the same outcome on real workloads. + // + // Step 3 (`success_rate_ejection`) is intentionally not yet + // dispatched in this PR; it lands in a follow-up. let mut to_eject: Vec = Vec::new(); - if let Some(sr) = self.config.success_rate.as_ref() { - self.run_success_rate(sr, &snapshots, &mut budget, &mut to_eject); - } if let Some(fp) = self.config.failure_percentage.as_ref() { self.run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject); } @@ -272,48 +277,6 @@ impl OutlierDetector { decisions } - /// A50 success-rate algorithm. - fn run_success_rate( - &self, - cfg: &SuccessRateConfig, - all: &[Candidate], - budget: &mut usize, - out: &mut Vec, - ) { - // Filter to candidates with enough traffic. - let qualifying: Vec<&Candidate> = all - .iter() - .filter(|c| c.total >= u64::from(cfg.request_volume)) - .collect(); - if qualifying.len() < cfg.minimum_hosts as usize { - return; - } - - // success_rate = success / total (in [0.0, 1.0]). 
- let rates: Vec = qualifying - .iter() - .map(|c| c.success as f64 / c.total as f64) - .collect(); - let n = rates.len() as f64; - let mean = rates.iter().sum::() / n; - let variance = rates.iter().map(|r| (r - mean).powi(2)).sum::() / n; - let stdev = variance.sqrt(); - - // threshold = mean - stdev * (stdev_factor / 1000) - let factor = f64::from(cfg.stdev_factor) / 1000.0; - let threshold = mean - stdev * factor; - - for (c, rate) in qualifying.iter().zip(rates.iter()) { - if *budget == 0 { - break; - } - if *rate < threshold && self.roll(cfg.enforcing_success_rate.get()) { - out.push(c.addr.clone()); - *budget -= 1; - } - } - } - /// A50 failure-percentage algorithm. fn run_failure_percentage( &self, @@ -325,7 +288,6 @@ impl OutlierDetector { let qualifying: Vec<&Candidate> = all .iter() .filter(|c| c.total >= u64::from(cfg.request_volume)) - .filter(|c| !out.contains(&c.addr)) // skip endpoints already ejected this sweep .collect(); if qualifying.len() < cfg.minimum_hosts as usize { return; @@ -613,62 +575,6 @@ mod tests { assert!(detector.run_sweep(Instant::now()).is_empty()); } - // ----- Success-rate algorithm ----- - - fn sr_config( - stdev_factor: u32, - request_volume: u32, - minimum_hosts: u32, - ) -> OutlierDetectionConfig { - let mut c = base_config(); - c.success_rate = Some(SuccessRateConfig { - stdev_factor, - enforcing_success_rate: pct(100), - minimum_hosts, - request_volume, - }); - c - } - - #[test] - fn success_rate_ejects_outlier_below_threshold() { - let detector = detector_no_loop(sr_config(1900, 10, 5), FixedRng::boxed(99)); - // 4 endpoints at 99% success, 1 at 50% — outlier. 
- for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); - for _ in 0..99 { - h.record_success(); - } - h.record_failure(); - } - let bad = detector.add_endpoint(addr(8084)); - for _ in 0..50 { - bad.record_success(); - } - for _ in 0..50 { - bad.record_failure(); - } - assert_eq!( - detector.run_sweep(Instant::now()), - vec![EjectionDecision::Eject(addr(8084))], - ); - } - - #[test] - fn success_rate_no_ejection_when_all_uniform() { - let detector = detector_no_loop(sr_config(1900, 10, 5), FixedRng::boxed(99)); - for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); - for _ in 0..95 { - h.record_success(); - } - for _ in 0..5 { - h.record_failure(); - } - } - assert!(detector.run_sweep(Instant::now()).is_empty()); - } - // ----- Ejection multiplier / un-ejection ----- #[test] From 83530f88706b00a976e029cb57c08684e7b262c7 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Thu, 30 Apr 2026 16:22:57 -0700 Subject: [PATCH 05/39] refactor(tonic-xds): use bounded mpsc for ejection decisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch from `mpsc::unbounded_channel` to `mpsc::channel(256)` for the ejection-decision stream that the sweep loop emits. The decisions are edge-triggered (`Eject`/`Uneject` transitions, not state snapshots), so the consumer must process every event in order; we can't drop or coalesce. But we don't want unbounded memory growth either if the consumer stalls. A bounded channel gives us: - Same correctness as unbounded — no events dropped, ordered delivery. - Bounded memory. - Natural backpressure: when the buffer fills, `tx.send().await` parks the sweep task, which (combined with `MissedTickBehavior::Skip`) throttles sweep cadence to whatever rate the consumer can drain. Computing more decisions than the consumer can apply just widens the desync. 
Capacity is 256 — at most `2 * num_endpoints` decisions per sweep, so this buffers several sweeps' worth of decisions for clusters of typical size. A docstring on `DECISIONS_CHANNEL_CAPACITY` captures the rationale for future readers. --- .../client/loadbalance/outlier_detection.rs | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index a30286c98..43ae8e2fd 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -28,6 +28,22 @@ use crate::client::endpoint::EndpointAddress; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; +/// Capacity of the bounded mpsc channel that carries ejection decisions +/// from the sweep loop to the consumer. +/// +/// Decisions are edge-triggered (`Eject`/`Uneject` transitions, not +/// state snapshots), so the consumer must process every event in order +/// to stay in sync with the detector. We therefore can't drop or +/// coalesce — but we don't want unbounded growth either if the consumer +/// stalls. With sweep cadence on the order of seconds and at most +/// `2 * num_endpoints` decisions per sweep, 256 buffers several sweeps' +/// worth of decisions for clusters of typical size. When the buffer +/// fills, `tx.send().await` parks the sweep task, which naturally +/// throttles sweep cadence to whatever rate the consumer can drain — +/// the right behavior, since computing more decisions than the consumer +/// can apply just widens the desync. +const DECISIONS_CHANNEL_CAPACITY: usize = 256; + /// Lock-free per-endpoint success/failure counter handle. /// /// Cloned freely. Callers (typically a request-outcome interceptor) @@ -128,11 +144,7 @@ impl OutlierDetector { /// [`AbortOnDrop`] is dropped. 
pub(crate) fn spawn( config: OutlierDetectionConfig, - ) -> ( - Arc, - mpsc::UnboundedReceiver, - AbortOnDrop, - ) { + ) -> (Arc, mpsc::Receiver, AbortOnDrop) { Self::spawn_with_rng(config, Box::new(FastRandRng)) } @@ -140,12 +152,8 @@ impl OutlierDetector { pub(crate) fn spawn_with_rng( config: OutlierDetectionConfig, rng: Box, - ) -> ( - Arc, - mpsc::UnboundedReceiver, - AbortOnDrop, - ) { - let (tx, rx) = mpsc::unbounded_channel(); + ) -> (Arc, mpsc::Receiver, AbortOnDrop) { + let (tx, rx) = mpsc::channel(DECISIONS_CHANNEL_CAPACITY); let detector = Arc::new(Self { config, state: Mutex::new(HashMap::new()), @@ -333,7 +341,11 @@ struct Candidate { /// forwards each decision on the channel. The task ends (and `tx` is /// dropped, closing the receiver) when [`AbortOnDrop`] is dropped or /// when the receiver itself is dropped. -async fn sweep_loop(detector: Arc, tx: mpsc::UnboundedSender) { +/// +/// `tx.send().await` is fallible (returns `Err` if the receiver was +/// dropped) and may park briefly when the channel is full — see +/// [`DECISIONS_CHANNEL_CAPACITY`]. +async fn sweep_loop(detector: Arc, tx: mpsc::Sender) { let mut ticker = tokio::time::interval(detector.config.interval); // Skip missed ticks rather than burst-catching up — the goal is // periodic observation, not making up for paused time. @@ -345,7 +357,7 @@ async fn sweep_loop(detector: Arc, tx: mpsc::UnboundedSender Date: Fri, 1 May 2026 13:55:45 -0700 Subject: [PATCH 06/39] refactor(tonic-xds): make OutlierDetector runtime options configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace `spawn_with_rng` with `spawn_with`, taking an `OutlierDetectorOptions` struct that bundles the RNG and the new configurable `decisions_channel_capacity`. Defaults are unchanged (`fastrand` RNG, capacity 256). 
The hard-coded constant becomes `DEFAULT_DECISIONS_CHANNEL_CAPACITY` and is no longer the only knob — production callers may want to bump the bound for clusters with very large endpoint sets (worst case `2 * num_endpoints` decisions per sweep) or unusually slow consumers. Using a struct instead of a long argument list means future runtime knobs (custom Tokio runtime, alternate backoff policies, observability hooks, …) can be added without breaking call sites — callers typically construct via `..Default::default()`. The xDS-derived `OutlierDetectionConfig` stays separate from these host-side runtime knobs, keeping a clean line between "what the xDS proto specifies" and "how this binary chooses to host it." --- .../client/loadbalance/outlier_detection.rs | 75 +++++++++++++++---- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 43ae8e2fd..a76594d80 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -28,8 +28,8 @@ use crate::client::endpoint::EndpointAddress; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; -/// Capacity of the bounded mpsc channel that carries ejection decisions -/// from the sweep loop to the consumer. +/// Default capacity of the bounded mpsc channel that carries ejection +/// decisions from the sweep loop to the consumer. /// /// Decisions are edge-triggered (`Eject`/`Uneject` transitions, not /// state snapshots), so the consumer must process every event in order @@ -42,7 +42,11 @@ use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDe /// throttles sweep cadence to whatever rate the consumer can drain — /// the right behavior, since computing more decisions than the consumer /// can apply just widens the desync. 
-const DECISIONS_CHANNEL_CAPACITY: usize = 256; +/// +/// Override via [`OutlierDetectorOptions::decisions_channel_capacity`] +/// for clusters with very large endpoint sets or unusually slow +/// consumers. +pub(crate) const DEFAULT_DECISIONS_CHANNEL_CAPACITY: usize = 256; /// Lock-free per-endpoint success/failure counter handle. /// @@ -122,6 +126,44 @@ impl EndpointState { } } +/// Runtime knobs that don't come from the xDS config (`OutlierDetection` +/// proto) — the channel capacity, the RNG, etc. Kept separate from +/// [`OutlierDetectionConfig`] so xDS-derived state stays distinct from +/// host-side runtime tuning. +/// +/// New fields can be added without breaking call sites because callers +/// typically construct via `..Default::default()`. +pub(crate) struct OutlierDetectorOptions { + /// Capacity of the bounded mpsc channel that carries + /// [`EjectionDecision`]s from the sweep loop to the consumer. + /// See [`DEFAULT_DECISIONS_CHANNEL_CAPACITY`] for the rationale. + pub decisions_channel_capacity: usize, + /// Probability source for the `enforcing_*` rolls. Tests inject a + /// deterministic [`Rng`]; production uses `fastrand`. + pub rng: Box, +} + +impl Default for OutlierDetectorOptions { + fn default() -> Self { + Self { + decisions_channel_capacity: DEFAULT_DECISIONS_CHANNEL_CAPACITY, + rng: Box::new(FastRandRng), + } + } +} + +impl std::fmt::Debug for OutlierDetectorOptions { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OutlierDetectorOptions") + .field( + "decisions_channel_capacity", + &self.decisions_channel_capacity, + ) + .field("rng", &"") + .finish() + } +} + /// gRFC A50 outlier detector. /// /// `run_sweep` is pure — it returns a list of [`EjectionDecision`]s @@ -139,25 +181,25 @@ pub(crate) struct OutlierDetector { } impl OutlierDetector { - /// Build the detector and spawn its sweep task on the current Tokio - /// runtime. 
The sweep runs every `config.interval` until the returned - /// [`AbortOnDrop`] is dropped. + /// Build the detector with default runtime options and spawn its + /// sweep task on the current Tokio runtime. The sweep runs every + /// `config.interval` until the returned [`AbortOnDrop`] is dropped. pub(crate) fn spawn( config: OutlierDetectionConfig, ) -> (Arc, mpsc::Receiver, AbortOnDrop) { - Self::spawn_with_rng(config, Box::new(FastRandRng)) + Self::spawn_with(config, OutlierDetectorOptions::default()) } - /// Variant of [`Self::spawn`] that accepts an injected [`Rng`]. - pub(crate) fn spawn_with_rng( + /// Variant of [`Self::spawn`] that accepts custom runtime options. + pub(crate) fn spawn_with( config: OutlierDetectionConfig, - rng: Box, + options: OutlierDetectorOptions, ) -> (Arc, mpsc::Receiver, AbortOnDrop) { - let (tx, rx) = mpsc::channel(DECISIONS_CHANNEL_CAPACITY); + let (tx, rx) = mpsc::channel(options.decisions_channel_capacity); let detector = Arc::new(Self { config, state: Mutex::new(HashMap::new()), - rng, + rng: options.rng, }); let task = tokio::spawn(sweep_loop(detector.clone(), tx)); (detector, rx, AbortOnDrop(task)) @@ -772,8 +814,13 @@ mod tests { async fn sweep_loop_emits_decisions_on_tick() { let mut config = fp_config(50, 10, 3); config.interval = Duration::from_millis(100); - let (detector, mut rx, _abort) = - OutlierDetector::spawn_with_rng(config, FixedRng::boxed(99)); + let (detector, mut rx, _abort) = OutlierDetector::spawn_with( + config, + OutlierDetectorOptions { + rng: FixedRng::boxed(99), + ..Default::default() + }, + ); for port in 8080..=8083 { let h = detector.add_endpoint(addr(port)); From 8eacb782a6c7e961916570d28f74182fad618c19 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 09:44:03 -0700 Subject: [PATCH 07/39] test(tonic-xds): use tokio::time::advance instead of sleep in paused tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Both `sweep_loop_emits_decisions_on_tick` and `dropping_abort_stops_sweep_loop` previously used `tokio::time::sleep` in `start_paused = true` mode. That works through the runtime's auto-advance heuristic for parked tasks, but the heuristic is sensitive to the order of pending wake-ups across multiple tasks and can be flaky in practice. - `sweep_loop_emits_decisions_on_tick`: switch to `tokio::time::advance(150ms)` which explicitly moves the clock and yields until pending wake-ups have been polled — deterministic. - `dropping_abort_stops_sweep_loop`: drop the artificial sleep altogether. Aborting the JoinHandle wakes the spawned task synchronously; the runtime polls it, the harness observes the abort, and the task ends — dropping its sender. `rx.recv().await` parks briefly while that happens and then returns `None`. No time advancement needed. Stress-tested both tests 50× back-to-back: all pass. --- .../client/loadbalance/outlier_detection.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index a76594d80..345a2ee85 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -833,8 +833,12 @@ mod tests { bad.record_failure(); } - // Advance just past the first sweep tick. - tokio::time::sleep(Duration::from_millis(150)).await; + // Explicitly advance virtual time past the first sweep tick. + // `advance` is preferred over `sleep` for paused-time tests — it + // moves the clock deterministically and yields until pending + // task wake-ups have been polled, instead of relying on the + // runtime's auto-advance heuristic for parked tasks. 
+ tokio::time::advance(Duration::from_millis(150)).await; let decision = rx.recv().await.expect("sweep should emit a decision"); assert_eq!(decision, EjectionDecision::Eject(addr(8084))); @@ -846,11 +850,13 @@ mod tests { config.interval = Duration::from_millis(50); let (_detector, mut rx, abort) = OutlierDetector::spawn(config); - // Drop the AbortOnDrop; the loop must terminate. + // Aborting the JoinHandle wakes the spawned task synchronously; + // the runtime polls it, the task harness observes the abort, + // and the task ends — dropping its sender clone. No time + // advancement is needed: `rx.recv().await` parks briefly, the + // runtime drives the aborted task to completion, then `recv` + // returns `None` because the sender is gone. drop(abort); - tokio::time::sleep(Duration::from_millis(200)).await; - - // Sender should be dropped along with the task; recv returns None. assert!(rx.recv().await.is_none()); } } From 54255c38fcdc8d35113dbf7e45de101761971214 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 10:29:00 -0700 Subject: [PATCH 08/39] docs(tonic-xds): tighten DEFAULT_DECISIONS_CHANNEL_CAPACITY doc Rewrite the doc comment to be reference documentation rather than a design narrative. Drops the editorializing ("the right behavior") and the first-person reasoning, keeps the four things a developer needs: what the constant controls, why this size, what happens at capacity (and why decisions can't be dropped or coalesced), and how to override. 
--- .../client/loadbalance/outlier_detection.rs | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 345a2ee85..15dddb5a5 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -28,24 +28,16 @@ use crate::client::endpoint::EndpointAddress; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; -/// Default capacity of the bounded mpsc channel that carries ejection -/// decisions from the sweep loop to the consumer. +/// Default capacity for the channel that delivers [`EjectionDecision`]s +/// from the sweep task to its consumer. /// -/// Decisions are edge-triggered (`Eject`/`Uneject` transitions, not -/// state snapshots), so the consumer must process every event in order -/// to stay in sync with the detector. We therefore can't drop or -/// coalesce — but we don't want unbounded growth either if the consumer -/// stalls. With sweep cadence on the order of seconds and at most -/// `2 * num_endpoints` decisions per sweep, 256 buffers several sweeps' -/// worth of decisions for clusters of typical size. When the buffer -/// fills, `tx.send().await` parks the sweep task, which naturally -/// throttles sweep cadence to whatever rate the consumer can drain — -/// the right behavior, since computing more decisions than the consumer -/// can apply just widens the desync. +/// Sized for several sweeps' worth of decisions on typical clusters — +/// each sweep emits at most `2 * num_endpoints`. At capacity, the sweep +/// task waits on `send` rather than dropping or coalescing decisions: +/// the channel is edge-triggered, so missing or merging events would +/// desynchronize the consumer's view of which endpoints are ejected. 
/// -/// Override via [`OutlierDetectorOptions::decisions_channel_capacity`] -/// for clusters with very large endpoint sets or unusually slow -/// consumers. +/// Override via [`OutlierDetectorOptions::decisions_channel_capacity`]. pub(crate) const DEFAULT_DECISIONS_CHANNEL_CAPACITY: usize = 256; /// Lock-free per-endpoint success/failure counter handle. From ab5be120634638dcb8ca25b8fc074750ee60b5d4 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 10:32:26 -0700 Subject: [PATCH 09/39] docs(tonic-xds): drop "workspace dep" parenthetical from FastRandRng --- tonic-xds/src/client/loadbalance/outlier_detection.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 15dddb5a5..84335c970 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -89,7 +89,7 @@ pub(crate) trait Rng: Send + Sync + 'static { fn pct_roll(&self) -> u32; } -/// Default RNG backed by `fastrand` (already a workspace dep). +/// Default RNG backed by `fastrand`. struct FastRandRng; impl Rng for FastRandRng { From 0cc008565bc8830bb2c0f1b0b0cdfc889e870763 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 10:38:44 -0700 Subject: [PATCH 10/39] refactor(tonic-xds): pack EndpointCounters into a single AtomicU64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous design used two separate `AtomicU64`s and snapshotted via two independent `swap` calls — the doc comment claimed this was atomic across the pair, but it isn't: an RPC completing between the two swaps inflates the next snapshot by one event, biasing the failure-percentage computation slightly under contention. 
Pack both counters into one `AtomicU64` (high 32 bits: successes, low 32 bits: failures). `record_*` becomes a single `fetch_add` (same hot- path cost as before), `snapshot_and_reset` becomes a single `swap(0)`, and the snapshot is now genuinely atomic across the pair — matching the bucket-swap semantics the gRFC describes. Each counter is capped at `u32::MAX` per sweep interval. Exceeding it would carry into the other counter's bits, but the cap is unreachable for realistic workloads (> 4 × 10⁹ RPCs to one endpoint within one interval). Documented on the struct. --- .../client/loadbalance/outlier_detection.rs | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 84335c970..04a9cd055 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -46,28 +46,44 @@ pub(crate) const DEFAULT_DECISIONS_CHANNEL_CAPACITY: usize = 256; /// invoke [`record_success`] / [`record_failure`] from the data path. /// The detector reads and resets the counters during each sweep. /// +/// Both counters are packed into a single `AtomicU64` (high 32 bits: +/// successes, low 32 bits: failures) so each increment is a single +/// `fetch_add` and a sweep is a single `swap(0)` — the snapshot is +/// truly atomic across the pair. Each counter is capped at +/// `u32::MAX` per sweep interval; exceeding that carries into the +/// other counter's bits, but the cap is unreachable for realistic +/// workloads (> 4 × 10⁹ RPCs to one endpoint within a single +/// interval). +/// /// [`record_success`]: EndpointCounters::record_success /// [`record_failure`]: EndpointCounters::record_failure #[derive(Debug, Default)] pub(crate) struct EndpointCounters { - success: AtomicU64, - failure: AtomicU64, + /// High 32 bits: successes since last sweep. 
+ /// Low 32 bits: failures since last sweep. + packed: AtomicU64, } +/// Increment to apply to [`EndpointCounters::packed`] for one success. +const SUCCESS_INC: u64 = 1 << 32; +/// Increment to apply to [`EndpointCounters::packed`] for one failure. +const FAILURE_INC: u64 = 1; +/// Mask for the failure half of the packed counter. +const FAILURE_MASK: u64 = 0xFFFF_FFFF; + impl EndpointCounters { pub(crate) fn record_success(&self) { - self.success.fetch_add(1, Ordering::Relaxed); + self.packed.fetch_add(SUCCESS_INC, Ordering::Relaxed); } pub(crate) fn record_failure(&self) { - self.failure.fetch_add(1, Ordering::Relaxed); + self.packed.fetch_add(FAILURE_INC, Ordering::Relaxed); } /// Atomically read and zero both counters. Returns `(success, failure)`. fn snapshot_and_reset(&self) -> (u64, u64) { - let s = self.success.swap(0, Ordering::Relaxed); - let f = self.failure.swap(0, Ordering::Relaxed); - (s, f) + let v = self.packed.swap(0, Ordering::Relaxed); + (v >> 32, v & FAILURE_MASK) } } From eb10e3f03abd41306ca545045c5cff6e775a1aeb Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 10:45:27 -0700 Subject: [PATCH 11/39] docs(tonic-xds): consolidate EndpointCounters doc comment --- .../client/loadbalance/outlier_detection.rs | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 04a9cd055..7729de92f 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -40,23 +40,15 @@ use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDe /// Override via [`OutlierDetectorOptions::decisions_channel_capacity`]. pub(crate) const DEFAULT_DECISIONS_CHANNEL_CAPACITY: usize = 256; -/// Lock-free per-endpoint success/failure counter handle. 
+/// Lock-free success/failure counter for one endpoint. The data path +/// records RPC outcomes via `record_success` / `record_failure`; the +/// sweep snapshots and resets atomically. /// -/// Cloned freely. Callers (typically a request-outcome interceptor) -/// invoke [`record_success`] / [`record_failure`] from the data path. -/// The detector reads and resets the counters during each sweep. -/// -/// Both counters are packed into a single `AtomicU64` (high 32 bits: -/// successes, low 32 bits: failures) so each increment is a single -/// `fetch_add` and a sweep is a single `swap(0)` — the snapshot is -/// truly atomic across the pair. Each counter is capped at +/// Counts are packed into a single `AtomicU64` (high 32 bits: +/// successes, low 32 bits: failures), so each record is one `fetch_add` +/// and a snapshot is one `swap(0)`. Each counter is capped at /// `u32::MAX` per sweep interval; exceeding that carries into the -/// other counter's bits, but the cap is unreachable for realistic -/// workloads (> 4 × 10⁹ RPCs to one endpoint within a single -/// interval). -/// -/// [`record_success`]: EndpointCounters::record_success -/// [`record_failure`]: EndpointCounters::record_failure +/// other counter's bits but is unreachable for realistic workloads. #[derive(Debug, Default)] pub(crate) struct EndpointCounters { /// High 32 bits: successes since last sweep. From 0d2c2644a02798fb43173784c6ec8b26fe465b14 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 15:23:40 -0700 Subject: [PATCH 12/39] fix(tonic-xds): skip zero-traffic candidates in failure-percentage algo Guard the `100 * failure / total` division against `total == 0`. gRFC A50 doesn't forbid `request_volume == 0`, in which case the qualifying filter `c.total >= request_volume` admits candidates with zero traffic; the spec is silent on `0/0`, so skip those endpoints rather than panic. 
--- tonic-xds/src/client/loadbalance/outlier_detection.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 7729de92f..61a064030 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -348,6 +348,12 @@ impl OutlierDetector { if *budget == 0 { break; } + // A50 doesn't forbid `request_volume == 0`, in which case a + // candidate may have `total == 0`. The spec is silent on + // `0/0`; skip these endpoints rather than divide by zero. + if c.total == 0 { + continue; + } // failure_pct = 100 * failure / total. A50 specifies a strict // "greater than" comparison: an address sitting exactly at // the threshold is not ejected. From 3c946845b199d1da6d16fd526e6b9a03f9e1b907 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 15:27:20 -0700 Subject: [PATCH 13/39] docs(tonic-xds): fix stale DECISIONS_CHANNEL_CAPACITY doc link --- tonic-xds/src/client/loadbalance/outlier_detection.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 61a064030..44c11feec 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -392,7 +392,7 @@ struct Candidate { /// /// `tx.send().await` is fallible (returns `Err` if the receiver was /// dropped) and may park briefly when the channel is full — see -/// [`DECISIONS_CHANNEL_CAPACITY`]. +/// [`DEFAULT_DECISIONS_CHANNEL_CAPACITY`]. 
async fn sweep_loop(detector: Arc, tx: mpsc::Sender) { let mut ticker = tokio::time::interval(detector.config.interval); // Skip missed ticks rather than burst-catching up — the goal is From 5aca8c088255bbca43104e30a55a69e8b7369115 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 5 May 2026 11:00:15 -0700 Subject: [PATCH 14/39] refactor(tonic-xds): use derived Ord for EjectionDecision sorting in tests Drop the test-only `sort` helper that compared `EjectionDecision`s by their `Debug` string representation, which was fragile (any change to the `Debug` impl would silently change ordering). Derive `PartialOrd` and `Ord` on `EjectionDecision` (and on `EndpointAddress` / `EndpointHost`, since the address is the inner field) and call `Vec::sort` directly at the one test site. --- tonic-xds/src/client/endpoint.rs | 4 ++-- .../src/client/loadbalance/outlier_detection.rs | 12 +++--------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/tonic-xds/src/client/endpoint.rs b/tonic-xds/src/client/endpoint.rs index 81767414d..ec23012bb 100644 --- a/tonic-xds/src/client/endpoint.rs +++ b/tonic-xds/src/client/endpoint.rs @@ -5,7 +5,7 @@ use std::task::{Context, Poll}; use tower::{Service, load::Load}; /// Represents the host part of an endpoint address -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] enum EndpointHost { Ipv4(std::net::Ipv4Addr), Ipv6(std::net::Ipv6Addr), @@ -25,7 +25,7 @@ impl From for EndpointHost { } /// Represents a validated endpoint address extracted from xDS -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub(crate) struct EndpointAddress { /// The IP address or hostname host: EndpointHost, diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 44c11feec..f9b46f037 100644 --- 
a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -80,7 +80,7 @@ impl EndpointCounters { } /// A decision emitted by an [`OutlierDetector`] sweep. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub(crate) enum EjectionDecision { /// Eject this endpoint from the load-balancing pool. The caller /// should keep its underlying connection alive (A50 requires @@ -470,13 +470,6 @@ mod tests { }) } - /// Sort a decision list deterministically so equality checks can rely - /// on a canonical order without coupling to `HashMap` iteration order. - fn sort(mut ds: Vec) -> Vec { - ds.sort_by(|a, b| format!("{a:?}").cmp(&format!("{b:?}"))); - ds - } - // ----- EndpointCounters ----- #[test] @@ -773,7 +766,8 @@ mod tests { h.record_failure(); } } - let decisions = sort(detector.run_sweep(Instant::now())); + let mut decisions = detector.run_sweep(Instant::now()); + decisions.sort(); let ejects = decisions .iter() .filter(|d| matches!(d, EjectionDecision::Eject(_))) From bbf935cb1798197e04c9c3eef9c3eaef76224126 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 5 May 2026 11:24:49 -0700 Subject: [PATCH 15/39] fix(tonic-xds): exclude re-ejections from max_ejection_percent budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an already-ejected endpoint has in-flight RPCs that complete during its ejection backoff, those completions accumulate on its counter. At the next sweep the algorithm may "re-eject" the host (refreshing its `ejected_at` timestamp and bumping the multiplier). That action does not change the count of currently-ejected addresses, so per A50's `max_ejection_percent` check it must not consume a slot in the cap — but the previous code decremented the budget for it, under-counting how many *new* ejections the cap allows. 
Track the pre-sweep ejection state on each `Candidate` and only decrement the budget for new ejections in the failure-percentage algorithm. Add a regression test covering the specific scenario. --- .../client/loadbalance/outlier_detection.rs | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index f9b46f037..761d843a5 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -254,6 +254,7 @@ impl OutlierDetector { success, failure, total: success + failure, + already_ejected: ep.ejected_at.is_some(), }); } @@ -360,7 +361,13 @@ impl OutlierDetector { let failure_pct = 100 * c.failure / c.total; if failure_pct > threshold && self.roll(cfg.enforcing_failure_percentage.get()) { out.push(c.addr.clone()); - *budget -= 1; + // Only NEW ejections consume a budget slot; re-ejecting + // an already-ejected address only refreshes its + // timestamp and multiplier, leaving the count of + // currently-ejected addresses unchanged. + if !c.already_ejected { + *budget -= 1; + } } } } @@ -383,6 +390,12 @@ struct Candidate { success: u64, failure: u64, total: u64, + /// Whether this address was already ejected at the start of the sweep. + /// "Re-ejecting" an already-ejected address only refreshes its + /// ejection timestamp and bumps the multiplier; it does not change + /// the count of currently-ejected addresses, so it must not consume + /// a `max_ejection_percent` budget slot. + already_ejected: bool, } /// Background task: runs `detector.run_sweep` on each interval tick and @@ -775,6 +788,50 @@ mod tests { assert_eq!(ejects, 1, "max_ejection_percent=20% of 5 hosts ⇒ 1"); } + #[test] + fn already_ejected_re_ejection_does_not_consume_budget() { + // 5 hosts: one already ejected (with stats from in-flight RPCs + // accumulated during its backoff), four newly bad. 
Cap permits + // 3 concurrently ejected hosts (60% of 5), with 1 already taken + // by the pre-ejected host — so 2 new ejections remain in budget. + // + // This test would fail before the fix that excludes re-ejections + // from budget accounting: the algorithm would "re-eject" the + // already-ejected host (consuming the second slot), leaving only + // 1 new ejection from the four bad hosts. + let mut config = fp_config(50, 10, 3); + config.max_ejection_percent = pct(60); + let detector = detector_no_loop(config, FixedRng::boxed(99)); + + // Pre-eject host 8080 directly and give it bad in-flight stats. + let already_bad = detector.add_endpoint(addr(8080)); + for _ in 0..100 { + already_bad.record_failure(); + } + { + let mut state = detector.state.lock().unwrap(); + let ep = state.get_mut(&addr(8080)).unwrap(); + ep.ejected_at = Some(Instant::now()); + ep.ejection_multiplier = 1; + } + + // Four more bad hosts. + for port in 8081..=8084 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_failure(); + } + } + + let mut decisions = detector.run_sweep(Instant::now()); + decisions.sort(); + let new_ejects = decisions + .iter() + .filter(|d| matches!(d, EjectionDecision::Eject(a) if *a != addr(8080))) + .count(); + assert_eq!(new_ejects, 2, "expected 2 new ejections under the cap"); + } + #[test] fn multiplier_decrements_on_healthy_interval() { let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); From b8ea266497208c6e61b3eb843bfa69bf0fea2796 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 5 May 2026 11:45:19 -0700 Subject: [PATCH 16/39] refactor(tonic-xds): drive sweeps on demand from poll_ready MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the spawned sweep loop + mpsc channel with an on-demand model: the detector exposes `maybe_run_sweep(&mut self, now: Instant) -> Vec ` and the consumer (the load balancer in a follow-up PR) 
calls it from its own event loop — typically `poll_ready` — gated by wallclock time. This eliminates a significant amount of machinery: - `tokio::spawn`, `sweep_loop`, `AbortOnDrop`, the mpsc channel. - The bounded-channel capacity option, its constant, and its docs (`OutlierDetectorOptions::decisions_channel_capacity`, `DEFAULT_DECISIONS_CHANNEL_CAPACITY`). - `OutlierDetectorOptions` itself — collapses to two constructors `new(config)` and `with_rng(config, rng)`. - The `Mutex` on `state` — the consumer's `&mut self` already serializes access. - Two `#[tokio::test(start_paused = true)]` tests that exercised the spawned task and its abort handle. Sweep timing now depends on RPC traffic: when no RPCs flow, no sweeps run. This matches A50's intent (sweeps happen approximately every `interval` while traffic is flowing) and is observably equivalent because ejection only matters during endpoint picking, which only happens during RPCs. Suggested by the PR review. Tests: - All algorithm-level tests rewritten to use owned `OutlierDetector` + `&mut self` calls, no `Mutex::lock()`, no Arc. - Three new `maybe_run_sweep_*` tests cover the interval gate: runs on first call, skips before interval elapsed, runs after. - Existing failure-percentage and multiplier/un-ejection tests unchanged in spirit; just adjusted to the new ownership model. --- .../client/loadbalance/outlier_detection.rs | 485 ++++++++---------- 1 file changed, 224 insertions(+), 261 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 761d843a5..f3614e403 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,11 +1,14 @@ //! gRFC A50 outlier-detection sweep engine. //! -//! Owns per-endpoint counters and an ejection state machine. Periodically -//! reads the counters, runs the failure-percentage ejection algorithm, -//! 
and emits [`EjectionDecision`]s. Knows nothing about the data path: +//! Owns per-endpoint counters and an ejection state machine. Runs the +//! failure-percentage ejection algorithm on demand and returns the +//! resulting [`EjectionDecision`]s. Knows nothing about the data path: //! callers feed it RPC outcomes via the lock-free [`EndpointCounters`] -//! handle returned by [`OutlierDetector::add_endpoint`], and consume -//! decisions from a channel returned by [`OutlierDetector::spawn`]. +//! handle returned by [`OutlierDetector::add_endpoint`], and pump the +//! sweep by calling [`OutlierDetector::maybe_run_sweep`] from their own +//! event loop (typically the load balancer's `poll_ready`). The wall +//! clock supplied to `maybe_run_sweep` decides when each sweep actually +//! runs — at most once per `config.interval`. //! //! Only the **failure-percentage** algorithm is implemented in this //! module. The success-rate algorithm — which adds float-math (mean @@ -18,28 +21,12 @@ use std::collections::HashMap; use std::sync::Arc; -use std::sync::Mutex; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::Instant; -use tokio::sync::mpsc; - use crate::client::endpoint::EndpointAddress; -use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; -/// Default capacity for the channel that delivers [`EjectionDecision`]s -/// from the sweep task to its consumer. -/// -/// Sized for several sweeps' worth of decisions on typical clusters — -/// each sweep emits at most `2 * num_endpoints`. At capacity, the sweep -/// task waits on `send` rather than dropping or coalescing decisions: -/// the channel is edge-triggered, so missing or merging events would -/// desynchronize the consumer's view of which endpoints are ejected. -/// -/// Override via [`OutlierDetectorOptions::decisions_channel_capacity`]. 
-pub(crate) const DEFAULT_DECISIONS_CHANNEL_CAPACITY: usize = 256; - /// Lock-free success/failure counter for one endpoint. The data path /// records RPC outcomes via `record_success` / `record_failure`; the /// sweep snapshots and resets atomically. @@ -126,83 +113,38 @@ impl EndpointState { } } -/// Runtime knobs that don't come from the xDS config (`OutlierDetection` -/// proto) — the channel capacity, the RNG, etc. Kept separate from -/// [`OutlierDetectionConfig`] so xDS-derived state stays distinct from -/// host-side runtime tuning. -/// -/// New fields can be added without breaking call sites because callers -/// typically construct via `..Default::default()`. -pub(crate) struct OutlierDetectorOptions { - /// Capacity of the bounded mpsc channel that carries - /// [`EjectionDecision`]s from the sweep loop to the consumer. - /// See [`DEFAULT_DECISIONS_CHANNEL_CAPACITY`] for the rationale. - pub decisions_channel_capacity: usize, - /// Probability source for the `enforcing_*` rolls. Tests inject a - /// deterministic [`Rng`]; production uses `fastrand`. - pub rng: Box, -} - -impl Default for OutlierDetectorOptions { - fn default() -> Self { - Self { - decisions_channel_capacity: DEFAULT_DECISIONS_CHANNEL_CAPACITY, - rng: Box::new(FastRandRng), - } - } -} - -impl std::fmt::Debug for OutlierDetectorOptions { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("OutlierDetectorOptions") - .field( - "decisions_channel_capacity", - &self.decisions_channel_capacity, - ) - .field("rng", &"") - .finish() - } -} - /// gRFC A50 outlier detector. /// -/// `run_sweep` is pure — it returns a list of [`EjectionDecision`]s -/// rather than sending them. The sweep loop spawned by [`spawn`] owns -/// the channel sender and forwards decisions to the receiver, so -/// dropping the [`AbortOnDrop`] handle ends the loop and closes the -/// receiver. 
`OutlierDetector` itself holds no I/O resources, which -/// makes algorithm-level tests trivial to write. -/// -/// [`spawn`]: OutlierDetector::spawn +/// State is owned (no `Mutex`, no `Arc`): the consumer holds the +/// detector by `&mut` and calls [`Self::maybe_run_sweep`] from its own +/// event loop, typically the load balancer's `poll_ready`. The wall +/// clock argument decides when each sweep actually runs — at most once +/// per `config.interval`. pub(crate) struct OutlierDetector { config: OutlierDetectionConfig, - state: Mutex>, + state: HashMap, + /// Wall-clock time of the last sweep that actually ran. `None` + /// before the first sweep, so the first call to `maybe_run_sweep` + /// always runs. + last_sweep_at: Option, rng: Box, } impl OutlierDetector { - /// Build the detector with default runtime options and spawn its - /// sweep task on the current Tokio runtime. The sweep runs every - /// `config.interval` until the returned [`AbortOnDrop`] is dropped. - pub(crate) fn spawn( - config: OutlierDetectionConfig, - ) -> (Arc, mpsc::Receiver, AbortOnDrop) { - Self::spawn_with(config, OutlierDetectorOptions::default()) - } - - /// Variant of [`Self::spawn`] that accepts custom runtime options. - pub(crate) fn spawn_with( - config: OutlierDetectionConfig, - options: OutlierDetectorOptions, - ) -> (Arc, mpsc::Receiver, AbortOnDrop) { - let (tx, rx) = mpsc::channel(options.decisions_channel_capacity); - let detector = Arc::new(Self { + /// Build the detector with the default RNG (`fastrand`). + pub(crate) fn new(config: OutlierDetectionConfig) -> Self { + Self::with_rng(config, Box::new(FastRandRng)) + } + + /// Build the detector with an injected [`Rng`]. Tests use this to + /// pin the `enforcing_*` rolls. 
+ pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Self { + Self { config, - state: Mutex::new(HashMap::new()), - rng: options.rng, - }); - let task = tokio::spawn(sweep_loop(detector.clone(), tx)); - (detector, rx, AbortOnDrop(task)) + state: HashMap::new(), + last_sweep_at: None, + rng, + } } /// Register an endpoint and return its lock-free counter handle. @@ -211,9 +153,8 @@ impl OutlierDetector { /// /// Adding an already-registered address is a no-op and returns the /// existing handle (so callers can re-add idempotently). - pub(crate) fn add_endpoint(&self, addr: EndpointAddress) -> Arc { - let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); - state + pub(crate) fn add_endpoint(&mut self, addr: EndpointAddress) -> Arc { + self.state .entry(addr) .or_insert_with(EndpointState::new) .counters @@ -224,15 +165,30 @@ impl OutlierDetector { /// any ejection state. If the endpoint was ejected, no `Uneject` /// decision is emitted — the caller is expected to handle the removal /// directly (e.g., by dropping its slot in the load balancer). - pub(crate) fn remove_endpoint(&self, addr: &EndpointAddress) { - let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); - state.remove(addr); + pub(crate) fn remove_endpoint(&mut self, addr: &EndpointAddress) { + self.state.remove(addr); + } + + /// Run a sweep at logical time `now` if at least `config.interval` + /// has elapsed since the last sweep, returning the resulting + /// ejection / un-ejection decisions. Otherwise returns an empty + /// vector and leaves the detector state untouched. + /// + /// The first call after construction always runs a sweep + /// (`last_sweep_at` starts as `None`). 
+ pub(crate) fn maybe_run_sweep(&mut self, now: Instant) -> Vec { + if let Some(last) = self.last_sweep_at + && now.duration_since(last) < self.config.interval + { + return Vec::new(); + } + self.last_sweep_at = Some(now); + self.run_sweep(now) } - /// Run a single sweep at logical time `now` and return the resulting - /// ejection/un-ejection decisions. Pure — does no I/O. The sweep loop - /// invokes this on each interval tick and forwards the decisions on - /// the channel; tests call it directly. + /// Unconditionally run one sweep at logical time `now` and return the + /// resulting decisions. Used by [`Self::maybe_run_sweep`] and by tests + /// that want to drive sweeps without modeling the interval gate. /// /// The order of operations follows gRFC A50: /// 1. Record the timestamp. @@ -242,12 +198,10 @@ impl OutlierDetector { /// 5. For each address: decrement the multiplier of non-ejected /// addresses with multiplier > 0, and un-eject ejected addresses /// whose backoff has elapsed. - pub(crate) fn run_sweep(&self, now: Instant) -> Vec { - let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); - + pub(crate) fn run_sweep(&mut self, now: Instant) -> Vec { // Step 2: snapshot every endpoint's counters. - let mut snapshots: Vec = Vec::with_capacity(state.len()); - for (addr, ep) in state.iter_mut() { + let mut snapshots: Vec = Vec::with_capacity(self.state.len()); + for (addr, ep) in self.state.iter_mut() { let (success, failure) = ep.counters.snapshot_and_reset(); snapshots.push(Candidate { addr: addr.clone(), @@ -262,11 +216,15 @@ impl OutlierDetector { // don't exceed `max_ejection_percent` of the total. Per A50, the // check is performed before each candidate ejection; we model that // as a budget that algorithms decrement. 
- let total_endpoints = state.len(); + let total_endpoints = self.state.len(); let max_ejections = (total_endpoints as u64 * u64::from(self.config.max_ejection_percent.get()) / 100) as usize; - let already_ejected = state.values().filter(|ep| ep.ejected_at.is_some()).count(); + let already_ejected = self + .state + .values() + .filter(|ep| ep.ejected_at.is_some()) + .count(); let mut budget = max_ejections.saturating_sub(already_ejected); // Steps 3 & 4: run the algorithms on the snapshot. Hosts that are @@ -280,11 +238,11 @@ impl OutlierDetector { let mut to_eject: Vec = Vec::new(); if let Some(fp) = self.config.failure_percentage.as_ref() { - self.run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject); + run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject, &*self.rng); } for addr in &to_eject { - if let Some(ep) = state.get_mut(addr) { + if let Some(ep) = self.state.get_mut(addr) { ep.ejected_at = Some(now); ep.ejection_multiplier = ep.ejection_multiplier.saturating_add(1); } @@ -300,7 +258,7 @@ impl OutlierDetector { .base_ejection_time .max(self.config.max_ejection_time); let mut to_uneject: Vec = Vec::new(); - for (addr, ep) in state.iter_mut() { + for (addr, ep) in self.state.iter_mut() { if let Some(at) = ep.ejected_at { if let Some(scaled) = self .config @@ -316,8 +274,6 @@ impl OutlierDetector { } } - drop(state); - let mut decisions = Vec::with_capacity(to_uneject.len() + to_eject.len()); for addr in to_uneject { decisions.push(EjectionDecision::Uneject(addr)); @@ -327,61 +283,61 @@ impl OutlierDetector { } decisions } +} - /// A50 failure-percentage algorithm. 
- fn run_failure_percentage( - &self, - cfg: &FailurePercentageConfig, - all: &[Candidate], - budget: &mut usize, - out: &mut Vec, - ) { - let qualifying: Vec<&Candidate> = all - .iter() - .filter(|c| c.total >= u64::from(cfg.request_volume)) - .collect(); - if qualifying.len() < cfg.minimum_hosts as usize { - return; - } - - let threshold = u64::from(cfg.threshold.get()); - for c in qualifying { - if *budget == 0 { - break; - } - // A50 doesn't forbid `request_volume == 0`, in which case a - // candidate may have `total == 0`. The spec is silent on - // `0/0`; skip these endpoints rather than divide by zero. - if c.total == 0 { - continue; - } - // failure_pct = 100 * failure / total. A50 specifies a strict - // "greater than" comparison: an address sitting exactly at - // the threshold is not ejected. - let failure_pct = 100 * c.failure / c.total; - if failure_pct > threshold && self.roll(cfg.enforcing_failure_percentage.get()) { - out.push(c.addr.clone()); - // Only NEW ejections consume a budget slot; re-ejecting - // an already-ejected address only refreshes its - // timestamp and multiplier, leaving the count of - // currently-ejected addresses unchanged. - if !c.already_ejected { - *budget -= 1; - } +/// A50 failure-percentage algorithm. +fn run_failure_percentage( + cfg: &FailurePercentageConfig, + all: &[Candidate], + budget: &mut usize, + out: &mut Vec, + rng: &dyn Rng, +) { + let qualifying: Vec<&Candidate> = all + .iter() + .filter(|c| c.total >= u64::from(cfg.request_volume)) + .collect(); + if qualifying.len() < cfg.minimum_hosts as usize { + return; + } + + let threshold = u64::from(cfg.threshold.get()); + for c in qualifying { + if *budget == 0 { + break; + } + // A50 doesn't forbid `request_volume == 0`, in which case a + // candidate may have `total == 0`. The spec is silent on + // `0/0`; skip these endpoints rather than divide by zero. + if c.total == 0 { + continue; + } + // failure_pct = 100 * failure / total. 
A50 specifies a strict + // "greater than" comparison: an address sitting exactly at + // the threshold is not ejected. + let failure_pct = 100 * c.failure / c.total; + if failure_pct > threshold && roll(rng, cfg.enforcing_failure_percentage.get()) { + out.push(c.addr.clone()); + // Only NEW ejections consume a budget slot; re-ejecting + // an already-ejected address only refreshes its + // timestamp and multiplier, leaving the count of + // currently-ejected addresses unchanged. + if !c.already_ejected { + *budget -= 1; } } } +} - /// Return true with probability `pct / 100` (clamped at 100 ⇒ always). - fn roll(&self, pct: u8) -> bool { - if pct >= 100 { - return true; - } - if pct == 0 { - return false; - } - self.rng.pct_roll() < u32::from(pct) +/// Return true with probability `pct / 100` (clamped at 100 ⇒ always). +fn roll(rng: &dyn Rng, pct: u8) -> bool { + if pct >= 100 { + return true; + } + if pct == 0 { + return false; } + rng.pct_roll() < u32::from(pct) } /// Cached per-endpoint snapshot used during a sweep. @@ -398,34 +354,6 @@ struct Candidate { already_ejected: bool, } -/// Background task: runs `detector.run_sweep` on each interval tick and -/// forwards each decision on the channel. The task ends (and `tx` is -/// dropped, closing the receiver) when [`AbortOnDrop`] is dropped or -/// when the receiver itself is dropped. -/// -/// `tx.send().await` is fallible (returns `Err` if the receiver was -/// dropped) and may park briefly when the channel is full — see -/// [`DEFAULT_DECISIONS_CHANNEL_CAPACITY`]. -async fn sweep_loop(detector: Arc, tx: mpsc::Sender) { - let mut ticker = tokio::time::interval(detector.config.interval); - // Skip missed ticks rather than burst-catching up — the goal is - // periodic observation, not making up for paused time. - ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - // The first tick fires immediately; consume it so the first real - // sweep is `interval` after spawn (matches A50 semantics). 
- ticker.tick().await; - - loop { - ticker.tick().await; - for decision in detector.run_sweep(Instant::now()) { - if tx.send(decision).await.is_err() { - // Receiver gone — nobody is listening. - return; - } - } - } -} - #[cfg(test)] mod tests { use super::*; @@ -473,14 +401,8 @@ mod tests { } } - /// Build a detector with no sweep loop running. Tests drive - /// `run_sweep` directly and inspect the returned decisions. - fn detector_no_loop(config: OutlierDetectionConfig, rng: Box) -> Arc { - Arc::new(OutlierDetector { - config, - state: Mutex::new(HashMap::new()), - rng, - }) + fn detector_with_rng(config: OutlierDetectionConfig, rng: Box) -> OutlierDetector { + OutlierDetector::with_rng(config, rng) } // ----- EndpointCounters ----- @@ -499,7 +421,7 @@ mod tests { #[test] fn add_endpoint_returns_shared_counter() { - let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); let h1 = detector.add_endpoint(addr(8080)); let h2 = detector.add_endpoint(addr(8080)); assert!( @@ -512,10 +434,10 @@ mod tests { #[test] fn remove_endpoint_drops_state() { - let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); detector.add_endpoint(addr(8080)); detector.remove_endpoint(&addr(8080)); - assert!(detector.state.lock().unwrap().is_empty()); + assert!(detector.state.is_empty()); } // ----- Failure-percentage algorithm ----- @@ -537,7 +459,7 @@ mod tests { #[test] fn failure_percentage_ejects_above_threshold() { - let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(99)); + let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); // 4 healthy endpoints + 1 bad one. 
for port in 8080..=8083 { let h = detector.add_endpoint(addr(port)); @@ -559,7 +481,7 @@ mod tests { #[test] fn failure_percentage_skips_below_threshold() { - let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(99)); + let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); for port in 8080..=8084 { let h = detector.add_endpoint(addr(port)); // 30% failure → below threshold of 50%. @@ -577,7 +499,7 @@ mod tests { fn failure_percentage_at_threshold_does_not_eject() { // A50 specifies a strict "greater than" comparison: an address // sitting exactly at the threshold should *not* be ejected. - let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(0)); + let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(0)); for port in 8080..=8084 { let h = detector.add_endpoint(addr(port)); // Exactly 50% failure rate — equal to the threshold. @@ -593,7 +515,7 @@ mod tests { #[test] fn minimum_hosts_gates_failure_percentage() { - let detector = detector_no_loop(fp_config(50, 10, 5), FixedRng::boxed(99)); + let mut detector = detector_with_rng(fp_config(50, 10, 5), FixedRng::boxed(99)); // Only 2 hosts have request_volume ≥ 10; minimum_hosts is 5 ⇒ skip. for port in 8080..=8081 { let h = detector.add_endpoint(addr(port)); @@ -606,7 +528,7 @@ mod tests { #[test] fn request_volume_filters_low_traffic_endpoints() { - let detector = detector_no_loop(fp_config(50, 100, 3), FixedRng::boxed(99)); + let mut detector = detector_with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); // Bad endpoint, but only 5 requests — below request_volume=100. let bad = detector.add_endpoint(addr(8080)); for _ in 0..5 { @@ -631,7 +553,7 @@ mod tests { .enforcing_failure_percentage = pct(0); // Roll = 0 wouldn't trigger anyway since `roll(0)` short-circuits; // pin the RNG to 0 just to be explicit. 
- let detector = detector_no_loop(config, FixedRng::boxed(0)); + let mut detector = detector_with_rng(config, FixedRng::boxed(0)); for port in 8080..=8084 { let h = detector.add_endpoint(addr(port)); for _ in 0..100 { @@ -648,7 +570,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); - let detector = detector_no_loop(config, FixedRng::boxed(99)); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); for port in 8080..=8084 { let h = detector.add_endpoint(addr(port)); @@ -688,7 +610,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); - let detector = detector_no_loop(config, FixedRng::boxed(99)); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); let bad = addr(8084); let bad_h = detector.add_endpoint(bad.clone()); @@ -747,7 +669,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(15); - let detector = detector_no_loop(config, FixedRng::boxed(99)); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); for port in 8080..=8084 { detector.add_endpoint(addr(port)); @@ -755,8 +677,7 @@ mod tests { let t0 = Instant::now(); // Force multiplier=10 directly. { - let mut state = detector.state.lock().unwrap(); - let ep = state.get_mut(&addr(8084)).unwrap(); + let ep = detector.state.get_mut(&addr(8084)).unwrap(); ep.ejection_multiplier = 10; ep.ejected_at = Some(t0); } @@ -771,7 +692,7 @@ mod tests { // 5 hosts, all bad, but max_ejection_percent=20 ⇒ at most 1 ejected. 
let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(20); - let detector = detector_no_loop(config, FixedRng::boxed(99)); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); for port in 8080..=8084 { let h = detector.add_endpoint(addr(port)); @@ -801,7 +722,7 @@ mod tests { // 1 new ejection from the four bad hosts. let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(60); - let detector = detector_no_loop(config, FixedRng::boxed(99)); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); // Pre-eject host 8080 directly and give it bad in-flight stats. let already_bad = detector.add_endpoint(addr(8080)); @@ -809,8 +730,7 @@ mod tests { already_bad.record_failure(); } { - let mut state = detector.state.lock().unwrap(); - let ep = state.get_mut(&addr(8080)).unwrap(); + let ep = detector.state.get_mut(&addr(8080)).unwrap(); ep.ejected_at = Some(Instant::now()); ep.ejection_multiplier = 1; } @@ -834,18 +754,21 @@ mod tests { #[test] fn multiplier_decrements_on_healthy_interval() { - let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); let h = detector.add_endpoint(addr(8080)); // Force multiplier to 3 without ejecting. - { - let mut state = detector.state.lock().unwrap(); - state.get_mut(&addr(8080)).unwrap().ejection_multiplier = 3; - } + detector + .state + .get_mut(&addr(8080)) + .unwrap() + .ejection_multiplier = 3; // Healthy interval (some traffic, no ejection). 
h.record_success(); detector.run_sweep(Instant::now()); - let state = detector.state.lock().unwrap(); - assert_eq!(state.get(&addr(8080)).unwrap().ejection_multiplier, 2); + assert_eq!( + detector.state.get(&addr(8080)).unwrap().ejection_multiplier, + 2, + ); } #[test] @@ -853,32 +776,47 @@ mod tests { // A50: a non-ejected address with multiplier > 0 has its // multiplier decremented every sweep, regardless of whether it // received any RPCs that interval. - let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); detector.add_endpoint(addr(8080)); - { - let mut state = detector.state.lock().unwrap(); - state.get_mut(&addr(8080)).unwrap().ejection_multiplier = 3; - } + detector + .state + .get_mut(&addr(8080)) + .unwrap() + .ejection_multiplier = 3; // No traffic recorded. detector.run_sweep(Instant::now()); - let state = detector.state.lock().unwrap(); - assert_eq!(state.get(&addr(8080)).unwrap().ejection_multiplier, 2); + assert_eq!( + detector.state.get(&addr(8080)).unwrap().ejection_multiplier, + 2, + ); } - // ----- Sweep loop ----- + // ----- maybe_run_sweep gating ----- - #[tokio::test(start_paused = true)] - async fn sweep_loop_emits_decisions_on_tick() { - let mut config = fp_config(50, 10, 3); - config.interval = Duration::from_millis(100); - let (detector, mut rx, _abort) = OutlierDetector::spawn_with( - config, - OutlierDetectorOptions { - rng: FixedRng::boxed(99), - ..Default::default() - }, - ); + #[test] + fn maybe_run_sweep_runs_on_first_call() { + // `last_sweep_at` starts as `None`, so the first call always + // sweeps regardless of the wall clock argument. 
+ let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_success(); + } + } + let bad = detector.add_endpoint(addr(8084)); + for _ in 0..100 { + bad.record_failure(); + } + let decisions = detector.maybe_run_sweep(Instant::now()); + assert_eq!(decisions, vec![EjectionDecision::Eject(addr(8084))]); + } + #[test] + fn maybe_run_sweep_skips_when_interval_not_elapsed() { + let mut config = fp_config(50, 10, 3); + config.interval = Duration::from_secs(10); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); for port in 8080..=8083 { let h = detector.add_endpoint(addr(port)); for _ in 0..100 { @@ -890,30 +828,55 @@ mod tests { bad.record_failure(); } - // Explicitly advance virtual time past the first sweep tick. - // `advance` is preferred over `sleep` for paused-time tests — it - // moves the clock deterministically and yields until pending - // task wake-ups have been polled, instead of relying on the - // runtime's auto-advance heuristic for parked tasks. - tokio::time::advance(Duration::from_millis(150)).await; + // First call always runs. + let t0 = Instant::now(); + assert_eq!( + detector.maybe_run_sweep(t0), + vec![EjectionDecision::Eject(addr(8084))], + ); - let decision = rx.recv().await.expect("sweep should emit a decision"); - assert_eq!(decision, EjectionDecision::Eject(addr(8084))); + // Re-arm with bad stats; second call Date: Tue, 5 May 2026 13:32:48 -0700 Subject: [PATCH 17/39] docs(tonic-xds): scrub narrative from outlier_detection comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass through every doc comment and inline comment, removing rationale, timeline language, and explanations that don't help a future reader. 
Notable trims: - Module docstring drops "Knows nothing about the data path:" framing, the "lands in a follow-up PR" timeline (regression — flagged and removed earlier on a different doc), and the "(mean and standard deviation across the qualifying hosts)" parenthetical. - `Rng` trait drops the "Abstracted so tests can inject" rationale. - `OutlierDetector` struct drops "State is owned (no `Mutex`, no `Arc`):" framing. - `add_endpoint` / `remove_endpoint` / `with_rng` lose the trailing usage hints / explanatory parentheticals. - `maybe_run_sweep` / `run_sweep` tightened to facts-only. - Inline comments inside `run_sweep` drop "we model that" and "intentionally not yet dispatched in this PR" timeline. - Inline comment for the budget-decrement guard now points at `Candidate::already_ejected` instead of duplicating its doc. - Test `already_ejected_re_ejection_does_not_consume_budget` drops the "this would fail before the fix" git-history paragraph. --- .../client/loadbalance/outlier_detection.rs | 118 +++++++----------- 1 file changed, 44 insertions(+), 74 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index f3614e403..e28e81df1 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,20 +1,15 @@ //! gRFC A50 outlier-detection sweep engine. //! -//! Owns per-endpoint counters and an ejection state machine. Runs the -//! failure-percentage ejection algorithm on demand and returns the -//! resulting [`EjectionDecision`]s. Knows nothing about the data path: -//! callers feed it RPC outcomes via the lock-free [`EndpointCounters`] -//! handle returned by [`OutlierDetector::add_endpoint`], and pump the -//! sweep by calling [`OutlierDetector::maybe_run_sweep`] from their own -//! event loop (typically the load balancer's `poll_ready`). The wall -//! 
clock supplied to `maybe_run_sweep` decides when each sweep actually -//! runs — at most once per `config.interval`. +//! Tracks per-endpoint success/failure counters and an ejection state +//! machine. Callers feed RPC outcomes via the lock-free +//! [`EndpointCounters`] handle returned by +//! [`OutlierDetector::add_endpoint`], and drive sweeps by calling +//! [`OutlierDetector::maybe_run_sweep`] from their own event loop +//! (typically the load balancer's `poll_ready`); a sweep runs at most +//! once per `config.interval`. //! -//! Only the **failure-percentage** algorithm is implemented in this -//! module. The success-rate algorithm — which adds float-math (mean -//! and standard deviation across the qualifying hosts) — lands in a -//! follow-up PR. If [`OutlierDetectionConfig::success_rate`] is set, -//! it is currently ignored. +//! Only the failure-percentage algorithm is currently dispatched. If +//! [`OutlierDetectionConfig::success_rate`] is set, it is ignored. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md //! [`OutlierDetectionConfig::success_rate`]: crate::xds::resource::outlier_detection::OutlierDetectionConfig::success_rate @@ -77,8 +72,7 @@ pub(crate) enum EjectionDecision { Uneject(EndpointAddress), } -/// Probability source for `enforcing_*` rolls. Abstracted so tests can -/// inject deterministic outcomes. +/// Probability source for `enforcing_*` rolls. pub(crate) trait Rng: Send + Sync + 'static { /// Return a uniform random `u32` in `0..100`. fn pct_roll(&self) -> u32; @@ -115,11 +109,9 @@ impl EndpointState { /// gRFC A50 outlier detector. /// -/// State is owned (no `Mutex`, no `Arc`): the consumer holds the -/// detector by `&mut` and calls [`Self::maybe_run_sweep`] from its own -/// event loop, typically the load balancer's `poll_ready`. The wall -/// clock argument decides when each sweep actually runs — at most once -/// per `config.interval`. 
+/// Held by `&mut`; the consumer drives sweeps by calling +/// [`Self::maybe_run_sweep`] from its own event loop (typically the +/// load balancer's `poll_ready`). pub(crate) struct OutlierDetector { config: OutlierDetectionConfig, state: HashMap, @@ -136,8 +128,7 @@ impl OutlierDetector { Self::with_rng(config, Box::new(FastRandRng)) } - /// Build the detector with an injected [`Rng`]. Tests use this to - /// pin the `enforcing_*` rolls. + /// Build the detector with a custom [`Rng`]. pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Self { Self { config, @@ -148,11 +139,9 @@ impl OutlierDetector { } /// Register an endpoint and return its lock-free counter handle. - /// The caller wires this handle into the data-path RPC interceptor so - /// that completed calls increment success/failure atomics. + /// The caller wires this handle into the data-path RPC interceptor. /// - /// Adding an already-registered address is a no-op and returns the - /// existing handle (so callers can re-add idempotently). + /// Adding an already-registered address returns the existing handle. pub(crate) fn add_endpoint(&mut self, addr: EndpointAddress) -> Arc { self.state .entry(addr) @@ -161,21 +150,17 @@ impl OutlierDetector { .clone() } - /// Forget a previously-registered endpoint. Drops its counters and - /// any ejection state. If the endpoint was ejected, no `Uneject` - /// decision is emitted — the caller is expected to handle the removal - /// directly (e.g., by dropping its slot in the load balancer). + /// Forget a previously-registered endpoint, dropping its counters + /// and ejection state. No `Uneject` decision is emitted if the + /// endpoint was ejected; the caller handles removal directly. 
pub(crate) fn remove_endpoint(&mut self, addr: &EndpointAddress) { self.state.remove(addr); } - /// Run a sweep at logical time `now` if at least `config.interval` - /// has elapsed since the last sweep, returning the resulting - /// ejection / un-ejection decisions. Otherwise returns an empty - /// vector and leaves the detector state untouched. - /// - /// The first call after construction always runs a sweep - /// (`last_sweep_at` starts as `None`). + /// Run a sweep at logical time `now`, returning the resulting + /// decisions. Sweeps are gated to at most one per `config.interval`; + /// calls inside the gate return an empty vector and leave state + /// untouched. The first call after construction always sweeps. pub(crate) fn maybe_run_sweep(&mut self, now: Instant) -> Vec { if let Some(last) = self.last_sweep_at && now.duration_since(last) < self.config.interval @@ -186,11 +171,8 @@ impl OutlierDetector { self.run_sweep(now) } - /// Unconditionally run one sweep at logical time `now` and return the - /// resulting decisions. Used by [`Self::maybe_run_sweep`] and by tests - /// that want to drive sweeps without modeling the interval gate. - /// - /// The order of operations follows gRFC A50: + /// Run one sweep at logical time `now` unconditionally and return + /// the resulting decisions, in gRFC A50 step order: /// 1. Record the timestamp. /// 2. Swap each address's call-counter buckets. /// 3. Run the success-rate algorithm if configured. @@ -212,10 +194,9 @@ impl OutlierDetector { }); } - // Compute a cap on the number of new ejections this sweep so we - // don't exceed `max_ejection_percent` of the total. Per A50, the - // check is performed before each candidate ejection; we model that - // as a budget that algorithms decrement. + // Per-sweep cap on new ejections, enforced as a budget the + // algorithms decrement. Per A50, the check happens before each + // candidate. 
let total_endpoints = self.state.len(); let max_ejections = (total_endpoints as u64 * u64::from(self.config.max_ejection_percent.get()) @@ -227,14 +208,11 @@ impl OutlierDetector { .count(); let mut budget = max_ejections.saturating_sub(already_ejected); - // Steps 3 & 4: run the algorithms on the snapshot. Hosts that are - // currently ejected naturally fail the `request_volume` gate - // because they receive no traffic in production, so iterating - // every address (per spec) and ejected-only candidates produce - // the same outcome on real workloads. - // - // Step 3 (`success_rate_ejection`) is intentionally not yet - // dispatched in this PR; it lands in a follow-up. + // Steps 3 & 4: run the algorithms on the snapshot. Ejected + // hosts have no in-interval traffic in production and so + // naturally fail the `request_volume` gate; iterating every + // address (per spec) is equivalent to iterating non-ejected + // ones. Step 3 (success-rate ejection) is not yet dispatched. let mut to_eject: Vec = Vec::new(); if let Some(fp) = self.config.failure_percentage.as_ref() { @@ -248,11 +226,10 @@ impl OutlierDetector { } } - // Step 5: decrement multipliers for non-ejected addresses, and - // un-eject any ejected addresses whose backoff has elapsed. This - // runs *after* re-ejection, so a same-sweep re-ejection updates - // `ejected_at` to `now` and the un-eject check sees zero elapsed - // time — no spurious uneject decision is emitted. + // Step 5: decrement multipliers for non-ejected addresses; + // un-eject ejected addresses whose backoff has elapsed. Runs + // *after* re-ejection, so a same-sweep re-eject refreshes + // `ejected_at` and the un-eject check sees zero elapsed time. 
let cap = self .config .base_ejection_time @@ -318,10 +295,8 @@ fn run_failure_percentage( let failure_pct = 100 * c.failure / c.total; if failure_pct > threshold && roll(rng, cfg.enforcing_failure_percentage.get()) { out.push(c.addr.clone()); - // Only NEW ejections consume a budget slot; re-ejecting - // an already-ejected address only refreshes its - // timestamp and multiplier, leaving the count of - // currently-ejected addresses unchanged. + // See `Candidate::already_ejected` for why re-ejections + // don't consume the budget. if !c.already_ejected { *budget -= 1; } @@ -346,11 +321,11 @@ struct Candidate { success: u64, failure: u64, total: u64, - /// Whether this address was already ejected at the start of the sweep. - /// "Re-ejecting" an already-ejected address only refreshes its - /// ejection timestamp and bumps the multiplier; it does not change - /// the count of currently-ejected addresses, so it must not consume - /// a `max_ejection_percent` budget slot. + /// Whether this address was already ejected at the start of the + /// sweep. Re-ejecting an already-ejected address refreshes its + /// timestamp and bumps its multiplier but doesn't change the count + /// of currently-ejected addresses, so it must not consume a + /// `max_ejection_percent` budget slot. already_ejected: bool, } @@ -715,11 +690,6 @@ mod tests { // accumulated during its backoff), four newly bad. Cap permits // 3 concurrently ejected hosts (60% of 5), with 1 already taken // by the pre-ejected host — so 2 new ejections remain in budget. - // - // This test would fail before the fix that excludes re-ejections - // from budget accounting: the algorithm would "re-eject" the - // already-ejected host (consuming the second slot), leaving only - // 1 new ejection from the four bad hosts. 
let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(60); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); From 026efc8e7ca53a6fe14c91b21323fc2931f22433 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Fri, 8 May 2026 11:24:21 -0700 Subject: [PATCH 18/39] refactor(tonic-xds): unpack EndpointCounters into two AtomicU64s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The packed-AtomicU64 design fixed a specific gap raised earlier in review (the snapshot's two swaps weren't atomic against each other), but the cost in readability and the marginal correctness benefit no longer justify it: the snapshot boundary is approximate either way — RPCs land continuously, so some always cross between "this interval" and "next interval" regardless of how the swap is implemented. For a statistical threshold at 85% over typically hundreds-to-thousands of RPCs per interval, the bias is well below the precision of the check. Replace the packing with two plain `AtomicU64` counters and document the snapshot's non-atomicity honestly on `snapshot_and_reset`. --- .../client/loadbalance/outlier_detection.rs | 34 +++++++------------ 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index e28e81df1..11b8e902b 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -24,40 +24,30 @@ use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDe /// Lock-free success/failure counter for one endpoint. The data path /// records RPC outcomes via `record_success` / `record_failure`; the -/// sweep snapshots and resets atomically. 
-/// -/// Counts are packed into a single `AtomicU64` (high 32 bits: -/// successes, low 32 bits: failures), so each record is one `fetch_add` -/// and a snapshot is one `swap(0)`. Each counter is capped at -/// `u32::MAX` per sweep interval; exceeding that carries into the -/// other counter's bits but is unreachable for realistic workloads. +/// sweep reads and resets between intervals. #[derive(Debug, Default)] pub(crate) struct EndpointCounters { - /// High 32 bits: successes since last sweep. - /// Low 32 bits: failures since last sweep. - packed: AtomicU64, + success: AtomicU64, + failure: AtomicU64, } -/// Increment to apply to [`EndpointCounters::packed`] for one success. -const SUCCESS_INC: u64 = 1 << 32; -/// Increment to apply to [`EndpointCounters::packed`] for one failure. -const FAILURE_INC: u64 = 1; -/// Mask for the failure half of the packed counter. -const FAILURE_MASK: u64 = 0xFFFF_FFFF; - impl EndpointCounters { pub(crate) fn record_success(&self) { - self.packed.fetch_add(SUCCESS_INC, Ordering::Relaxed); + self.success.fetch_add(1, Ordering::Relaxed); } pub(crate) fn record_failure(&self) { - self.packed.fetch_add(FAILURE_INC, Ordering::Relaxed); + self.failure.fetch_add(1, Ordering::Relaxed); } - /// Atomically read and zero both counters. Returns `(success, failure)`. + /// Read and zero both counters. Returns `(success, failure)`. The + /// two swaps are not atomic against each other — RPCs landing + /// between them may bias the snapshot by a small number of events, + /// well below the precision of the failure-percentage threshold. 
fn snapshot_and_reset(&self) -> (u64, u64) { - let v = self.packed.swap(0, Ordering::Relaxed); - (v >> 32, v & FAILURE_MASK) + let s = self.success.swap(0, Ordering::Relaxed); + let f = self.failure.swap(0, Ordering::Relaxed); + (s, f) } } From 6656304859046f7cde0060d11164476f47c3d7d4 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Fri, 8 May 2026 14:16:05 -0700 Subject: [PATCH 19/39] refactor(tonic-xds): outlier detection via shared DashMap + actor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move outlier-detection state onto the channels themselves and run the sweep in a spawned actor task that mutates the shared state. The load balancer's `poll_ready` will observe ejection events O(1) per change via per-channel `watch::Receiver::changed()` futures (wired in a follow-up integration PR), so the O(n) scan stays off the LB's critical path. In `channel_state.rs`: - Add `EndpointCounters` (lock-free success/failure atomics) and `OutlierChannelState` (counters + edge-triggered `watch::Sender` ejection signal). Both `pub(crate)`. - `ReadyChannel` gains `outlier: Arc`. `ConnectingChannel::new` generates a fresh state; `with_outlier` preserves an existing one (for reconnect paths). - `EjectedChannel` carries the outlier state through the cooldown so it survives the eject → un-eject cycle. In `outlier_detection.rs`: - `OutlierDetector` no longer owns counters; it owns only algorithm- private state (per-endpoint multiplier and last-ejection timestamp) and config + RNG. - `OutlierStatsRegistry = Arc>>` is the shared structure between the detector and the LB. - `run_sweep(&mut self, now, &OutlierStatsRegistry)` scans the DashMap, snapshots counters via the channel state, decides ejections, and applies them inline by calling `OutlierChannelState::eject()` / `uneject()`. Algorithm state for removed channels is GC'd per-sweep. 
- `OutlierDetector::spawn(config, channels)` spawns the actor task on a `tokio::time::interval` ticker; returns `AbortOnDrop` for lifecycle control. `EjectionDecision` enum, `maybe_run_sweep`, `last_sweep_at`, `add_endpoint`, and `remove_endpoint` are all removed — the actor + shared state replaces them. - Tests rewritten to drive the new shape: construct a `DashMap>`, populate counters, call `run_sweep` and observe `is_ejected()` directly. Adds two actor-level tests covering `spawn` + `AbortOnDrop`. --- .../src/client/loadbalance/channel_state.rs | 164 +++- .../client/loadbalance/outlier_detection.rs | 723 +++++++++--------- 2 files changed, 503 insertions(+), 384 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 7916c9bb8..fb534b9cd 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -26,16 +26,135 @@ use std::future::Future; use std::pin::Pin; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use std::task::{Context, Poll}; use std::time::Duration; use pin_project_lite::pin_project; +use tokio::sync::watch; use tower::Service; use tower::load::Load; use crate::client::endpoint::{Connector, EndpointAddress}; use crate::common::async_util::BoxFuture; +// --------------------------------------------------------------------------- +// EndpointCounters / OutlierChannelState +// --------------------------------------------------------------------------- + +/// Lock-free success/failure counter for one endpoint. Records RPC +/// outcomes from the data path; the outlier-detection actor reads and +/// resets between intervals. 
+#[derive(Debug, Default)] +pub(crate) struct EndpointCounters { + success: AtomicU64, + failure: AtomicU64, +} + +impl EndpointCounters { + pub(crate) fn record_success(&self) { + self.success.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn record_failure(&self) { + self.failure.fetch_add(1, Ordering::Relaxed); + } + + /// Read and zero both counters. The two swaps are not atomic against + /// each other — RPCs landing between them may bias the snapshot by + /// a small number of events, well below the precision of the + /// failure-percentage threshold. + pub(crate) fn snapshot_and_reset(&self) -> (u64, u64) { + let s = self.success.swap(0, Ordering::Relaxed); + let f = self.failure.swap(0, Ordering::Relaxed); + (s, f) + } +} + +/// Per-channel outlier-detection state, shared between the data path +/// (for outcome recording) and the outlier-detection actor (for sweeps +/// and ejection signalling). +/// +/// The ejection signal is edge-triggered: the actor calls [`eject`] / +/// [`uneject`] to flip the flag; observers subscribe via +/// [`subscribe`] and poll `Receiver::changed()` (typically inside a +/// `FuturesUnordered`) to react in O(1) on each transition. +/// +/// [`eject`]: Self::eject +/// [`uneject`]: Self::uneject +/// [`subscribe`]: Self::subscribe +#[derive(Debug)] +pub(crate) struct OutlierChannelState { + counters: EndpointCounters, + eject_tx: watch::Sender, +} + +impl Default for OutlierChannelState { + fn default() -> Self { + Self::new() + } +} + +impl OutlierChannelState { + pub(crate) fn new() -> Self { + let (eject_tx, _) = watch::channel(false); + Self { + counters: EndpointCounters::default(), + eject_tx, + } + } + + pub(crate) fn record_success(&self) { + self.counters.record_success(); + } + + pub(crate) fn record_failure(&self) { + self.counters.record_failure(); + } + + /// Atomically read and zero the counters. Returns `(success, failure)`. 
+ pub(crate) fn snapshot_and_reset(&self) -> (u64, u64) { + self.counters.snapshot_and_reset() + } + + /// Flip the ejection flag to `true`. No-op if already ejected. + pub(crate) fn eject(&self) { + self.eject_tx.send_if_modified(|state| { + if *state { + false + } else { + *state = true; + true + } + }); + } + + /// Flip the ejection flag back to `false`. No-op if not ejected. + pub(crate) fn uneject(&self) { + self.eject_tx.send_if_modified(|state| { + if *state { + *state = false; + true + } else { + false + } + }); + } + + /// Current ejection state. + pub(crate) fn is_ejected(&self) -> bool { + *self.eject_tx.borrow() + } + + /// Subscribe to ejection-state changes. The returned receiver's + /// `changed()` future resolves on each transition; consumers + /// typically push it into a `FuturesUnordered`. + #[allow(dead_code)] // wired by the LoadBalancer in a follow-up PR. + pub(crate) fn subscribe(&self) -> watch::Receiver { + self.eject_tx.subscribe() + } +} + /// Configuration for an ejected channel. #[derive(Debug, Clone)] pub(crate) struct EjectionConfig { @@ -92,12 +211,27 @@ pub(crate) struct ConnectingChannel { } impl ConnectingChannel { + /// Start a connection, generating a fresh per-channel outlier + /// state. Used for first-time connects from `IdleChannel`. pub(crate) fn new(fut: BoxFuture, addr: EndpointAddress) -> Self { + Self::with_outlier(fut, addr, Arc::new(OutlierChannelState::new())) + } + + /// Start a connection that inherits an existing + /// [`OutlierChannelState`]. Used by reconnect paths so the + /// per-channel counters and ejection signal survive across the + /// connection cycle. + pub(crate) fn with_outlier( + fut: BoxFuture, + addr: EndpointAddress, + outlier: Arc, + ) -> Self { Self { inner: Box::pin(async move { ReadyChannel { addr, inner: fut.await, + outlier, } }), } @@ -119,14 +253,23 @@ impl Future for ConnectingChannel { /// A channel that is connected and ready to serve requests. 
/// /// Holds the raw service `S` and delegates [`Service`] calls directly, -/// preserving `S::Future` and `S::Error` with no wrapping or type erasure. +/// preserving `S::Future` and `S::Error` with no wrapping or type +/// erasure. The `Arc` is shared with the outlier- +/// detection actor for stats accumulation and edge-triggered ejection. #[derive(Clone)] pub(crate) struct ReadyChannel { addr: EndpointAddress, inner: S, + outlier: Arc, } impl ReadyChannel { + /// Per-channel outlier-detection state. Cloned cheaply via `Arc`. + #[allow(dead_code)] // consumed by the LoadBalancer in a follow-up PR. + pub(crate) fn outlier(&self) -> &Arc { + &self.outlier + } + /// Eject this channel (e.g., due to outlier detection). Consumes self. pub(crate) fn eject(self, config: EjectionConfig, connector: Arc) -> EjectedChannel where @@ -136,13 +279,15 @@ impl ReadyChannel { EjectedChannel { addr: self.addr, inner: self.inner, + outlier: self.outlier, config, connector, ejection_timer, } } - /// Start reconnecting. Consumes self, dropping the old connection. + /// Start reconnecting. Consumes self, dropping the old connection + /// but preserving the outlier-detection state. pub(crate) fn reconnect>( self, connector: Arc, @@ -150,7 +295,7 @@ impl ReadyChannel { where S: Send + 'static, { - ConnectingChannel::new(connector.connect(&self.addr), self.addr) + ConnectingChannel::with_outlier(connector.connect(&self.addr), self.addr, self.outlier) } } @@ -193,6 +338,7 @@ pin_project! 
{ pub(crate) struct EjectedChannel { addr: EndpointAddress, inner: S, + outlier: Arc, config: EjectionConfig, connector: Arc + Send + Sync>, #[pin] @@ -209,14 +355,18 @@ impl Future for EjectedChannel { Poll::Ready(()) => { if this.config.needs_reconnect { let fut = this.connector.connect(this.addr); - Poll::Ready(UnejectedChannel::Connecting(ConnectingChannel::new( - fut, - this.addr.clone(), - ))) + Poll::Ready(UnejectedChannel::Connecting( + ConnectingChannel::with_outlier( + fut, + this.addr.clone(), + this.outlier.clone(), + ), + )) } else { Poll::Ready(UnejectedChannel::Ready(ReadyChannel { addr: this.addr.clone(), inner: this.inner.clone(), + outlier: this.outlier.clone(), })) } } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 11b8e902b..37023ad95 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,66 +1,39 @@ //! gRFC A50 outlier-detection sweep engine. //! -//! Tracks per-endpoint success/failure counters and an ejection state -//! machine. Callers feed RPC outcomes via the lock-free -//! [`EndpointCounters`] handle returned by -//! [`OutlierDetector::add_endpoint`], and drive sweeps by calling -//! [`OutlierDetector::maybe_run_sweep`] from their own event loop -//! (typically the load balancer's `poll_ready`); a sweep runs at most -//! once per `config.interval`. +//! Reads per-endpoint counters from a shared +//! [`DashMap>`] and applies +//! ejection / un-ejection decisions in place by toggling each entry's +//! ejection signal. The load balancer registers each [`ReadyChannel`]'s +//! [`OutlierChannelState`] in the same map and observes the signal via +//! a `FuturesUnordered` of `watch::Receiver::changed()` futures, so the +//! O(n) sweep runs in a spawned actor task off the LB's critical path. //! //! Only the failure-percentage algorithm is currently dispatched. If //! 
[`OutlierDetectionConfig::success_rate`] is set, it is ignored. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md +//! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel +//! [`OutlierChannelState`]: crate::client::loadbalance::channel_state::OutlierChannelState //! [`OutlierDetectionConfig::success_rate`]: crate::xds::resource::outlier_detection::OutlierDetectionConfig::success_rate -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; use std::time::Instant; +use dashmap::DashMap; + use crate::client::endpoint::EndpointAddress; +use crate::client::loadbalance::channel_state::OutlierChannelState; +use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; -/// Lock-free success/failure counter for one endpoint. The data path -/// records RPC outcomes via `record_success` / `record_failure`; the -/// sweep reads and resets between intervals. -#[derive(Debug, Default)] -pub(crate) struct EndpointCounters { - success: AtomicU64, - failure: AtomicU64, -} - -impl EndpointCounters { - pub(crate) fn record_success(&self) { - self.success.fetch_add(1, Ordering::Relaxed); - } - - pub(crate) fn record_failure(&self) { - self.failure.fetch_add(1, Ordering::Relaxed); - } - - /// Read and zero both counters. Returns `(success, failure)`. The - /// two swaps are not atomic against each other — RPCs landing - /// between them may bias the snapshot by a small number of events, - /// well below the precision of the failure-percentage threshold. - fn snapshot_and_reset(&self) -> (u64, u64) { - let s = self.success.swap(0, Ordering::Relaxed); - let f = self.failure.swap(0, Ordering::Relaxed); - (s, f) - } -} - -/// A decision emitted by an [`OutlierDetector`] sweep. 
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub(crate) enum EjectionDecision { - /// Eject this endpoint from the load-balancing pool. The caller - /// should keep its underlying connection alive (A50 requires - /// preserving connections across ejection). - Eject(EndpointAddress), - /// Restore a previously-ejected endpoint to the pool. - Uneject(EndpointAddress), -} +/// Shared map of per-endpoint outlier state, keyed by address. The +/// load balancer inserts each [`ReadyChannel`]'s +/// [`OutlierChannelState`] on connect and removes it on disconnect; the +/// detector iterates the map on each sweep. +/// +/// [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel +pub(crate) type OutlierStatsRegistry = Arc>>; /// Probability source for `enforcing_*` rolls. pub(crate) trait Rng: Send + Sync + 'static { @@ -77,9 +50,11 @@ impl Rng for FastRandRng { } } -/// Per-endpoint state held inside the detector. -struct EndpointState { - counters: Arc, +/// Algorithm-private per-endpoint state. Tracks the ejection-time +/// multiplier and the last ejection timestamp; counters and the +/// outward-facing ejection signal live on the channel's +/// [`OutlierChannelState`]. +struct AlgState { /// Number of times this endpoint has been ejected. Grows on each /// re-ejection and decays on each healthy interval. ejection_multiplier: u32, @@ -87,10 +62,9 @@ struct EndpointState { ejected_at: Option, } -impl EndpointState { +impl AlgState { fn new() -> Self { Self { - counters: Arc::new(EndpointCounters::default()), ejection_multiplier: 0, ejected_at: None, } @@ -99,16 +73,14 @@ impl EndpointState { /// gRFC A50 outlier detector. /// -/// Held by `&mut`; the consumer drives sweeps by calling -/// [`Self::maybe_run_sweep`] from its own event loop (typically the -/// load balancer's `poll_ready`). +/// Held by an actor task that ticks once per `config.interval` and +/// calls [`Self::run_sweep`] over the shared [`OutlierStatsRegistry`]. 
+/// Stats and ejection signals live on the channels themselves; the +/// detector owns only algorithm-private metadata (per-endpoint +/// multiplier and last-ejection timestamp). pub(crate) struct OutlierDetector { config: OutlierDetectionConfig, - state: HashMap, - /// Wall-clock time of the last sweep that actually ran. `None` - /// before the first sweep, so the first call to `maybe_run_sweep` - /// always runs. - last_sweep_at: Option, + state: HashMap, rng: Box, } @@ -123,66 +95,43 @@ impl OutlierDetector { Self { config, state: HashMap::new(), - last_sweep_at: None, rng, } } - /// Register an endpoint and return its lock-free counter handle. - /// The caller wires this handle into the data-path RPC interceptor. + /// Run one sweep at logical time `now` over the shared registry. + /// Applies ejection decisions inline by calling + /// [`OutlierChannelState::eject`] / [`OutlierChannelState::uneject`] + /// on each affected entry. /// - /// Adding an already-registered address returns the existing handle. - pub(crate) fn add_endpoint(&mut self, addr: EndpointAddress) -> Arc { - self.state - .entry(addr) - .or_insert_with(EndpointState::new) - .counters - .clone() - } - - /// Forget a previously-registered endpoint, dropping its counters - /// and ejection state. No `Uneject` decision is emitted if the - /// endpoint was ejected; the caller handles removal directly. - pub(crate) fn remove_endpoint(&mut self, addr: &EndpointAddress) { - self.state.remove(addr); - } - - /// Run a sweep at logical time `now`, returning the resulting - /// decisions. Sweeps are gated to at most one per `config.interval`; - /// calls inside the gate return an empty vector and leave state - /// untouched. The first call after construction always sweeps. 
- pub(crate) fn maybe_run_sweep(&mut self, now: Instant) -> Vec { - if let Some(last) = self.last_sweep_at - && now.duration_since(last) < self.config.interval - { - return Vec::new(); - } - self.last_sweep_at = Some(now); - self.run_sweep(now) - } - - /// Run one sweep at logical time `now` unconditionally and return - /// the resulting decisions, in gRFC A50 step order: + /// Order of operations follows gRFC A50: /// 1. Record the timestamp. - /// 2. Swap each address's call-counter buckets. - /// 3. Run the success-rate algorithm if configured. + /// 2. Snapshot each address's call-counter buckets. + /// 3. Run the success-rate algorithm if configured (not yet dispatched). /// 4. Run the failure-percentage algorithm if configured. - /// 5. For each address: decrement the multiplier of non-ejected - /// addresses with multiplier > 0, and un-eject ejected addresses - /// whose backoff has elapsed. - pub(crate) fn run_sweep(&mut self, now: Instant) -> Vec { - // Step 2: snapshot every endpoint's counters. - let mut snapshots: Vec = Vec::with_capacity(self.state.len()); - for (addr, ep) in self.state.iter_mut() { - let (success, failure) = ep.counters.snapshot_and_reset(); + /// 5. Decrement the multiplier of non-ejected addresses with + /// multiplier > 0; un-eject ejected addresses whose backoff has + /// elapsed. + pub(crate) fn run_sweep(&mut self, now: Instant, channels: &OutlierStatsRegistry) { + // Step 2: snapshot every channel's counters and record which + // addresses are still in the registry. 
+ let mut snapshots: Vec = Vec::with_capacity(channels.len()); + let mut seen: HashSet = HashSet::with_capacity(channels.len()); + for entry in channels.iter() { + let addr = entry.key().clone(); + let (success, failure) = entry.value().snapshot_and_reset(); + let alg = self.state.entry(addr.clone()).or_insert_with(AlgState::new); snapshots.push(Candidate { addr: addr.clone(), success, failure, total: success + failure, - already_ejected: ep.ejected_at.is_some(), + already_ejected: alg.ejected_at.is_some(), }); + seen.insert(addr); } + // Drop algorithm state for addresses no longer in the registry. + self.state.retain(|addr, _| seen.contains(addr)); // Per-sweep cap on new ejections, enforced as a budget the // algorithms decrement. Per A50, the check happens before each @@ -194,25 +143,27 @@ impl OutlierDetector { let already_ejected = self .state .values() - .filter(|ep| ep.ejected_at.is_some()) + .filter(|s| s.ejected_at.is_some()) .count(); let mut budget = max_ejections.saturating_sub(already_ejected); - // Steps 3 & 4: run the algorithms on the snapshot. Ejected - // hosts have no in-interval traffic in production and so - // naturally fail the `request_volume` gate; iterating every - // address (per spec) is equivalent to iterating non-ejected - // ones. Step 3 (success-rate ejection) is not yet dispatched. + // Steps 3 & 4: run the algorithms. Ejected hosts have no + // in-interval traffic in production and so naturally fail the + // `request_volume` gate; iterating every address (per spec) is + // equivalent to iterating non-ejected ones. Step 3 (success- + // rate ejection) is not yet dispatched. 
let mut to_eject: Vec = Vec::new(); - if let Some(fp) = self.config.failure_percentage.as_ref() { run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject, &*self.rng); } for addr in &to_eject { - if let Some(ep) = self.state.get_mut(addr) { - ep.ejected_at = Some(now); - ep.ejection_multiplier = ep.ejection_multiplier.saturating_add(1); + if let Some(alg) = self.state.get_mut(addr) { + alg.ejected_at = Some(now); + alg.ejection_multiplier = alg.ejection_multiplier.saturating_add(1); + } + if let Some(state) = channels.get(addr) { + state.eject(); } } @@ -224,31 +175,58 @@ impl OutlierDetector { .config .base_ejection_time .max(self.config.max_ejection_time); - let mut to_uneject: Vec = Vec::new(); - for (addr, ep) in self.state.iter_mut() { - if let Some(at) = ep.ejected_at { + for (addr, alg) in self.state.iter_mut() { + if let Some(at) = alg.ejected_at { if let Some(scaled) = self .config .base_ejection_time - .checked_mul(ep.ejection_multiplier) + .checked_mul(alg.ejection_multiplier) && now.duration_since(at) >= scaled.min(cap) { - ep.ejected_at = None; - to_uneject.push(addr.clone()); + alg.ejected_at = None; + if let Some(state) = channels.get(addr) { + state.uneject(); + } } - } else if ep.ejection_multiplier > 0 { - ep.ejection_multiplier -= 1; + } else if alg.ejection_multiplier > 0 { + alg.ejection_multiplier -= 1; } } + } - let mut decisions = Vec::with_capacity(to_uneject.len() + to_eject.len()); - for addr in to_uneject { - decisions.push(EjectionDecision::Uneject(addr)); - } - for addr in to_eject { - decisions.push(EjectionDecision::Eject(addr)); - } - decisions + /// Spawn the detector as an actor task with the default RNG. The + /// task ticks every `config.interval` and runs a sweep over the + /// shared registry. Dropping the returned [`AbortOnDrop`] stops + /// the task. 
+ pub(crate) fn spawn( + config: OutlierDetectionConfig, + channels: OutlierStatsRegistry, + ) -> AbortOnDrop { + Self::spawn_inner(Self::new(config), channels) + } + + /// Variant of [`Self::spawn`] that accepts a custom [`Rng`]. + pub(crate) fn spawn_with_rng( + config: OutlierDetectionConfig, + rng: Box, + channels: OutlierStatsRegistry, + ) -> AbortOnDrop { + Self::spawn_inner(Self::with_rng(config, rng), channels) + } + + fn spawn_inner(mut detector: Self, channels: OutlierStatsRegistry) -> AbortOnDrop { + // NOTE(review): `tokio::time::interval` panics if the period is + // zero — confirm `config.interval` is validated as non-zero + // upstream; the panic would occur inside the spawned task and + // stop all further sweeps. + let interval = detector.config.interval; + let task = tokio::spawn(async move { + let mut ticker = tokio::time::interval(interval); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + // First tick fires immediately so the actor runs an initial + // sweep on startup; subsequent ticks fire on the interval. + loop { + ticker.tick().await; + detector.run_sweep(Instant::now(), &channels); + } + }); + AbortOnDrop(task) } } @@ -322,8 +300,10 @@ struct Candidate { #[cfg(test)] mod tests { use super::*; - use crate::xds::resource::outlier_detection::Percentage; - use std::sync::atomic::AtomicU32; + use crate::xds::resource::outlier_detection::{ + FailurePercentageConfig, OutlierDetectionConfig, Percentage, + }; + use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Duration; // ----- Fixtures ----- @@ -336,7 +316,6 @@ mod tests { Percentage::new(v).unwrap() } - /// Base config with both algorithms disabled; tests opt in. fn base_config() -> OutlierDetectionConfig { OutlierDetectionConfig { interval: Duration::from_secs(1), @@ -348,15 +327,27 @@ mod tests { } } - /// Deterministic RNG: `pct_roll()` returns a fixed value, configurable. 
+ fn fp_config( + threshold: u32, + request_volume: u32, + minimum_hosts: u32, + ) -> OutlierDetectionConfig { + let mut c = base_config(); + c.failure_percentage = Some(FailurePercentageConfig { + threshold: pct(threshold), + enforcing_failure_percentage: pct(100), + minimum_hosts, + request_volume, + }); + c + } + + /// Deterministic RNG: `pct_roll()` returns a fixed value. struct FixedRng(AtomicU32); impl FixedRng { - fn new(value: u32) -> Self { - Self(AtomicU32::new(value)) - } fn boxed(value: u32) -> Box { - Box::new(Self::new(value)) + Box::new(Self(AtomicU32::new(value))) } } @@ -370,69 +361,63 @@ mod tests { OutlierDetector::with_rng(config, rng) } - // ----- EndpointCounters ----- + fn registry() -> OutlierStatsRegistry { + Arc::new(DashMap::new()) + } + + fn add(channels: &OutlierStatsRegistry, port: u16) -> Arc { + let state = Arc::new(OutlierChannelState::new()); + channels.insert(addr(port), state.clone()); + state + } - #[test] - fn counters_record_and_reset() { - let c = EndpointCounters::default(); - c.record_success(); - c.record_success(); - c.record_failure(); - assert_eq!(c.snapshot_and_reset(), (2, 1)); - assert_eq!(c.snapshot_and_reset(), (0, 0)); + fn ejected(channels: &OutlierStatsRegistry, port: u16) -> bool { + channels + .get(&addr(port)) + .map(|e| e.value().is_ejected()) + .unwrap_or(false) + } + + fn ejected_count(channels: &OutlierStatsRegistry) -> usize { + channels.iter().filter(|e| e.value().is_ejected()).count() } - // ----- add_endpoint / remove_endpoint ----- + // ----- OutlierChannelState (sanity) ----- #[test] - fn add_endpoint_returns_shared_counter() { - let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - let h1 = detector.add_endpoint(addr(8080)); - let h2 = detector.add_endpoint(addr(8080)); - assert!( - Arc::ptr_eq(&h1, &h2), - "second add should return same handle" - ); - h1.record_success(); - assert_eq!(h2.snapshot_and_reset(), (1, 0)); + fn channel_state_records_and_resets() { + let s = 
OutlierChannelState::new(); + s.record_success(); + s.record_success(); + s.record_failure(); + assert_eq!(s.snapshot_and_reset(), (2, 1)); + assert_eq!(s.snapshot_and_reset(), (0, 0)); } #[test] - fn remove_endpoint_drops_state() { - let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - detector.add_endpoint(addr(8080)); - detector.remove_endpoint(&addr(8080)); - assert!(detector.state.is_empty()); + fn channel_state_eject_uneject_flips_signal() { + let s = OutlierChannelState::new(); + assert!(!s.is_ejected()); + s.eject(); + assert!(s.is_ejected()); + s.uneject(); + assert!(!s.is_ejected()); } // ----- Failure-percentage algorithm ----- - fn fp_config( - threshold: u32, - request_volume: u32, - minimum_hosts: u32, - ) -> OutlierDetectionConfig { - let mut c = base_config(); - c.failure_percentage = Some(FailurePercentageConfig { - threshold: pct(threshold), - enforcing_failure_percentage: pct(100), - minimum_hosts, - request_volume, - }); - c - } - #[test] fn failure_percentage_ejects_above_threshold() { let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); - // 4 healthy endpoints + 1 bad one. 
+ let channels = registry(); + for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_success(); + s.record_success(); } } - let bad = detector.add_endpoint(addr(8084)); + let bad = add(&channels, 8084); for _ in 0..90 { bad.record_failure(); } @@ -440,72 +425,80 @@ mod tests { bad.record_success(); } - let decisions = detector.run_sweep(Instant::now()); - assert_eq!(decisions, vec![EjectionDecision::Eject(addr(8084))]); + detector.run_sweep(Instant::now(), &channels); + assert!(bad.is_ejected()); + for port in 8080..=8083 { + assert!(!ejected(&channels, port)); + } } #[test] fn failure_percentage_skips_below_threshold() { let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let channels = registry(); for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); // 30% failure → below threshold of 50%. for _ in 0..70 { - h.record_success(); + s.record_success(); } for _ in 0..30 { - h.record_failure(); + s.record_failure(); } } - assert!(detector.run_sweep(Instant::now()).is_empty()); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 0); } #[test] fn failure_percentage_at_threshold_does_not_eject() { - // A50 specifies a strict "greater than" comparison: an address - // sitting exactly at the threshold should *not* be ejected. + // A50 specifies a strict "greater than" comparison. let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(0)); + let channels = registry(); for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); - // Exactly 50% failure rate — equal to the threshold. 
+ let s = add(&channels, port); for _ in 0..50 { - h.record_success(); + s.record_success(); } for _ in 0..50 { - h.record_failure(); + s.record_failure(); } } - assert!(detector.run_sweep(Instant::now()).is_empty()); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 0); } #[test] fn minimum_hosts_gates_failure_percentage() { let mut detector = detector_with_rng(fp_config(50, 10, 5), FixedRng::boxed(99)); + let channels = registry(); // Only 2 hosts have request_volume ≥ 10; minimum_hosts is 5 ⇒ skip. for port in 8080..=8081 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_failure(); + s.record_failure(); } } - assert!(detector.run_sweep(Instant::now()).is_empty()); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 0); } #[test] fn request_volume_filters_low_traffic_endpoints() { let mut detector = detector_with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); - // Bad endpoint, but only 5 requests — below request_volume=100. - let bad = detector.add_endpoint(addr(8080)); + let channels = registry(); + let bad = add(&channels, 8080); for _ in 0..5 { bad.record_failure(); } for port in 8081..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..200 { - h.record_success(); + s.record_success(); } } - assert!(detector.run_sweep(Instant::now()).is_empty()); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 0); } #[test] @@ -516,16 +509,16 @@ mod tests { .as_mut() .unwrap() .enforcing_failure_percentage = pct(0); - // Roll = 0 wouldn't trigger anyway since `roll(0)` short-circuits; - // pin the RNG to 0 just to be explicit. 
let mut detector = detector_with_rng(config, FixedRng::boxed(0)); + let channels = registry(); for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_failure(); + s.record_failure(); } } - assert!(detector.run_sweep(Instant::now()).is_empty()); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 0); } // ----- Ejection multiplier / un-ejection ----- @@ -536,195 +529,193 @@ mod tests { config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + let channels = registry(); for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); if port == 8084 { for _ in 0..100 { - h.record_failure(); + s.record_failure(); } } else { for _ in 0..100 { - h.record_success(); + s.record_success(); } } } let t0 = Instant::now(); - assert_eq!( - detector.run_sweep(t0), - vec![EjectionDecision::Eject(addr(8084))], - ); + detector.run_sweep(t0, &channels); + assert!(ejected(&channels, 8084)); // Still ejected just before base_ejection_time elapses. - assert!(detector.run_sweep(t0 + Duration::from_secs(9)).is_empty()); + detector.run_sweep(t0 + Duration::from_secs(9), &channels); + assert!(ejected(&channels, 8084)); // Un-eject after `base * multiplier(=1)` = 10s. - assert_eq!( - detector.run_sweep(t0 + Duration::from_secs(10)), - vec![EjectionDecision::Uneject(addr(8084))], - ); + detector.run_sweep(t0 + Duration::from_secs(10), &channels); + assert!(!ejected(&channels, 8084)); } #[test] fn re_ejection_doubles_duration() { - // The multiplier doubles only when un-ejection and re-ejection - // happen in the *same* sweep — at that point the multiplier- - // decrement step has skipped the (still-ejected-at-start) - // endpoint, so re-ejection increments it from 1 to 2. 
+ // Same-sweep un-eject + re-eject grows the multiplier 1 → 2. let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + let channels = registry(); - let bad = addr(8084); - let bad_h = detector.add_endpoint(bad.clone()); + let bad = add(&channels, 8084); for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_success(); + s.record_success(); } } for _ in 0..100 { - bad_h.record_failure(); + bad.record_failure(); } // Sweep 1: eject. Multiplier 0 → 1. let t0 = Instant::now(); - assert_eq!( - detector.run_sweep(t0), - vec![EjectionDecision::Eject(bad.clone())], - ); + detector.run_sweep(t0, &channels); + assert!(bad.is_ejected()); - // Re-record stats so sweep 2's snapshot has volume to evaluate. + // Re-record stats so sweep 2 has volume to evaluate. for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); + let s = channels.get(&addr(port)).unwrap().value().clone(); for _ in 0..100 { - h.record_success(); + s.record_success(); } } for _ in 0..100 { - bad_h.record_failure(); + bad.record_failure(); } - // Sweep 2 at t0+10: re-ejection happens before the un-eject - // housekeeping step (per A50 ordering), so `ejected_at` is - // refreshed to `now` and the un-eject check sees zero elapsed - // time. Only an Eject decision is emitted; the multiplier moves - // 1 → 2. - assert_eq!( - detector.run_sweep(t0 + Duration::from_secs(10)), - vec![EjectionDecision::Eject(bad.clone())], - ); + // Sweep 2 at t0+10: re-ejection refreshes timestamp, multiplier 1 → 2. + detector.run_sweep(t0 + Duration::from_secs(10), &channels); + assert!(bad.is_ejected()); // Re-ejection started at t0+10 with multiplier=2 → duration 20s. - // Still ejected 19s later (29s after t0). 
- assert!(detector.run_sweep(t0 + Duration::from_secs(29)).is_empty()); + detector.run_sweep(t0 + Duration::from_secs(29), &channels); + assert!(bad.is_ejected()); // Un-ejects at the 20s mark (30s after t0). - assert_eq!( - detector.run_sweep(t0 + Duration::from_secs(30)), - vec![EjectionDecision::Uneject(bad)], - ); + detector.run_sweep(t0 + Duration::from_secs(30), &channels); + assert!(!bad.is_ejected()); } #[test] fn ejection_capped_by_max_ejection_time() { - // base=10s, max=15s, multiplier=10 → cap at 15s rather than 100s. let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(15); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + let channels = registry(); for port in 8080..=8084 { - detector.add_endpoint(addr(port)); + add(&channels, port); } let t0 = Instant::now(); - // Force multiplier=10 directly. - { - let ep = detector.state.get_mut(&addr(8084)).unwrap(); - ep.ejection_multiplier = 10; - ep.ejected_at = Some(t0); - } - // After base*multiplier (= 100s) the cap (= 15s) has long passed, - // so a sweep at 16s should un-eject. - let decisions = detector.run_sweep(t0 + Duration::from_secs(16)); - assert_eq!(decisions, vec![EjectionDecision::Uneject(addr(8084))]); + // Force multiplier=10 on 8084 directly. We need to drive a + // first sweep to populate `state[8084]`, then fix it up. + detector.run_sweep(t0, &channels); + let alg = detector.state.get_mut(&addr(8084)).unwrap(); + alg.ejection_multiplier = 10; + alg.ejected_at = Some(t0); + channels.get(&addr(8084)).unwrap().value().eject(); + + // base*multiplier = 100s; cap = 15s → un-eject after 16s. + detector.run_sweep(t0 + Duration::from_secs(16), &channels); + assert!(!ejected(&channels, 8084)); } #[test] fn max_ejection_percent_caps_concurrent_ejections() { - // 5 hosts, all bad, but max_ejection_percent=20 ⇒ at most 1 ejected. 
let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(20); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + let channels = registry(); for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_failure(); + s.record_failure(); } } - let mut decisions = detector.run_sweep(Instant::now()); - decisions.sort(); - let ejects = decisions - .iter() - .filter(|d| matches!(d, EjectionDecision::Eject(_))) - .count(); - assert_eq!(ejects, 1, "max_ejection_percent=20% of 5 hosts ⇒ 1"); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 1); } #[test] fn already_ejected_re_ejection_does_not_consume_budget() { - // 5 hosts: one already ejected (with stats from in-flight RPCs - // accumulated during its backoff), four newly bad. Cap permits - // 3 concurrently ejected hosts (60% of 5), with 1 already taken - // by the pre-ejected host — so 2 new ejections remain in budget. + // 5 hosts: one already ejected, four newly bad. Cap permits 3 + // concurrently ejected, with 1 already taken — so 2 new + // ejections remain in budget. let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(60); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + let channels = registry(); - // Pre-eject host 8080 directly and give it bad in-flight stats. - let already_bad = detector.add_endpoint(addr(8080)); + // Pre-eject 8080 by driving one sweep with bad stats. + let already_bad = add(&channels, 8080); for _ in 0..100 { already_bad.record_failure(); } - { - let ep = detector.state.get_mut(&addr(8080)).unwrap(); - ep.ejected_at = Some(Instant::now()); - ep.ejection_multiplier = 1; + // Use a tiny first sweep to enter ejected state via the algorithm. 
+ // Need at least minimum_hosts=3 candidates with volume; add three + // healthy hosts with ≥10 requests so the algorithm runs and the + // single bad one is ejected (cap 60% of 4 hosts = 2 → budget 2 → 1 + // new ejection). + for port in 8085..=8087 { + let s = add(&channels, port); + for _ in 0..100 { + s.record_success(); + } } - - // Four more bad hosts. + let t0 = Instant::now(); + detector.run_sweep(t0, &channels); + assert!(already_bad.is_ejected()); + + // Now grow the cluster to 5 hosts (8080 + 8081..=8084) and feed + // bad stats. 8085..=8087 are no longer relevant — drop them. + channels.remove(&addr(8085)); + channels.remove(&addr(8086)); + channels.remove(&addr(8087)); for port in 8081..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_failure(); + s.record_failure(); } } + for _ in 0..100 { + already_bad.record_failure(); + } - let mut decisions = detector.run_sweep(Instant::now()); - decisions.sort(); - let new_ejects = decisions - .iter() - .filter(|d| matches!(d, EjectionDecision::Eject(a) if *a != addr(8080))) - .count(); - assert_eq!(new_ejects, 2, "expected 2 new ejections under the cap"); + detector.run_sweep(t0 + Duration::from_secs(2), &channels); + // Cap = 60% of 5 = 3. already_ejected = 1. Budget = 2. Plus + // 8080's re-eject which doesn't consume budget. So 2 NEW + // ejections among 8081..=8084. + let new_ejects = (8081..=8084).filter(|p| ejected(&channels, *p)).count(); + assert_eq!(new_ejects, 2); } #[test] fn multiplier_decrements_on_healthy_interval() { let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - let h = detector.add_endpoint(addr(8080)); + let channels = registry(); + let s = add(&channels, 8080); + // First sweep populates the alg state. + detector.run_sweep(Instant::now(), &channels); // Force multiplier to 3 without ejecting. 
detector .state .get_mut(&addr(8080)) .unwrap() .ejection_multiplier = 3; - // Healthy interval (some traffic, no ejection). - h.record_success(); - detector.run_sweep(Instant::now()); + s.record_success(); + detector.run_sweep(Instant::now(), &channels); assert_eq!( detector.state.get(&addr(8080)).unwrap().ejection_multiplier, 2, @@ -734,109 +725,87 @@ mod tests { #[test] fn multiplier_decrements_even_without_traffic() { // A50: a non-ejected address with multiplier > 0 has its - // multiplier decremented every sweep, regardless of whether it - // received any RPCs that interval. + // multiplier decremented every sweep, regardless of traffic. let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - detector.add_endpoint(addr(8080)); + let channels = registry(); + add(&channels, 8080); + detector.run_sweep(Instant::now(), &channels); detector .state .get_mut(&addr(8080)) .unwrap() .ejection_multiplier = 3; - // No traffic recorded. - detector.run_sweep(Instant::now()); + detector.run_sweep(Instant::now(), &channels); assert_eq!( detector.state.get(&addr(8080)).unwrap().ejection_multiplier, 2, ); } - // ----- maybe_run_sweep gating ----- - #[test] - fn maybe_run_sweep_runs_on_first_call() { - // `last_sweep_at` starts as `None`, so the first call always - // sweeps regardless of the wall clock argument. 
- let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); - for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); - for _ in 0..100 { - h.record_success(); - } - } - let bad = detector.add_endpoint(addr(8084)); - for _ in 0..100 { - bad.record_failure(); - } - let decisions = detector.maybe_run_sweep(Instant::now()); - assert_eq!(decisions, vec![EjectionDecision::Eject(addr(8084))]); + fn alg_state_dropped_when_channel_removed() { + let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); + let channels = registry(); + add(&channels, 8080); + add(&channels, 8081); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(detector.state.len(), 2); + + channels.remove(&addr(8080)); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(detector.state.len(), 1); + assert!(detector.state.contains_key(&addr(8081))); } - #[test] - fn maybe_run_sweep_skips_when_interval_not_elapsed() { + // ----- Spawned actor ----- + + #[tokio::test(start_paused = true)] + async fn spawned_actor_runs_sweeps_on_tick() { let mut config = fp_config(50, 10, 3); - config.interval = Duration::from_secs(10); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + config.interval = Duration::from_millis(100); + let channels = registry(); + for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_success(); + s.record_success(); } } - let bad = detector.add_endpoint(addr(8084)); + let bad = add(&channels, 8084); for _ in 0..100 { bad.record_failure(); } - // First call always runs. 
- let t0 = Instant::now(); - assert_eq!( - detector.maybe_run_sweep(t0), - vec![EjectionDecision::Eject(addr(8084))], - ); + let _abort = OutlierDetector::spawn_with_rng(config, FixedRng::boxed(99), channels.clone()); - // Re-arm with bad stats; second call Date: Fri, 8 May 2026 15:09:05 -0700 Subject: [PATCH 20/39] refactor(tonic-xds): per-RPC outlier detection + actor for housekeeping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pivot the algorithm split per design feedback: - Per-RPC detection runs inline on each call completion via `OutlierStatsRegistry::record_outcome`. The wrapper records the outcome on the channel's `OutlierChannelState`, evaluates the failure-percentage threshold against the channel's local counters, and ejects the channel directly by flipping its `watch::Sender`. Cluster-wide gates (`minimum_hosts`, `max_ejection_percent`) are enforced via two atomic counters on the registry, kept in sync as channels cross thresholds. - The spawned actor runs only interval-boundary housekeeping: counter reset, un-eject if backoff has elapsed, decrement multipliers for non-ejected channels. The actor never makes ejection decisions. Reaction latency drops from up to one `interval` (default 10s) to the first failed RPC after `request_volume` is reached, while `LoadBalancer::poll_ready` stays O(1) — ejections are observed via per-channel `watch::Receiver::changed()` futures in a `FuturesUnordered`, which the integration PR will wire. Implementation: - `OutlierChannelState` (channel_state.rs) gains atomic ejection-time state: `is_qualifying: AtomicBool`, `ejection_multiplier: AtomicU32`, `ejected_at_nanos: AtomicU64` with a constant `epoch: Instant`. `try_eject` / `try_uneject` are CAS-style and return whether the call performed the transition, so callers can update registry counters exactly once. - `OutlierStatsRegistry` (outlier_detection.rs) is the new central type. 
Holds the `DashMap<EndpointAddress, Arc<OutlierChannelState>>`, cluster-wide atomic counters, config, and RNG. All methods take `&self` (concurrent access from data path and actor). - `OutlierDetector` struct removed; everything lives on the registry. The actor is spawned via the free `spawn_actor(registry)` function. - Tests rewritten: drive `record_outcome` and observe `is_ejected()`; drive `run_housekeeping` for interval-boundary scenarios. --- .../src/client/loadbalance/channel_state.rs | 131 ++- .../client/loadbalance/outlier_detection.rs | 949 +++++++----------- 2 files changed, 471 insertions(+), 609 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index fb534b9cd..7885705a1 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -26,9 +26,9 @@ use std::future::Future; use std::pin::Pin; use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}; use std::task::{Context, Poll}; -use std::time::Duration; +use std::time::{Duration, Instant}; use pin_project_lite::pin_project; use tokio::sync::watch; @@ -71,22 +71,37 @@ impl EndpointCounters { } } -/// Per-channel outlier-detection state, shared between the data path -/// (for outcome recording) and the outlier-detection actor (for sweeps -/// and ejection signalling). +/// Per-channel outlier-detection state, shared (via `Arc`) between +/// the data path (per-RPC outcome recording + threshold-based ejection) +/// and the outlier-detection actor (interval-based housekeeping). /// -/// The ejection signal is edge-triggered: the actor calls [`eject`] / -/// [`uneject`] to flip the flag; observers subscribe via -/// [`subscribe`] and poll `Receiver::changed()` (typically inside a -/// `FuturesUnordered`) to react in O(1) on each transition. 
+/// Ejection is edge-triggered: callers flip the flag via [`eject`] / +/// [`uneject`]; observers poll `Receiver::changed()` (typically inside +/// a `FuturesUnordered`) to react in O(1) on each transition. +/// +/// All fields are atomics or wrapped in lock-free primitives so the +/// data path can mutate them without locking. /// /// [`eject`]: Self::eject /// [`uneject`]: Self::uneject -/// [`subscribe`]: Self::subscribe #[derive(Debug)] pub(crate) struct OutlierChannelState { counters: EndpointCounters, eject_tx: watch::Sender, + /// Whether this channel currently contributes to the registry's + /// `qualifying_count`. Set when `total` first reaches + /// `request_volume` in the current interval; cleared on counter + /// reset. + is_qualifying: AtomicBool, + /// Number of times this channel has been ejected. Bumped on each + /// ejection; decremented (saturating) on each healthy interval. + ejection_multiplier: AtomicU32, + /// `0` when not ejected. Otherwise nanos since [`Self::epoch`] of + /// the current ejection's start. + ejected_at_nanos: AtomicU64, + /// Reference instant used as the origin for `ejected_at_nanos`. + /// Established at construction and never changes. + epoch: Instant, } impl Default for OutlierChannelState { @@ -101,6 +116,10 @@ impl OutlierChannelState { Self { counters: EndpointCounters::default(), eject_tx, + is_qualifying: AtomicBool::new(false), + ejection_multiplier: AtomicU32::new(0), + ejected_at_nanos: AtomicU64::new(0), + epoch: Instant::now(), } } @@ -112,14 +131,39 @@ impl OutlierChannelState { self.counters.record_failure(); } - /// Atomically read and zero the counters. Returns `(success, failure)`. + /// Read the current counter values without resetting. Returns + /// `(success, failure)`. The two reads are not atomic against + /// each other but the difference is bounded by concurrent in-flight + /// RPCs and is below the precision of the failure-percentage check. 
+ pub(crate) fn counters(&self) -> (u64, u64) { + let s = self.counters.success.load(Ordering::Relaxed); + let f = self.counters.failure.load(Ordering::Relaxed); + (s, f) + } + + /// Read and zero the counters. Returns `(success, failure)`. pub(crate) fn snapshot_and_reset(&self) -> (u64, u64) { self.counters.snapshot_and_reset() } - /// Flip the ejection flag to `true`. No-op if already ejected. - pub(crate) fn eject(&self) { - self.eject_tx.send_if_modified(|state| { + /// Try to set `is_qualifying` to `true`. Returns `true` if this + /// call performed the false → true transition, so callers can + /// increment a registry-level counter exactly once per crossing. + pub(crate) fn mark_qualifying(&self) -> bool { + !self.is_qualifying.swap(true, Ordering::AcqRel) + } + + /// Clear `is_qualifying`. Returns the previous value. + pub(crate) fn clear_qualifying(&self) -> bool { + self.is_qualifying.swap(false, Ordering::AcqRel) + } + + /// Flip the ejection flag to `true`. Returns `true` if this call + /// performed the false → true transition (so callers can update + /// registry-level counters exactly once per ejection). + /// Records the ejection timestamp and bumps the multiplier. + pub(crate) fn try_eject(&self, now: Instant) -> bool { + let won = self.eject_tx.send_if_modified(|state| { if *state { false } else { @@ -127,11 +171,24 @@ impl OutlierChannelState { true } }); - } - - /// Flip the ejection flag back to `false`. No-op if not ejected. - pub(crate) fn uneject(&self) { - self.eject_tx.send_if_modified(|state| { + if !won { + return false; + } + let nanos = now + .saturating_duration_since(self.epoch) + .as_nanos() + .min(u64::MAX as u128) as u64; + // Use 1 as a sentinel if the channel was created at exactly + // `now`, since 0 means "not ejected". + self.ejected_at_nanos.store(nanos.max(1), Ordering::Relaxed); + self.ejection_multiplier.fetch_add(1, Ordering::Relaxed); + true + } + + /// Flip the ejection flag back to `false`. 
Returns `true` if this + /// call performed the true → false transition. + pub(crate) fn try_uneject(&self) -> bool { + let won = self.eject_tx.send_if_modified(|state| { if *state { *state = false; true @@ -139,6 +196,10 @@ impl OutlierChannelState { false } }); + if won { + self.ejected_at_nanos.store(0, Ordering::Relaxed); + } + won } /// Current ejection state. @@ -146,6 +207,31 @@ impl OutlierChannelState { *self.eject_tx.borrow() } + /// Returns the elapsed time since this channel was ejected, or + /// `None` if it is not currently ejected. + pub(crate) fn ejected_duration(&self, now: Instant) -> Option { + let nanos = self.ejected_at_nanos.load(Ordering::Relaxed); + if nanos == 0 { + return None; + } + let ejected_at = self.epoch + Duration::from_nanos(nanos); + Some(now.saturating_duration_since(ejected_at)) + } + + /// Current ejection multiplier. + pub(crate) fn ejection_multiplier(&self) -> u32 { + self.ejection_multiplier.load(Ordering::Relaxed) + } + + /// Decrement the multiplier saturating at zero. Called by the + /// actor on healthy intervals. + pub(crate) fn decrement_multiplier(&self) { + let prev = self.ejection_multiplier.load(Ordering::Relaxed); + if prev > 0 { + self.ejection_multiplier.store(prev - 1, Ordering::Relaxed); + } + } + /// Subscribe to ejection-state changes. The returned receiver's /// `changed()` future resolves on each transition; consumers /// typically push it into a `FuturesUnordered`. @@ -153,6 +239,13 @@ impl OutlierChannelState { pub(crate) fn subscribe(&self) -> watch::Receiver { self.eject_tx.subscribe() } + + /// Test-only setter for the ejection multiplier; lets tests drive + /// housekeeping behavior without going through `try_eject`. + #[cfg(test)] + pub(crate) fn set_ejection_multiplier(&self, value: u32) { + self.ejection_multiplier.store(value, Ordering::Relaxed); + } } /// Configuration for an ejected channel. 
diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 37023ad95..5295a09c7 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,23 +1,34 @@ -//! gRFC A50 outlier-detection sweep engine. +//! gRFC A50 outlier detection. //! -//! Reads per-endpoint counters from a shared -//! [`DashMap>`] and applies -//! ejection / un-ejection decisions in place by toggling each entry's -//! ejection signal. The load balancer registers each [`ReadyChannel`]'s -//! [`OutlierChannelState`] in the same map and observes the signal via -//! a `FuturesUnordered` of `watch::Receiver::changed()` futures, so the -//! O(n) sweep runs in a spawned actor task off the LB's critical path. +//! The algorithm is split between the data path and a spawned actor: //! -//! Only the failure-percentage algorithm is currently dispatched. If -//! [`OutlierDetectionConfig::success_rate`] is set, it is ignored. +//! - **Per-RPC detection** runs inline on each call completion via +//! [`OutlierStatsRegistry::record_outcome`]. The wrapper records the +//! outcome on the channel's [`OutlierChannelState`], evaluates the +//! failure-percentage threshold against the channel's local +//! counters, and ejects the channel directly by flipping its +//! `watch::Sender`. Cluster-wide gates (`minimum_hosts`, +//! `max_ejection_percent`) are enforced via two atomic counters on +//! the registry, kept in sync as channels cross thresholds. +//! - **Interval-based housekeeping** runs in a spawned actor (see +//! [`spawn_actor`]). It resets per-channel counters at the +//! `config.interval` boundary, un-ejects channels whose +//! `base × multiplier` backoff has elapsed, and decrements +//! multipliers for non-ejected channels. The actor never makes +//! ejection decisions. +//! +//! `LoadBalancer::poll_ready` observes ejections in O(1) per +//! 
transition by polling a `FuturesUnordered` +//! over each channel's signal. +//! +//! Only the failure-percentage algorithm is dispatched. The +//! success-rate algorithm (cross-endpoint mean/stdev) is left to a +//! follow-up. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md -//! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel -//! [`OutlierChannelState`]: crate::client::loadbalance::channel_state::OutlierChannelState -//! [`OutlierDetectionConfig::success_rate`]: crate::xds::resource::outlier_detection::OutlierDetectionConfig::success_rate -use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use std::time::Instant; use dashmap::DashMap; @@ -25,15 +36,7 @@ use dashmap::DashMap; use crate::client::endpoint::EndpointAddress; use crate::client::loadbalance::channel_state::OutlierChannelState; use crate::common::async_util::AbortOnDrop; -use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; - -/// Shared map of per-endpoint outlier state, keyed by address. The -/// load balancer inserts each [`ReadyChannel`]'s -/// [`OutlierChannelState`] on connect and removes it on disconnect; the -/// detector iterates the map on each sweep. -/// -/// [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel -pub(crate) type OutlierStatsRegistry = Arc>>; +use crate::xds::resource::outlier_detection::OutlierDetectionConfig; /// Probability source for `enforcing_*` rolls. pub(crate) trait Rng: Send + Sync + 'static { @@ -50,226 +53,184 @@ impl Rng for FastRandRng { } } -/// Algorithm-private per-endpoint state. Tracks the ejection-time -/// multiplier and the last ejection timestamp; counters and the -/// outward-facing ejection signal live on the channel's -/// [`OutlierChannelState`]. -struct AlgState { - /// Number of times this endpoint has been ejected. 
Grows on each - /// re-ejection and decays on each healthy interval. - ejection_multiplier: u32, - /// `Some(at)` when currently ejected; `None` otherwise. - ejected_at: Option, -} - -impl AlgState { - fn new() -> Self { - Self { - ejection_multiplier: 0, - ejected_at: None, - } - } -} - -/// gRFC A50 outlier detector. -/// -/// Held by an actor task that ticks once per `config.interval` and -/// calls [`Self::run_sweep`] over the shared [`OutlierStatsRegistry`]. -/// Stats and ejection signals live on the channels themselves; the -/// detector owns only algorithm-private metadata (per-endpoint -/// multiplier and last-ejection timestamp). -pub(crate) struct OutlierDetector { +/// Shared outlier-detection state, owned by `Arc` and accessed +/// concurrently by: +/// - The load balancer's call wrapper, which calls +/// [`Self::record_outcome`] after each RPC completion. +/// - The spawned actor task, which calls [`Self::run_housekeeping`] +/// on every `config.interval` tick. +/// - The load balancer's `poll_ready`, which subscribes to per-channel +/// ejection signals via [`OutlierChannelState::subscribe`]. +pub(crate) struct OutlierStatsRegistry { + /// Per-endpoint state, keyed by address. Inserted by the LB on + /// channel creation and removed on disconnect. + channels: DashMap>, + /// Number of channels currently with `total >= request_volume` in + /// the active interval. Drives the `minimum_hosts` gate. + qualifying_count: AtomicU64, + /// Number of channels currently ejected. Drives the + /// `max_ejection_percent` cap. + ejected_count: AtomicU64, config: OutlierDetectionConfig, - state: HashMap, rng: Box, } -impl OutlierDetector { - /// Build the detector with the default RNG (`fastrand`). - pub(crate) fn new(config: OutlierDetectionConfig) -> Self { +impl OutlierStatsRegistry { + /// Build a registry with the default RNG. 
+ pub(crate) fn new(config: OutlierDetectionConfig) -> Arc { Self::with_rng(config, Box::new(FastRandRng)) } - /// Build the detector with a custom [`Rng`]. - pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Self { - Self { + /// Build a registry with a custom [`Rng`]. + pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Arc { + Arc::new(Self { + channels: DashMap::new(), + qualifying_count: AtomicU64::new(0), + ejected_count: AtomicU64::new(0), config, - state: HashMap::new(), rng, - } + }) + } + + /// Register a new channel. Returns the `Arc` + /// the load balancer wires into the channel; the same `Arc` is + /// retained in the registry so the actor can iterate it. + pub(crate) fn add_channel(&self, addr: EndpointAddress) -> Arc { + let state = Arc::new(OutlierChannelState::new()); + self.channels.insert(addr, state.clone()); + state } - /// Run one sweep at logical time `now` over the shared registry. - /// Applies ejection decisions inline by calling - /// [`OutlierChannelState::eject`] / [`OutlierChannelState::uneject`] - /// on each affected entry. - /// - /// Order of operations follows gRFC A50: - /// 1. Record the timestamp. - /// 2. Snapshot each address's call-counter buckets. - /// 3. Run the success-rate algorithm if configured (not yet dispatched). - /// 4. Run the failure-percentage algorithm if configured. - /// 5. Decrement the multiplier of non-ejected addresses with - /// multiplier > 0; un-eject ejected addresses whose backoff has - /// elapsed. - pub(crate) fn run_sweep(&mut self, now: Instant, channels: &OutlierStatsRegistry) { - // Step 2: snapshot every channel's counters and record which - // addresses are still in the registry. 
- let mut snapshots: Vec = Vec::with_capacity(channels.len()); - let mut seen: HashSet = HashSet::with_capacity(channels.len()); - for entry in channels.iter() { - let addr = entry.key().clone(); - let (success, failure) = entry.value().snapshot_and_reset(); - let alg = self.state.entry(addr.clone()).or_insert_with(AlgState::new); - snapshots.push(Candidate { - addr: addr.clone(), - success, - failure, - total: success + failure, - already_ejected: alg.ejected_at.is_some(), - }); - seen.insert(addr); - } - // Drop algorithm state for addresses no longer in the registry. - self.state.retain(|addr, _| seen.contains(addr)); - - // Per-sweep cap on new ejections, enforced as a budget the - // algorithms decrement. Per A50, the check happens before each - // candidate. - let total_endpoints = self.state.len(); - let max_ejections = (total_endpoints as u64 - * u64::from(self.config.max_ejection_percent.get()) - / 100) as usize; - let already_ejected = self - .state - .values() - .filter(|s| s.ejected_at.is_some()) - .count(); - let mut budget = max_ejections.saturating_sub(already_ejected); - - // Steps 3 & 4: run the algorithms. Ejected hosts have no - // in-interval traffic in production and so naturally fail the - // `request_volume` gate; iterating every address (per spec) is - // equivalent to iterating non-ejected ones. Step 3 (success- - // rate ejection) is not yet dispatched. - let mut to_eject: Vec = Vec::new(); - if let Some(fp) = self.config.failure_percentage.as_ref() { - run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject, &*self.rng); - } - - for addr in &to_eject { - if let Some(alg) = self.state.get_mut(addr) { - alg.ejected_at = Some(now); - alg.ejection_multiplier = alg.ejection_multiplier.saturating_add(1); + /// Forget a channel. Drops the registry's reference; cluster-wide + /// counters are decremented if the channel was qualifying or + /// ejected. 
+ pub(crate) fn remove_channel(&self, addr: &EndpointAddress) { + if let Some((_, state)) = self.channels.remove(addr) { + if state.clear_qualifying() { + self.qualifying_count.fetch_sub(1, Ordering::Relaxed); } - if let Some(state) = channels.get(addr) { - state.eject(); + if state.is_ejected() { + self.ejected_count.fetch_sub(1, Ordering::Relaxed); } } + } + + /// Number of registered channels. + pub(crate) fn len(&self) -> usize { + self.channels.len() + } + + /// Per-RPC entry point. Called by the load balancer's call wrapper + /// after each RPC completion. Increments the channel's success or + /// failure counter and then evaluates the failure-percentage + /// threshold; if all gates pass, ejects the channel inline. + pub(crate) fn record_outcome(&self, state: &OutlierChannelState, success: bool) { + if success { + state.record_success(); + } else { + state.record_failure(); + } + + let Some(fp) = self.config.failure_percentage.as_ref() else { + return; + }; + + let (s, f) = state.counters(); + let total = s + f; + let request_volume = u64::from(fp.request_volume); + + // Track when each channel first qualifies in the current + // interval, so the `minimum_hosts` gate can be checked with a + // single atomic load. + if total >= request_volume && state.mark_qualifying() { + self.qualifying_count.fetch_add(1, Ordering::Relaxed); + } + + if state.is_ejected() { + return; + } + if total < request_volume { + return; + } + if self.qualifying_count.load(Ordering::Relaxed) < u64::from(fp.minimum_hosts) { + return; + } + if self.ejected_count.load(Ordering::Relaxed) >= self.max_ejections() { + return; + } - // Step 5: decrement multipliers for non-ejected addresses; - // un-eject ejected addresses whose backoff has elapsed. Runs - // *after* re-ejection, so a same-sweep re-eject refreshes - // `ejected_at` and the un-eject check sees zero elapsed time. + // failure_pct = 100 * failure / total. A50 uses strict ">". 
+ let failure_pct = 100 * f / total; + if failure_pct <= u64::from(fp.threshold.get()) { + return; + } + if !roll(&*self.rng, fp.enforcing_failure_percentage.get()) { + return; + } + + if state.try_eject(Instant::now()) { + self.ejected_count.fetch_add(1, Ordering::Relaxed); + } + } + + /// Interval-boundary housekeeping. Called by the spawned actor on + /// each `config.interval` tick. Resets counters, un-ejects + /// channels whose backoff has elapsed, and decrements multipliers + /// for non-ejected channels. + pub(crate) fn run_housekeeping(&self, now: Instant) { + // Cap the un-ejection backoff at `max(base, max_ejection_time)`. let cap = self .config .base_ejection_time .max(self.config.max_ejection_time); - for (addr, alg) in self.state.iter_mut() { - if let Some(at) = alg.ejected_at { - if let Some(scaled) = self - .config - .base_ejection_time - .checked_mul(alg.ejection_multiplier) - && now.duration_since(at) >= scaled.min(cap) + + for entry in self.channels.iter() { + let state = entry.value(); + + // Reset counters; clear `is_qualifying` and adjust the + // registry-level counter in lockstep. + state.snapshot_and_reset(); + if state.clear_qualifying() { + self.qualifying_count.fetch_sub(1, Ordering::Relaxed); + } + + if state.is_ejected() { + let multiplier = state.ejection_multiplier(); + let elapsed = state.ejected_duration(now).unwrap_or_default(); + if let Some(scaled) = self.config.base_ejection_time.checked_mul(multiplier) + && elapsed >= scaled.min(cap) + && state.try_uneject() { - alg.ejected_at = None; - if let Some(state) = channels.get(addr) { - state.uneject(); - } + self.ejected_count.fetch_sub(1, Ordering::Relaxed); } - } else if alg.ejection_multiplier > 0 { - alg.ejection_multiplier -= 1; + } else { + state.decrement_multiplier(); } } } - /// Spawn the detector as an actor task with the default RNG. The - /// task ticks every `config.interval` and runs a sweep over the - /// shared registry. 
Dropping the returned [`AbortOnDrop`] stops - /// the task. - pub(crate) fn spawn( - config: OutlierDetectionConfig, - channels: OutlierStatsRegistry, - ) -> AbortOnDrop { - Self::spawn_inner(Self::new(config), channels) - } - - /// Variant of [`Self::spawn`] that accepts a custom [`Rng`]. - pub(crate) fn spawn_with_rng( - config: OutlierDetectionConfig, - rng: Box, - channels: OutlierStatsRegistry, - ) -> AbortOnDrop { - Self::spawn_inner(Self::with_rng(config, rng), channels) - } - - fn spawn_inner(mut detector: Self, channels: OutlierStatsRegistry) -> AbortOnDrop { - let interval = detector.config.interval; - let task = tokio::spawn(async move { - let mut ticker = tokio::time::interval(interval); - ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - // First tick fires immediately so the actor runs an initial - // sweep on startup; subsequent ticks fire on the interval. - loop { - ticker.tick().await; - detector.run_sweep(Instant::now(), &channels); - } - }); - AbortOnDrop(task) + /// `max_ejection_percent` resolved against the current channel + /// count. Updated as channels come and go. + fn max_ejections(&self) -> u64 { + self.channels.len() as u64 * u64::from(self.config.max_ejection_percent.get()) / 100 } } -/// A50 failure-percentage algorithm. -fn run_failure_percentage( - cfg: &FailurePercentageConfig, - all: &[Candidate], - budget: &mut usize, - out: &mut Vec, - rng: &dyn Rng, -) { - let qualifying: Vec<&Candidate> = all - .iter() - .filter(|c| c.total >= u64::from(cfg.request_volume)) - .collect(); - if qualifying.len() < cfg.minimum_hosts as usize { - return; - } - - let threshold = u64::from(cfg.threshold.get()); - for c in qualifying { - if *budget == 0 { - break; - } - // A50 doesn't forbid `request_volume == 0`, in which case a - // candidate may have `total == 0`. The spec is silent on - // `0/0`; skip these endpoints rather than divide by zero. - if c.total == 0 { - continue; - } - // failure_pct = 100 * failure / total. 
A50 specifies a strict - // "greater than" comparison: an address sitting exactly at - // the threshold is not ejected. - let failure_pct = 100 * c.failure / c.total; - if failure_pct > threshold && roll(rng, cfg.enforcing_failure_percentage.get()) { - out.push(c.addr.clone()); - // See `Candidate::already_ejected` for why re-ejections - // don't consume the budget. - if !c.already_ejected { - *budget -= 1; - } - } - } +/// Spawn the housekeeping actor. The task ticks every +/// `config.interval` and calls +/// [`OutlierStatsRegistry::run_housekeeping`]. Dropping the returned +/// [`AbortOnDrop`] stops the task. +pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { + let interval = registry.config.interval; + let task = tokio::spawn(async move { + let mut ticker = tokio::time::interval(interval); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { + ticker.tick().await; + registry.run_housekeeping(Instant::now()); + } + }); + AbortOnDrop(task) } /// Return true with probability `pct / 100` (clamped at 100 ⇒ always). @@ -283,20 +244,6 @@ fn roll(rng: &dyn Rng, pct: u8) -> bool { rng.pct_roll() < u32::from(pct) } -/// Cached per-endpoint snapshot used during a sweep. -struct Candidate { - addr: EndpointAddress, - success: u64, - failure: u64, - total: u64, - /// Whether this address was already ejected at the start of the - /// sweep. Re-ejecting an already-ejected address refreshes its - /// timestamp and bumps its multiplier but doesn't change the count - /// of currently-ejected addresses, so it must not consume a - /// `max_ejection_percent` budget slot. 
- already_ejected: bool, -} - #[cfg(test)] mod tests { use super::*; @@ -306,8 +253,6 @@ mod tests { use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Duration; - // ----- Fixtures ----- - fn addr(port: u16) -> EndpointAddress { EndpointAddress::new("10.0.0.1", port) } @@ -357,148 +302,91 @@ mod tests { } } - fn detector_with_rng(config: OutlierDetectionConfig, rng: Box) -> OutlierDetector { - OutlierDetector::with_rng(config, rng) - } - - fn registry() -> OutlierStatsRegistry { - Arc::new(DashMap::new()) - } - - fn add(channels: &OutlierStatsRegistry, port: u16) -> Arc { - let state = Arc::new(OutlierChannelState::new()); - channels.insert(addr(port), state.clone()); - state - } - - fn ejected(channels: &OutlierStatsRegistry, port: u16) -> bool { - channels - .get(&addr(port)) - .map(|e| e.value().is_ejected()) - .unwrap_or(false) - } - - fn ejected_count(channels: &OutlierStatsRegistry) -> usize { - channels.iter().filter(|e| e.value().is_ejected()).count() - } - - // ----- OutlierChannelState (sanity) ----- - - #[test] - fn channel_state_records_and_resets() { - let s = OutlierChannelState::new(); - s.record_success(); - s.record_success(); - s.record_failure(); - assert_eq!(s.snapshot_and_reset(), (2, 1)); - assert_eq!(s.snapshot_and_reset(), (0, 0)); - } - - #[test] - fn channel_state_eject_uneject_flips_signal() { - let s = OutlierChannelState::new(); - assert!(!s.is_ejected()); - s.eject(); - assert!(s.is_ejected()); - s.uneject(); - assert!(!s.is_ejected()); + /// Drive `n` outcomes through `record_outcome` for one channel. 
+ fn drive( + registry: &OutlierStatsRegistry, + state: &OutlierChannelState, + successes: u64, + failures: u64, + ) { + for _ in 0..successes { + registry.record_outcome(state, true); + } + for _ in 0..failures { + registry.record_outcome(state, false); + } } - // ----- Failure-percentage algorithm ----- + // ----- record_outcome: failure-percentage detection ----- #[test] - fn failure_percentage_ejects_above_threshold() { - let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); - let channels = registry(); - + fn ejects_above_threshold_inline() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_success(); - } - } - let bad = add(&channels, 8084); - for _ in 0..90 { - bad.record_failure(); + let s = registry.add_channel(addr(port)); + drive(&registry, &s, 100, 0); } - for _ in 0..10 { - bad.record_success(); - } - - detector.run_sweep(Instant::now(), &channels); + drive(&registry, &bad, 10, 90); assert!(bad.is_ejected()); - for port in 8080..=8083 { - assert!(!ejected(&channels, port)); - } + assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); } #[test] - fn failure_percentage_skips_below_threshold() { - let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); - let channels = registry(); + fn skips_below_threshold() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let mut all = vec![]; for port in 8080..=8084 { - let s = add(&channels, port); - // 30% failure → below threshold of 50%. - for _ in 0..70 { - s.record_success(); - } - for _ in 0..30 { - s.record_failure(); - } + let s = registry.add_channel(addr(port)); + // 30% failure → below 50% threshold.
+ drive(&registry, &s, 70, 30); + all.push(s); + } + for s in &all { + assert!(!s.is_ejected()); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 0); } #[test] - fn failure_percentage_at_threshold_does_not_eject() { + fn at_threshold_does_not_eject() { // A50 specifies a strict "greater than" comparison. - let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(0)); - let channels = registry(); + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(0)); + let mut all = vec![]; for port in 8080..=8084 { - let s = add(&channels, port); - for _ in 0..50 { - s.record_success(); - } - for _ in 0..50 { - s.record_failure(); - } + let s = registry.add_channel(addr(port)); + drive(&registry, &s, 50, 50); + all.push(s); + } + for s in &all { + assert!(!s.is_ejected()); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 0); } #[test] - fn minimum_hosts_gates_failure_percentage() { - let mut detector = detector_with_rng(fp_config(50, 10, 5), FixedRng::boxed(99)); - let channels = registry(); + fn minimum_hosts_gates_ejection() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 5), FixedRng::boxed(99)); // Only 2 hosts have request_volume ≥ 10; minimum_hosts is 5 ⇒ skip.
+ let mut all = vec![]; for port in 8080..=8081 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_failure(); - } + let s = registry.add_channel(addr(port)); + drive(&registry, &s, 0, 100); + all.push(s); + } + for s in &all { + assert!(!s.is_ejected()); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 0); } #[test] - fn request_volume_filters_low_traffic_endpoints() { - let mut detector = detector_with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); - let channels = registry(); - let bad = add(&channels, 8080); - for _ in 0..5 { - bad.record_failure(); - } + fn request_volume_filters_low_traffic() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); + let bad = registry.add_channel(addr(8080)); + drive(&registry, &bad, 0, 5); for port in 8081..=8084 { - let s = add(&channels, port); - for _ in 0..200 { - s.record_success(); - } + let s = registry.add_channel(addr(port)); + drive(&registry, &s, 200, 0); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 0); + assert!(!bad.is_ejected()); } #[test] @@ -509,303 +397,184 @@ mod tests { .as_mut() .unwrap() .enforcing_failure_percentage = pct(0); - let mut detector = detector_with_rng(config, FixedRng::boxed(0)); - let channels = registry(); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(0)); + let mut all = vec![]; for port in 8080..=8084 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_failure(); - } + let s = registry.add_channel(addr(port)); + drive(&registry, &s, 0, 100); + all.push(s); + } + for s in &all { + assert!(!s.is_ejected()); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 0); } - // ----- Ejection multiplier / un-ejection ----- - #[test] - fn unejects_after_base_time() { + fn max_ejection_percent_caps_concurrent_ejections() { let mut config = fp_config(50, 10, 3); - config.base_ejection_time =
Duration::from_secs(10); - config.max_ejection_time = Duration::from_secs(60); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); - let channels = registry(); + config.max_ejection_percent = pct(20); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let mut all = vec![]; for port in 8080..=8084 { - let s = add(&channels, port); - if port == 8084 { - for _ in 0..100 { - s.record_failure(); - } - } else { - for _ in 0..100 { - s.record_success(); - } - } + let s = registry.add_channel(addr(port)); + all.push(s); + } + // Drive all hosts to bad state in parallel pseudo-order. + for s in &all { + drive(&registry, s, 0, 100); } - let t0 = Instant::now(); - detector.run_sweep(t0, &channels); - assert!(ejected(&channels, 8084)); - - // Still ejected just before base_ejection_time elapses. - detector.run_sweep(t0 + Duration::from_secs(9), &channels); - assert!(ejected(&channels, 8084)); - - // Un-eject after `base * multiplier(=1)` = 10s. - detector.run_sweep(t0 + Duration::from_secs(10), &channels); - assert!(!ejected(&channels, 8084)); + let ejected = all.iter().filter(|s| s.is_ejected()).count(); + // 5 hosts × 20% = 1 max ejection. + assert_eq!(ejected, 1); } #[test] - fn re_ejection_doubles_duration() { - // Same-sweep un-eject + re-eject grows the multiplier 1 → 2. - let mut config = fp_config(50, 10, 3); - config.base_ejection_time = Duration::from_secs(10); - config.max_ejection_time = Duration::from_secs(60); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); - let channels = registry(); - - let bad = add(&channels, 8084); - for port in 8080..=8083 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_success(); - } - } - for _ in 0..100 { - bad.record_failure(); - } - - // Sweep 1: eject. Multiplier 0 → 1. - let t0 = Instant::now(); - detector.run_sweep(t0, &channels); - assert!(bad.is_ejected()); - - // Re-record stats so sweep 2 has volume to evaluate.
+ fn remove_channel_decrements_counters() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let mut all = vec![]; for port in 8080..=8083 { - let s = channels.get(&addr(port)).unwrap().value().clone(); - for _ in 0..100 { - s.record_success(); - } + let s = registry.add_channel(addr(port)); + drive(&registry, &s, 100, 0); + all.push(s); } - for _ in 0..100 { - bad.record_failure(); - } - - // Sweep 2 at t0+10: re-ejection refreshes timestamp, multiplier 1 → 2. - detector.run_sweep(t0 + Duration::from_secs(10), &channels); - assert!(bad.is_ejected()); - - // Re-ejection started at t0+10 with multiplier=2 → duration 20s. - detector.run_sweep(t0 + Duration::from_secs(29), &channels); + let bad = registry.add_channel(addr(8084)); + drive(&registry, &bad, 0, 100); assert!(bad.is_ejected()); + assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); + // Each healthy host crossed request_volume; bad too. So + // qualifying_count = 5. + assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 5); - // Un-ejects at the 20s mark (30s after t0).
- detector.run_sweep(t0 + Duration::from_secs(30), &channels); - assert!(!bad.is_ejected()); + registry.remove_channel(&addr(8084)); + assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 0); + assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); } - #[test] - fn ejection_capped_by_max_ejection_time() { - let mut config = fp_config(50, 10, 3); - config.base_ejection_time = Duration::from_secs(10); - config.max_ejection_time = Duration::from_secs(15); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); - let channels = registry(); + // ----- Housekeeping ----- - for port in 8080..=8084 { - add(&channels, port); + #[test] + fn housekeeping_resets_counters_and_qualifying() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + for port in 8080..=8083 { + let s = registry.add_channel(addr(port)); + drive(&registry, &s, 100, 0); } - let t0 = Instant::now(); - // Force multiplier=10 on 8084 directly. We need to drive a - // first sweep to populate `state[8084]`, then fix it up. - detector.run_sweep(t0, &channels); - let alg = detector.state.get_mut(&addr(8084)).unwrap(); - alg.ejection_multiplier = 10; - alg.ejected_at = Some(t0); - channels.get(&addr(8084)).unwrap().value().eject(); + assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); - // base*multiplier = 100s; cap = 15s → un-eject after 16s.
- detector.run_sweep(t0 + Duration::from_secs(16), &channels); - assert!(!ejected(&channels, 8084)); + registry.run_housekeeping(Instant::now()); + assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 0); + for port in 8080..=8083 { + let s = registry.channels.get(&addr(port)).unwrap(); + assert_eq!(s.counters(), (0, 0)); + } } #[test] - fn max_ejection_percent_caps_concurrent_ejections() { + fn housekeeping_unejects_after_base_time() { let mut config = fp_config(50, 10, 3); - config.max_ejection_percent = pct(20); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); - let channels = registry(); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(60); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); - for port in 8080..=8084 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_failure(); - } + let bad = registry.add_channel(addr(8084)); + for port in 8080..=8083 { + let s = registry.add_channel(addr(port)); + drive(&registry, &s, 100, 0); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 1); - } + drive(&registry, &bad, 0, 100); + assert!(bad.is_ejected()); - #[test] - fn already_ejected_re_ejection_does_not_consume_budget() { - // 5 hosts: one already ejected, four newly bad. Cap permits 3 - // concurrently ejected, with 1 already taken — so 2 new - // ejections remain in budget. - let mut config = fp_config(50, 10, 3); - config.max_ejection_percent = pct(60); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); - let channels = registry(); - - // Pre-eject 8080 by driving one sweep with bad stats. - let already_bad = add(&channels, 8080); - for _ in 0..100 { - already_bad.record_failure(); - } - // Use a tiny first sweep to enter ejected state via the algorithm.
- // Need at least minimum_hosts=3 candidates with volume; add three - // healthy hosts with ≥10 requests so the algorithm runs and the - // single bad one is ejected (cap 60% of 4 hosts = 2 → budget 2 → 1 - // new ejection). - for port in 8085..=8087 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_success(); - } - } + // Advance fewer than base_ejection_time ⇒ stays ejected. let t0 = Instant::now(); - detector.run_sweep(t0, &channels); - assert!(already_bad.is_ejected()); - - // Now grow the cluster to 5 hosts (8080 + 8081..=8084) and feed - // bad stats. 8085..=8087 are no longer relevant — drop them. - channels.remove(&addr(8085)); - channels.remove(&addr(8086)); - channels.remove(&addr(8087)); - for port in 8081..=8084 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_failure(); - } - } - for _ in 0..100 { - already_bad.record_failure(); - } + registry.run_housekeeping(t0 + Duration::from_secs(9)); + assert!(bad.is_ejected()); - detector.run_sweep(t0 + Duration::from_secs(2), &channels); - // Cap = 60% of 5 = 3. already_ejected = 1. Budget = 2. Plus - // 8080's re-eject which doesn't consume budget. So 2 NEW - // ejections among 8081..=8084. - let new_ejects = (8081..=8084).filter(|p| ejected(&channels, *p)).count(); - assert_eq!(new_ejects, 2); + // After base_ejection_time × 1 elapsed ⇒ uneject. + registry.run_housekeeping(t0 + Duration::from_secs(20)); + assert!(!bad.is_ejected()); + assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 0); } #[test] - fn multiplier_decrements_on_healthy_interval() { - let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - let channels = registry(); - let s = add(&channels, 8080); - // First sweep populates the alg state. - detector.run_sweep(Instant::now(), &channels); - // Force multiplier to 3 without ejecting. 
- detector - .state - .get_mut(&addr(8080)) - .unwrap() - .ejection_multiplier = 3; - s.record_success(); - detector.run_sweep(Instant::now(), &channels); - assert_eq!( - detector.state.get(&addr(8080)).unwrap().ejection_multiplier, - 2, - ); - } + fn housekeeping_decrements_multiplier_on_healthy_interval() { + let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + // Force multiplier to 3 directly (no traffic, no eject). + s.set_ejection_multiplier(3); - #[test] - fn multiplier_decrements_even_without_traffic() { - // A50: a non-ejected address with multiplier > 0 has its - // multiplier decremented every sweep, regardless of traffic. - let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - let channels = registry(); - add(&channels, 8080); - detector.run_sweep(Instant::now(), &channels); - detector - .state - .get_mut(&addr(8080)) - .unwrap() - .ejection_multiplier = 3; - detector.run_sweep(Instant::now(), &channels); - assert_eq!( - detector.state.get(&addr(8080)).unwrap().ejection_multiplier, - 2, - ); + registry.run_housekeeping(Instant::now()); + assert_eq!(s.ejection_multiplier(), 2); } #[test] - fn alg_state_dropped_when_channel_removed() { - let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - let channels = registry(); - add(&channels, 8080); - add(&channels, 8081); - detector.run_sweep(Instant::now(), &channels); - assert_eq!(detector.state.len(), 2); - - channels.remove(&addr(8080)); - detector.run_sweep(Instant::now(), &channels); - assert_eq!(detector.state.len(), 1); - assert!(detector.state.contains_key(&addr(8081))); - } - - // ----- Spawned actor ----- - - #[tokio::test(start_paused = true)] - async fn spawned_actor_runs_sweeps_on_tick() { + fn housekeeping_caps_ejection_at_max_ejection_time() { let mut config = fp_config(50, 10, 3); - config.interval = Duration::from_millis(100); - let channels = registry(); - - for port in 
8080..=8083 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_success(); - } - } - let bad = add(&channels, 8084); - for _ in 0..100 { - bad.record_failure(); - } - - let _abort = OutlierDetector::spawn_with_rng(config, FixedRng::boxed(99), channels.clone()); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(15); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); - // Advance past the first sweep tick. The yield gives the - // spawned actor a turn to run after time advances. - tokio::time::advance(Duration::from_millis(150)).await; - tokio::task::yield_now().await; - tokio::task::yield_now().await; + let s = registry.add_channel(addr(8080)); + // Pretend 8080 was ejected long ago with a huge multiplier. + s.try_eject(Instant::now()); + s.set_ejection_multiplier(10); + registry.ejected_count.fetch_add(0, Ordering::Relaxed); // try_eject already added 1 - assert!(bad.is_ejected()); - for port in 8080..=8083 { - assert!(!ejected(&channels, port)); - } + // base * multiplier = 100s, but cap = 15s. Sweep at 16s ⇒ uneject. + let t0 = Instant::now(); + registry.run_housekeeping(t0 + Duration::from_secs(16)); + assert!(!s.is_ejected()); } + // ----- Spawned actor ----- + // + // The actor's algorithmic behavior is fully exercised by the + // synchronous `housekeeping_*` tests above; here we only verify + // that dropping the `AbortOnDrop` handle reliably stops the task. 
+ #[tokio::test(start_paused = true)] async fn dropping_abort_stops_actor() { let mut config = base_config(); config.interval = Duration::from_millis(50); - let channels = registry(); - let bad = add(&channels, 8080); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + s.set_ejection_multiplier(5); - let abort = OutlierDetector::spawn(config, channels.clone()); + let abort = spawn_actor(registry.clone()); drop(abort); - // Even after several tick periods, no sweep should have run - // because the task was aborted. + // Even with several tick periods elapsed, no housekeeping + // should have run because the task was aborted. tokio::time::advance(Duration::from_millis(500)).await; + tokio::task::yield_now().await; - // The bad channel had no traffic recorded, so neither side - // would eject — but verify nothing happened to the signal. - assert!(!bad.is_ejected()); + assert_eq!(s.ejection_multiplier(), 5); + } + + // ----- OutlierChannelState sanity (kept in this file as it is the + // primary consumer of the type) ----- + + #[test] + fn channel_state_records_and_resets() { + let s = OutlierChannelState::new(); + s.record_success(); + s.record_success(); + s.record_failure(); + assert_eq!(s.snapshot_and_reset(), (2, 1)); + assert_eq!(s.snapshot_and_reset(), (0, 0)); + } + + #[test] + fn channel_state_try_eject_uneject_flips_signal() { + let s = OutlierChannelState::new(); + assert!(!s.is_ejected()); + assert!(s.try_eject(Instant::now())); + assert!(s.is_ejected()); + // Second call is a no-op. 
+ assert!(!s.try_eject(Instant::now())); + assert!(s.try_uneject()); + assert!(!s.is_ejected()); + assert!(!s.try_uneject()); } } From 47944fda6b0ea158317025b97511c06570f81a00 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 11 May 2026 10:38:06 -0700 Subject: [PATCH 21/39] refactor(tonic-xds): lift outlier state out of Connecting/EjectedChannel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Outlier-detection state belongs to `ReadyChannel` — the only state machine variant that serves traffic. `ConnectingChannel` is just a connect future and `EjectedChannel` is just a cooldown timer; neither reads or writes counters or the ejection signal, so neither should carry the `Arc`. Changes: - `ConnectingChannel::Output` is now bare `S` (was `ReadyChannel`). The captured async block no longer holds an outlier state; the address is kept by the caller (typically as the key in `KeyedFutures`). - `EjectedChannel` drops its `outlier` field. `UnejectedChannel:: Ready(S)` now carries a bare service; the consumer re-attaches the registry-supplied outlier state when wrapping it back into a `ReadyChannel`. - `ReadyChannel` gains an explicit `new(addr, inner, outlier)` constructor so the outlier state is required at construction time. - `ReadyChannel::eject` and `ReadyChannel::reconnect` drop the outlier reference — it lives in the registry, keyed by address, and survives the cycle. - `LoadBalancer::connecting` is now `KeyedFutures` (was over `ReadyChannel`). `poll_connecting` wraps the resolved service into a `ReadyChannel` with a fresh `OutlierChannelState`; the integration PR replaces the fresh state with one supplied by the `OutlierStatsRegistry`. Tests in `channel_state.rs` use a small `wrap_ready` helper to build `ReadyChannel` instances from the bare services returned by `IdleChannel::connect()`. 
--- .../src/client/loadbalance/channel_state.rs | 130 ++++++++++-------- .../src/client/loadbalance/loadbalancer.rs | 17 ++- 2 files changed, 84 insertions(+), 63 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 7885705a1..b87414bc1 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -259,8 +259,11 @@ pub(crate) struct EjectionConfig { /// Result of an ejection expiring. pub(crate) enum UnejectedChannel { - /// The channel is ready to serve again (ejection expired, no reconnect needed). - Ready(ReadyChannel), + /// The channel is ready to serve again (ejection expired, no + /// reconnect needed). The consumer wraps the bare service into a + /// [`ReadyChannel`] using the registry-supplied + /// [`OutlierChannelState`]. + Ready(S), /// A fresh connection has been started. Connecting(ConnectingChannel), } @@ -295,44 +298,31 @@ impl IdleChannel { /// A channel that is in the process of connecting. /// -/// Implements [`Future`] -- resolves to [`ReadyChannel`] when connected. +/// Implements [`Future`] -- resolves to the connected service `S` +/// when the connection completes. The consumer wraps that into a +/// [`ReadyChannel`] (attaching its [`OutlierChannelState`]). /// Cancellation is handled externally via [`KeyedFutures::cancel`]. /// +/// `ConnectingChannel` deliberately does not carry an +/// [`OutlierChannelState`]: it does not serve traffic, so it has +/// nothing to count or signal. +/// /// [`KeyedFutures::cancel`]: crate::client::loadbalance::keyed_futures::KeyedFutures::cancel pub(crate) struct ConnectingChannel { - inner: Pin> + Send>>, + inner: Pin + Send>>, } impl ConnectingChannel { - /// Start a connection, generating a fresh per-channel outlier - /// state. Used for first-time connects from `IdleChannel`. 
- pub(crate) fn new(fut: BoxFuture, addr: EndpointAddress) -> Self { - Self::with_outlier(fut, addr, Arc::new(OutlierChannelState::new())) - } - - /// Start a connection that inherits an existing - /// [`OutlierChannelState`]. Used by reconnect paths so the - /// per-channel counters and ejection signal survive across the - /// connection cycle. - pub(crate) fn with_outlier( - fut: BoxFuture, - addr: EndpointAddress, - outlier: Arc, - ) -> Self { - Self { - inner: Box::pin(async move { - ReadyChannel { - addr, - inner: fut.await, - outlier, - } - }), - } + /// Start a connection. The address is kept by the caller (it is + /// typically the key in a `KeyedFutures` map); only the future is + /// stored here. + pub(crate) fn new(fut: BoxFuture, _addr: EndpointAddress) -> Self { + Self { inner: fut } } } impl Future for ConnectingChannel { - type Output = ReadyChannel; + type Output = S; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { self.get_mut().inner.as_mut().poll(cx) @@ -348,7 +338,9 @@ impl Future for ConnectingChannel { /// Holds the raw service `S` and delegates [`Service`] calls directly, /// preserving `S::Future` and `S::Error` with no wrapping or type /// erasure. The `Arc` is shared with the outlier- -/// detection actor for stats accumulation and edge-triggered ejection. +/// detection actor for stats accumulation and edge-triggered ejection; +/// because only `ReadyChannel` serves traffic, only `ReadyChannel` +/// carries this state. #[derive(Clone)] pub(crate) struct ReadyChannel { addr: EndpointAddress, @@ -357,13 +349,26 @@ pub(crate) struct ReadyChannel { } impl ReadyChannel { + /// Wrap a connected service `S` into a [`ReadyChannel`] using the + /// caller-supplied outlier state. + pub(crate) fn new(addr: EndpointAddress, inner: S, outlier: Arc) -> Self { + Self { + addr, + inner, + outlier, + } + } + /// Per-channel outlier-detection state. Cloned cheaply via `Arc`. 
#[allow(dead_code)] // consumed by the LoadBalancer in a follow-up PR. pub(crate) fn outlier(&self) -> &Arc { &self.outlier } - /// Eject this channel (e.g., due to outlier detection). Consumes self. + /// Eject this channel (e.g., due to outlier detection). Consumes + /// self. The outlier state remains in the registry; only the + /// service and address are passed into [`EjectedChannel`] (which + /// just times the cooldown). pub(crate) fn eject(self, config: EjectionConfig, connector: Arc) -> EjectedChannel where C: Connector + Send + Sync + 'static, @@ -372,15 +377,15 @@ impl ReadyChannel { EjectedChannel { addr: self.addr, inner: self.inner, - outlier: self.outlier, config, connector, ejection_timer, } } - /// Start reconnecting. Consumes self, dropping the old connection - /// but preserving the outlier-detection state. + /// Start reconnecting. Consumes self, dropping the old connection. + /// The outlier state remains in the registry; the consumer + /// re-attaches it when the new [`ReadyChannel`] is constructed. pub(crate) fn reconnect>( self, connector: Arc, @@ -388,7 +393,7 @@ impl ReadyChannel { where S: Send + 'static, { - ConnectingChannel::with_outlier(connector.connect(&self.addr), self.addr, self.outlier) + ConnectingChannel::new(connector.connect(&self.addr), self.addr) } } @@ -424,14 +429,19 @@ impl Load for ReadyChannel { pin_project! { /// A channel that has been ejected and is cooling down. /// - /// The underlying connection is kept alive but cannot serve requests. - /// Implements [`Future`] -- resolves once the ejection timer expires to either: + /// The underlying connection is kept alive but cannot serve + /// requests. 
Implements [`Future`] -- resolves once the ejection + /// timer expires to either: /// - [`UnejectedChannel::Ready`] if no reconnect is needed /// - [`UnejectedChannel::Connecting`] if a fresh connection is required + /// + /// `EjectedChannel` deliberately does not carry an + /// [`OutlierChannelState`]: the state lives in the registry, keyed + /// by address, and the consumer re-attaches it when the channel + /// transitions back to [`ReadyChannel`]. pub(crate) struct EjectedChannel { addr: EndpointAddress, inner: S, - outlier: Arc, config: EjectionConfig, connector: Arc + Send + Sync>, #[pin] @@ -448,19 +458,12 @@ impl Future for EjectedChannel { Poll::Ready(()) => { if this.config.needs_reconnect { let fut = this.connector.connect(this.addr); - Poll::Ready(UnejectedChannel::Connecting( - ConnectingChannel::with_outlier( - fut, - this.addr.clone(), - this.outlier.clone(), - ), - )) + Poll::Ready(UnejectedChannel::Connecting(ConnectingChannel::new( + fut, + this.addr.clone(), + ))) } else { - Poll::Ready(UnejectedChannel::Ready(ReadyChannel { - addr: this.addr.clone(), - inner: this.inner.clone(), - outlier: this.outlier.clone(), - })) + Poll::Ready(UnejectedChannel::Ready(this.inner.clone())) } } Poll::Pending => Poll::Pending, @@ -529,17 +532,23 @@ mod tests { assert_eq!(connector.connect_count.load(Ordering::SeqCst), 1); } + fn wrap_ready(addr: EndpointAddress, svc: MockService) -> ReadyChannel { + ReadyChannel::new(addr, svc, Arc::new(OutlierChannelState::new())) + } + #[tokio::test] - async fn test_connecting_future_yields_ready() { + async fn test_connecting_future_yields_service() { let connector = MockConnector::new(); - let ready = IdleChannel::new(test_addr()).connect(connector).await; - assert_eq!(ready.addr, test_addr()); + let svc: MockService = IdleChannel::new(test_addr()).connect(connector).await; + // The bare service is what `ConnectingChannel` resolves to. 
+ let _ready = wrap_ready(test_addr(), svc); } #[tokio::test] async fn test_ready_service_delegates() { let connector = MockConnector::new(); - let mut ready = IdleChannel::new(test_addr()).connect(connector).await; + let svc = IdleChannel::new(test_addr()).connect(connector).await; + let mut ready = wrap_ready(test_addr(), svc); let resp: &str = ready.call("hello").await.unwrap(); assert_eq!(resp, "ok"); } @@ -547,9 +556,10 @@ mod tests { #[tokio::test] async fn test_ready_to_connecting_via_reconnect() { let connector = MockConnector::new(); - let ready = IdleChannel::new(test_addr()) + let svc = IdleChannel::new(test_addr()) .connect(connector.clone()) .await; + let ready = wrap_ready(test_addr(), svc); let _reconnecting = ready.reconnect(connector.clone()); assert_eq!(connector.connect_count.load(Ordering::SeqCst), 2); } @@ -562,7 +572,7 @@ mod tests { let connecting = ConnectingChannel::new(Box::pin(async move { rx.await.unwrap() }), test_addr()); - let mut set: KeyedFutures> = KeyedFutures::new(); + let mut set: KeyedFutures = KeyedFutures::new(); set.add(test_addr(), connecting).unwrap(); assert!(matches!(set.poll_next(&mut noop_cx()), Poll::Pending)); @@ -580,7 +590,7 @@ mod tests { let connecting = ConnectingChannel::new(Box::pin(future::pending::()), test_addr()); - let mut set: KeyedFutures> = KeyedFutures::new(); + let mut set: KeyedFutures = KeyedFutures::new(); set.add(test_addr(), connecting).unwrap(); assert!(matches!(set.poll_next(&mut noop_cx()), Poll::Pending)); @@ -592,9 +602,10 @@ mod tests { #[tokio::test(start_paused = true)] async fn test_ejected_in_keyed_futures_ready() { let connector = MockConnector::new(); - let ready = IdleChannel::new(test_addr()) + let svc = IdleChannel::new(test_addr()) .connect(connector.clone()) .await; + let ready = wrap_ready(test_addr(), svc); let ejected = ready.eject( EjectionConfig { timeout: Duration::from_secs(5), @@ -617,9 +628,10 @@ mod tests { #[tokio::test(start_paused = true)] async fn 
test_ejected_in_keyed_futures_needs_reconnect() { let connector = MockConnector::new(); - let ready = IdleChannel::new(test_addr()) + let svc = IdleChannel::new(test_addr()) .connect(connector.clone()) .await; + let ready = wrap_ready(test_addr(), svc); let ejected = ready.eject( EjectionConfig { timeout: Duration::from_secs(5), diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 3a1a0171f..61bc6681a 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -14,7 +14,7 @@ use tower::Service; use tower::discover::{Change, Discover}; use crate::client::endpoint::{Connector, EndpointAddress}; -use crate::client::loadbalance::channel_state::{IdleChannel, ReadyChannel}; +use crate::client::loadbalance::channel_state::{IdleChannel, OutlierChannelState, ReadyChannel}; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; use crate::client::loadbalance::pickers::ChannelPicker; @@ -58,7 +58,10 @@ pub(crate) struct LoadBalancer { /// Connector for creating connections from idle channels. connector: Arc, /// In-flight connection attempts, keyed by endpoint address. - connecting: KeyedFutures>, + /// `ConnectingChannel` resolves to the bare service; the LB wraps + /// it into a `ReadyChannel` with an outlier state when it + /// transitions to ready. + connecting: KeyedFutures, /// Ready-to-serve channels, keyed by endpoint address. ready: IndexMap>, /// Channel picker for load balancing. @@ -117,9 +120,15 @@ where } } - /// Drain completed connection futures into the ready set. + /// Drain completed connection futures into the ready set. Wraps + /// the bare service into a `ReadyChannel` with a fresh + /// `OutlierChannelState`. The outlier-detection PR will replace + /// the fresh state with one looked up from the + /// `OutlierStatsRegistry`. 
fn poll_connecting(&mut self, cx: &mut Context<'_>) { - while let Poll::Ready(Some((addr, ready))) = self.connecting.poll_next(cx) { + while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { + let outlier = Arc::new(OutlierChannelState::new()); + let ready = ReadyChannel::new(addr.clone(), svc, outlier); self.ready.insert(addr, ready); } } From 5e835e76ba7e7ff4b50acca4aac54b81412640b5 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 11 May 2026 11:05:33 -0700 Subject: [PATCH 22/39] feat(tonic-xds): integrate outlier detection with LoadBalancer Wire the outlier-detection registry into `LoadBalancer` end-to-end: - `LoadBalancer::with_outlier(discovery, connector, picker, Some(registry))` constructs an LB that participates in outlier detection. The plain `new(...)` constructor is a thin alias that passes `None` (no outlier detection); existing tests are unchanged. - At construction, the housekeeping actor is spawned via `spawn_actor(registry)`; the returned `AbortOnDrop` is stored on the LB so the actor stops when the LB is dropped. - `poll_discover` now also unhooks the registry entry, the ejection signal stream, and any ejected slot when an address is removed or re-inserted. - `poll_connecting` registers the new channel with the registry (`registry.add_channel(addr)`), subscribes to its ejection signal via `WatchStream::from_changes`, and inserts the stream into a `StreamMap<EndpointAddress, WatchStream<bool>>`. - A new `poll_ejection_signals` step in `poll_ready` drains the `StreamMap` in amortized O(1) per transition, moving channels between `ready: IndexMap` and a new `ejected: HashMap`. The picker continues to see only `ready`, so ejected endpoints are automatically excluded from selection. - `call` clones the picked channel's `OutlierChannelState` and, after the inner call completes, invokes `registry.record_outcome(state, result.is_ok())`. Per-RPC detection runs inline; the LB's critical path stays O(1) in the number of endpoints. 
Other changes: - `OutlierStatsRegistry::add_channel` is idempotent: re-inserting an existing address returns the existing state so reconnect cycles preserve counters and ejection bookkeeping. - Cargo: `tokio-stream` gains the `sync` feature to expose `WatchStream`. Three integration tests cover: a failing endpoint gets ejected and removed from `ready`; a healthy cluster sees no ejections; endpoint removal cleans up the registry. --- tonic-xds/Cargo.toml | 2 +- .../src/client/loadbalance/loadbalancer.rs | 269 ++++++++++++++++-- .../client/loadbalance/outlier_detection.rs | 13 +- 3 files changed, 261 insertions(+), 23 deletions(-) diff --git a/tonic-xds/Cargo.toml b/tonic-xds/Cargo.toml index 8d94f3342..84e1246e0 100644 --- a/tonic-xds/Cargo.toml +++ b/tonic-xds/Cargo.toml @@ -45,7 +45,7 @@ tokio = { version = "1", features = ["sync", "time"] } fastrand = "2" indexmap = "2" tracing = "0.1" -tokio-stream = "0.1" +tokio-stream = { version = "0.1", features = ["sync"] } tokio-util = "0.7" backoff = "0.4" shared_http_body = "0.1" diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 61bc6681a..d11c57f52 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -3,13 +3,23 @@ //! Receives endpoint updates via [`tower::discover::Discover`] (yielding //! [`IdleChannel`]s), manages the connection lifecycle via the channel state //! machine, and routes requests to ready endpoints via a [`ChannelPicker`]. - +//! +//! Outlier detection is integrated via an optional +//! [`OutlierStatsRegistry`]: ejection decisions are made on the data +//! path (per-RPC) and surfaced to `poll_ready` via per-channel +//! `watch::Receiver` streams aggregated in a `StreamMap`. The +//! LB then moves the corresponding [`ReadyChannel`] between its ready +//! and ejected maps in O(1) per transition. 
+ +use std::collections::HashMap; use std::future::Future; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll, ready}; use indexmap::IndexMap; +use tokio_stream::StreamMap; +use tokio_stream::wrappers::WatchStream; use tower::Service; use tower::discover::{Change, Discover}; @@ -17,7 +27,9 @@ use crate::client::endpoint::{Connector, EndpointAddress}; use crate::client::loadbalance::channel_state::{IdleChannel, OutlierChannelState, ReadyChannel}; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; +use crate::client::loadbalance::outlier_detection::{OutlierStatsRegistry, spawn_actor}; use crate::client::loadbalance::pickers::ChannelPicker; +use crate::common::async_util::AbortOnDrop; /// Future returned by [`LoadBalancer::call`]. /// @@ -64,6 +76,20 @@ pub(crate) struct LoadBalancer { connecting: KeyedFutures, /// Ready-to-serve channels, keyed by endpoint address. ready: IndexMap>, + /// Channels currently ejected by outlier detection. Their + /// underlying connections are kept alive so traffic can resume + /// without reconnecting after un-ejection. + ejected: HashMap>, + /// Per-channel ejection signal streams, aggregated for O(1) + /// observation in `poll_ready`. Present only when outlier + /// detection is enabled. + ejection_signals: StreamMap>, + /// Outlier-detection registry, shared with the spawned actor and + /// the data path. `None` disables outlier detection. + outlier: Option>, + /// Handle to the outlier-detection actor task; dropped when the + /// LB is dropped. + _outlier_actor: Option, /// Channel picker for load balancing. picker: Arc, Req> + Send + Sync>, } @@ -75,21 +101,51 @@ where C: Connector + Send + Sync + 'static, C::Service: Send + 'static, { - /// Create a new load balancer with the given picker. + /// Create a load balancer with no outlier detection. 
pub(crate) fn new( discovery: D, connector: Arc, picker: Arc, Req> + Send + Sync>, ) -> Self { + Self::with_outlier(discovery, connector, picker, None) + } + + /// Create a load balancer, optionally enabling outlier detection. + /// When `outlier` is `Some`, the registry's housekeeping actor is + /// spawned and its lifetime is bound to the load balancer. + pub(crate) fn with_outlier( + discovery: D, + connector: Arc, + picker: Arc, Req> + Send + Sync>, + outlier: Option>, + ) -> Self { + let _outlier_actor = outlier.as_ref().map(|reg| spawn_actor(reg.clone())); Self { discovery, connector, connecting: KeyedFutures::new(), ready: IndexMap::new(), + ejected: HashMap::new(), + ejection_signals: StreamMap::new(), + outlier, + _outlier_actor, picker, } } + /// Forget all per-endpoint state for `addr`: the connecting + /// future, the ready slot, the ejected slot, the ejection signal + /// stream, and the registry entry. + fn forget_endpoint(&mut self, addr: &EndpointAddress) { + let _ = self.connecting.cancel(addr); + self.ready.swap_remove(addr); + self.ejected.remove(addr); + self.ejection_signals.remove(addr); + if let Some(registry) = self.outlier.as_ref() { + registry.remove_channel(addr); + } + } + /// Drain pending discovery events. 
Either resolves to an error /// ([`LbError::DiscoverClosed`] or [`LbError::DiscoverError`]) or stays /// pending — there is no success outcome since the loop only exits on @@ -106,32 +162,56 @@ where Some(Err(e)) => return Poll::Ready(LbError::DiscoverError(e.into())), Some(Ok(Change::Insert(addr, idle))) => { tracing::trace!("discovery: insert {addr}"); - let _ = self.connecting.cancel(&addr); - self.ready.swap_remove(&addr); + self.forget_endpoint(&addr); let connecting = idle.connect(self.connector.clone()); let _ = self.connecting.add(addr, connecting); } Some(Ok(Change::Remove(addr))) => { tracing::trace!("discovery: remove {addr}"); - let _ = self.connecting.cancel(&addr); - self.ready.swap_remove(&addr); + self.forget_endpoint(&addr); } } } } /// Drain completed connection futures into the ready set. Wraps - /// the bare service into a `ReadyChannel` with a fresh - /// `OutlierChannelState`. The outlier-detection PR will replace - /// the fresh state with one looked up from the - /// `OutlierStatsRegistry`. + /// each bare service into a `ReadyChannel` using the outlier + /// state from the registry (or a fresh state if outlier detection + /// is disabled), and subscribes to the per-channel ejection + /// signal. fn poll_connecting(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { - let outlier = Arc::new(OutlierChannelState::new()); - let ready = ReadyChannel::new(addr.clone(), svc, outlier); + let state = match self.outlier.as_ref() { + Some(registry) => registry.add_channel(addr.clone()), + None => Arc::new(OutlierChannelState::new()), + }; + if self.outlier.is_some() { + self.ejection_signals + .insert(addr.clone(), WatchStream::from_changes(state.subscribe())); + } + let ready = ReadyChannel::new(addr.clone(), svc, state); self.ready.insert(addr, ready); } } + + /// Drain ejection-signal transitions, moving channels between + /// `ready` and `ejected`. O(k) per call where k = ready signals. 
+ fn poll_ejection_signals(&mut self, cx: &mut Context<'_>) { + use futures_core::Stream; + while let Poll::Ready(Some((addr, ejected))) = + Pin::new(&mut self.ejection_signals).poll_next(cx) + { + if ejected { + if let Some(ch) = self.ready.swap_remove(&addr) { + tracing::debug!("outlier detection: eject {addr}"); + self.ejected.insert(addr, ch); + } + } else if let Some(ch) = self.ejected.remove(&addr) { + tracing::debug!("outlier detection: uneject {addr}"); + self.ready.insert(addr, ch); + } + } + } } impl Service for LoadBalancer @@ -152,6 +232,7 @@ where fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { let discover_result = self.poll_discover(cx); self.poll_connecting(cx); + self.poll_ejection_signals(cx); if !self.ready.is_empty() { return Poll::Ready(Ok(())); @@ -183,16 +264,26 @@ where let Some(picked) = self.picker.pick(&req, &self.ready) else { return LbFuture::Error(Some(LbError::Unavailable)); }; - // `picked` is a read-only borrow into `self.ready`. Clone to get an - // owned service we can drive in the async block. + // `picked` is a read-only borrow into `self.ready`. Clone to get + // an owned service and outlier handle for the async block; both + // are `Arc`-shared, so cloning is cheap. let mut svc = picked.clone(); + let outlier_state = picked.outlier().clone(); + let registry = self.outlier.clone(); LbFuture::Pending(Box::pin(async move { tower::ServiceExt::ready(&mut svc) .await .map_err(|e| LbError::LbChannelPollReadyError(e.into()))?; - svc.call(req) - .await - .map_err(|e| LbError::LbChannelCallError(e.into())) + let result = svc.call(req).await; + if let Some(registry) = registry.as_ref() { + // Per-RPC outlier detection: bump the channel's + // counter and (inside `record_outcome`) possibly + // eject if the failure-percentage threshold is + // crossed. Treat any `Err` outcome as a failure for + // outlier purposes. 
+ registry.record_outcome(&outlier_state, result.is_ok()); + } + result.map_err(|e| LbError::LbChannelCallError(e.into())) })) } } @@ -667,4 +758,148 @@ mod tests { "expected LbChannelCallError, got {result:?}" ); } + + // -- Outlier-detection integration tests -- + + use crate::client::loadbalance::outlier_detection::{OutlierStatsRegistry, Rng}; + use crate::xds::resource::outlier_detection::{ + FailurePercentageConfig, OutlierDetectionConfig, Percentage, + }; + use std::time::Duration; + + fn pct(v: u32) -> Percentage { + Percentage::new(v).unwrap() + } + + struct AlwaysFireRng; + impl Rng for AlwaysFireRng { + fn pct_roll(&self) -> u32 { + 0 + } + } + + fn fp_config( + threshold: u32, + request_volume: u32, + minimum_hosts: u32, + ) -> OutlierDetectionConfig { + OutlierDetectionConfig { + interval: Duration::from_secs(60), + base_ejection_time: Duration::from_secs(30), + max_ejection_time: Duration::from_secs(300), + max_ejection_percent: pct(100), + success_rate: None, + failure_percentage: Some(FailurePercentageConfig { + threshold: pct(threshold), + enforcing_failure_percentage: pct(100), + minimum_hosts, + request_volume, + }), + } + } + + /// Build an LB with outlier detection enabled. + fn make_lb_with_outlier( + discover: MockDiscover, + config: OutlierDetectionConfig, + ) -> (Lb, Arc, Arc) { + let connector = Arc::new(MockConnector::new()); + let picker: Arc, &'static str> + Send + Sync> = + Arc::new(P2cPicker); + let registry = OutlierStatsRegistry::with_rng(config, Box::new(AlwaysFireRng)); + let lb = + LoadBalancer::with_outlier(discover, connector.clone(), picker, Some(registry.clone())); + (lb, connector, registry) + } + + /// Drive the LB through one call per port. Asserts each succeeds. + async fn call_each(lb: &mut Lb, n: usize) { + for _ in 0..n { + lb.call("hello").await.unwrap(); + } + } + + #[tokio::test] + async fn test_outlier_detection_ejects_failing_endpoint() { + // 5 endpoints, all healthy except 8084. 
Once 8084's failures + // cross the threshold, it should be moved out of `ready` and + // into `ejected`. + let (tx, discover) = new_discover(); + let (mut lb, connector, registry) = make_lb_with_outlier( + discover, + fp_config( + /*threshold*/ 50, /*request_volume*/ 5, /*minimum_hosts*/ 3, + ), + ); + + for port in 8080..=8084 { + tx.send(Ok(Change::Insert(addr(port), IdleChannel::new(addr(port))))) + .await + .unwrap(); + } + drive_to_ready(&mut lb, &connector).await; + assert_eq!(lb.ready.len(), 5); + + // Configure 8084 to always fail. Other endpoints stay healthy. + connector + .service(&addr(8084)) + .fail_call + .store(true, Ordering::Relaxed); + + // Drive enough calls to ensure 8084 reaches request_volume + // and its failure rate triggers ejection. With 5 endpoints + // and P2C picking, each gets ~k/5 calls; drive 100 to be safe. + for _ in 0..100 { + let _ = lb.call("hello").await; + } + + // poll_ready drains the ejection signal and moves 8084. + let _ = poll_ready_now(&mut lb); + assert!( + lb.ejected.contains_key(&addr(8084)), + "8084 should be ejected; ejected map: {:?}, ready keys: {:?}", + lb.ejected.keys().collect::>(), + lb.ready.keys().collect::>(), + ); + assert!(!lb.ready.contains_key(&addr(8084))); + // The registry's `ejected_count` should reflect the same. 
+ assert!(registry.len() == 5); + } + + #[tokio::test] + async fn test_outlier_detection_healthy_cluster_no_ejections() { + let (tx, discover) = new_discover(); + let (mut lb, connector, _registry) = make_lb_with_outlier(discover, fp_config(50, 5, 3)); + + for port in 8080..=8084 { + tx.send(Ok(Change::Insert(addr(port), IdleChannel::new(addr(port))))) + .await + .unwrap(); + } + drive_to_ready(&mut lb, &connector).await; + assert_eq!(lb.ready.len(), 5); + + call_each(&mut lb, 50).await; + + let _ = poll_ready_now(&mut lb); + assert_eq!(lb.ejected.len(), 0); + assert_eq!(lb.ready.len(), 5); + } + + #[tokio::test] + async fn test_outlier_detection_endpoint_removal_cleans_registry() { + let (tx, discover) = new_discover(); + let (mut lb, connector, registry) = make_lb_with_outlier(discover, fp_config(50, 5, 3)); + + tx.send(Ok(Change::Insert(addr(8080), IdleChannel::new(addr(8080))))) + .await + .unwrap(); + drive_to_ready(&mut lb, &connector).await; + assert_eq!(registry.len(), 1); + + tx.send(Ok(Change::Remove(addr(8080)))).await.unwrap(); + let _ = poll_ready_now(&mut lb); + assert_eq!(registry.len(), 0); + assert_eq!(lb.ready.len(), 0); + } } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 5295a09c7..1cbfb6233 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -92,13 +92,16 @@ impl OutlierStatsRegistry { }) } - /// Register a new channel. Returns the `Arc` + /// Register a channel and return the `Arc` /// the load balancer wires into the channel; the same `Arc` is - /// retained in the registry so the actor can iterate it. + /// retained in the registry so the actor can iterate it. If a + /// state for this address already exists, returns it untouched — + /// state continuity across reconnect cycles is preserved. 
pub(crate) fn add_channel(&self, addr: EndpointAddress) -> Arc { - let state = Arc::new(OutlierChannelState::new()); - self.channels.insert(addr, state.clone()); - state + self.channels + .entry(addr) + .or_insert_with(|| Arc::new(OutlierChannelState::new())) + .clone() } /// Forget a channel. Drops the registry's reference; cluster-wide From b72040d3b023020a55cdbd8772643ba8e80d30a2 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 11 May 2026 14:19:07 -0700 Subject: [PATCH 23/39] refactor(tonic-xds): bundle outlier LB state into OutlierDetector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The four outlier-related fields on `LoadBalancer` (registry, ejected pool, ejection-signal streams, actor handle) always lived in lockstep — either all four were present (outlier detection enabled) or all four were absent. Bundle them into a single `OutlierDetector` struct stored as `Option>` so the type system enforces the invariant and the LB methods that touch outlier state become one-line delegations. --- .../src/client/loadbalance/loadbalancer.rs | 97 ++++++------------- .../client/loadbalance/outlier_detection.rs | 89 ++++++++++++++++- 2 files changed, 120 insertions(+), 66 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index d11c57f52..90effb476 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -5,21 +5,21 @@ //! machine, and routes requests to ready endpoints via a [`ChannelPicker`]. //! //! Outlier detection is integrated via an optional -//! [`OutlierStatsRegistry`]: ejection decisions are made on the data -//! path (per-RPC) and surfaced to `poll_ready` via per-channel -//! `watch::Receiver` streams aggregated in a `StreamMap`. The -//! LB then moves the corresponding [`ReadyChannel`] between its ready -//! 
and ejected maps in O(1) per transition. +//! [`OutlierDetector`], which bundles the shared +//! [`OutlierStatsRegistry`], the ejected-channel pool, the per-channel +//! ejection-signal streams, and the housekeeping actor handle. +//! Ejection decisions are made on the data path (per-RPC) and surfaced +//! to `poll_ready` via per-channel `watch::Receiver` streams +//! aggregated in a `StreamMap`. The LB then moves the corresponding +//! [`ReadyChannel`] between its `ready` map and the detector's ejected +//! pool in O(1) per transition. -use std::collections::HashMap; use std::future::Future; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll, ready}; use indexmap::IndexMap; -use tokio_stream::StreamMap; -use tokio_stream::wrappers::WatchStream; use tower::Service; use tower::discover::{Change, Discover}; @@ -27,9 +27,8 @@ use crate::client::endpoint::{Connector, EndpointAddress}; use crate::client::loadbalance::channel_state::{IdleChannel, OutlierChannelState, ReadyChannel}; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; -use crate::client::loadbalance::outlier_detection::{OutlierStatsRegistry, spawn_actor}; +use crate::client::loadbalance::outlier_detection::{OutlierDetector, OutlierStatsRegistry}; use crate::client::loadbalance::pickers::ChannelPicker; -use crate::common::async_util::AbortOnDrop; /// Future returned by [`LoadBalancer::call`]. /// @@ -76,20 +75,10 @@ pub(crate) struct LoadBalancer { connecting: KeyedFutures, /// Ready-to-serve channels, keyed by endpoint address. ready: IndexMap>, - /// Channels currently ejected by outlier detection. Their - /// underlying connections are kept alive so traffic can resume - /// without reconnecting after un-ejection. - ejected: HashMap>, - /// Per-channel ejection signal streams, aggregated for O(1) - /// observation in `poll_ready`. Present only when outlier - /// detection is enabled. 
- ejection_signals: StreamMap>, - /// Outlier-detection registry, shared with the spawned actor and - /// the data path. `None` disables outlier detection. - outlier: Option>, - /// Handle to the outlier-detection actor task; dropped when the - /// LB is dropped. - _outlier_actor: Option, + /// All per-LB outlier-detection state — the shared registry, the + /// ejected pool, the ejection-signal streams, and the + /// housekeeping actor handle. `None` disables outlier detection. + outlier: Option>, /// Channel picker for load balancing. picker: Arc, Req> + Send + Sync>, } @@ -119,30 +108,23 @@ where picker: Arc, Req> + Send + Sync>, outlier: Option>, ) -> Self { - let _outlier_actor = outlier.as_ref().map(|reg| spawn_actor(reg.clone())); Self { discovery, connector, connecting: KeyedFutures::new(), ready: IndexMap::new(), - ejected: HashMap::new(), - ejection_signals: StreamMap::new(), - outlier, - _outlier_actor, + outlier: outlier.map(OutlierDetector::new), picker, } } /// Forget all per-endpoint state for `addr`: the connecting - /// future, the ready slot, the ejected slot, the ejection signal - /// stream, and the registry entry. + /// future, the ready slot, and any outlier bookkeeping. fn forget_endpoint(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); - self.ejected.remove(addr); - self.ejection_signals.remove(addr); - if let Some(registry) = self.outlier.as_ref() { - registry.remove_channel(addr); + if let Some(o) = self.outlier.as_mut() { + o.forget(addr); } } @@ -176,40 +158,24 @@ where /// Drain completed connection futures into the ready set. Wraps /// each bare service into a `ReadyChannel` using the outlier - /// state from the registry (or a fresh state if outlier detection - /// is disabled), and subscribes to the per-channel ejection - /// signal. + /// state from the detector (or a fresh state if outlier detection + /// is disabled). 
fn poll_connecting(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { - let state = match self.outlier.as_ref() { - Some(registry) => registry.add_channel(addr.clone()), + let state = match self.outlier.as_mut() { + Some(o) => o.register(addr.clone()), None => Arc::new(OutlierChannelState::new()), }; - if self.outlier.is_some() { - self.ejection_signals - .insert(addr.clone(), WatchStream::from_changes(state.subscribe())); - } let ready = ReadyChannel::new(addr.clone(), svc, state); self.ready.insert(addr, ready); } } - /// Drain ejection-signal transitions, moving channels between - /// `ready` and `ejected`. O(k) per call where k = ready signals. - fn poll_ejection_signals(&mut self, cx: &mut Context<'_>) { - use futures_core::Stream; - while let Poll::Ready(Some((addr, ejected))) = - Pin::new(&mut self.ejection_signals).poll_next(cx) - { - if ejected { - if let Some(ch) = self.ready.swap_remove(&addr) { - tracing::debug!("outlier detection: eject {addr}"); - self.ejected.insert(addr, ch); - } - } else if let Some(ch) = self.ejected.remove(&addr) { - tracing::debug!("outlier detection: uneject {addr}"); - self.ready.insert(addr, ch); - } + /// Drain outlier ejection-signal transitions, moving channels + /// between `ready` and the detector's ejected pool. + fn poll_outlier(&mut self, cx: &mut Context<'_>) { + if let Some(o) = self.outlier.as_mut() { + o.poll_signals(cx, &mut self.ready); } } } @@ -232,7 +198,7 @@ where fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { let discover_result = self.poll_discover(cx); self.poll_connecting(cx); - self.poll_ejection_signals(cx); + self.poll_outlier(cx); if !self.ready.is_empty() { return Poll::Ready(Ok(())); @@ -269,7 +235,7 @@ where // are `Arc`-shared, so cloning is cheap. 
let mut svc = picked.clone(); let outlier_state = picked.outlier().clone(); - let registry = self.outlier.clone(); + let registry = self.outlier.as_ref().map(|o| o.registry().clone()); LbFuture::Pending(Box::pin(async move { tower::ServiceExt::ready(&mut svc) .await @@ -855,10 +821,11 @@ mod tests { // poll_ready drains the ejection signal and moves 8084. let _ = poll_ready_now(&mut lb); + let ejected = lb.outlier.as_ref().unwrap().ejected(); assert!( - lb.ejected.contains_key(&addr(8084)), + ejected.contains_key(&addr(8084)), "8084 should be ejected; ejected map: {:?}, ready keys: {:?}", - lb.ejected.keys().collect::>(), + ejected.keys().collect::>(), lb.ready.keys().collect::>(), ); assert!(!lb.ready.contains_key(&addr(8084))); @@ -882,7 +849,7 @@ mod tests { call_each(&mut lb, 50).await; let _ = poll_ready_now(&mut lb); - assert_eq!(lb.ejected.len(), 0); + assert_eq!(lb.outlier.as_ref().unwrap().ejected().len(), 0); assert_eq!(lb.ready.len(), 5); } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 1cbfb6233..6ccad8d77 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -27,14 +27,20 @@ //! //! 
[gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md +use std::collections::HashMap; +use std::pin::Pin; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; +use std::task::{Context, Poll}; use std::time::Instant; use dashmap::DashMap; +use indexmap::IndexMap; +use tokio_stream::StreamMap; +use tokio_stream::wrappers::WatchStream; use crate::client::endpoint::EndpointAddress; -use crate::client::loadbalance::channel_state::OutlierChannelState; +use crate::client::loadbalance::channel_state::{OutlierChannelState, ReadyChannel}; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::OutlierDetectionConfig; @@ -236,6 +242,87 @@ pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { AbortOnDrop(task) } +/// All per-LB outlier-detection state: the shared registry, the pool +/// of currently-ejected channels (whose connections are kept alive +/// across ejection), the per-channel ejection-signal streams +/// aggregated for O(1) observation in `poll_ready`, and the handle to +/// the housekeeping actor (dropped with the LB). +/// +/// `LoadBalancer` holds this as `Option>`: `None` +/// when outlier detection is disabled, `Some` when enabled. +pub(crate) struct OutlierDetector { + registry: Arc, + ejected: HashMap>, + ejection_signals: StreamMap>, + _actor: AbortOnDrop, +} + +impl OutlierDetector { + /// Build from a registry, spawning the housekeeping actor. + pub(crate) fn new(registry: Arc) -> Self { + let _actor = spawn_actor(registry.clone()); + Self { + registry, + ejected: HashMap::new(), + ejection_signals: StreamMap::new(), + _actor, + } + } + + /// Shared registry handle — clone to hand to the data path. + pub(crate) fn registry(&self) -> &Arc { + &self.registry + } + + /// Register a newly-connected channel for tracking and subscribe + /// to its ejection signal. Returns the per-channel state for the + /// load balancer to wire into [`ReadyChannel`]. 
+ pub(crate) fn register(&mut self, addr: EndpointAddress) -> Arc { + let state = self.registry.add_channel(addr.clone()); + self.ejection_signals + .insert(addr, WatchStream::from_changes(state.subscribe())); + state + } + + /// Drop all bookkeeping for `addr`: ejection slot, signal stream, + /// registry entry. + pub(crate) fn forget(&mut self, addr: &EndpointAddress) { + self.ejected.remove(addr); + self.ejection_signals.remove(addr); + self.registry.remove_channel(addr); + } + + /// Drain ejection-signal transitions, moving channels between + /// `ready` and the internal ejected pool. O(k) per call where k is + /// the number of pending signal changes. + pub(crate) fn poll_signals( + &mut self, + cx: &mut Context<'_>, + ready: &mut IndexMap>, + ) { + use futures_core::Stream; + while let Poll::Ready(Some((addr, ejected))) = + Pin::new(&mut self.ejection_signals).poll_next(cx) + { + if ejected { + if let Some(ch) = ready.swap_remove(&addr) { + tracing::debug!("outlier detection: eject {addr}"); + self.ejected.insert(addr, ch); + } + } else if let Some(ch) = self.ejected.remove(&addr) { + tracing::debug!("outlier detection: uneject {addr}"); + ready.insert(addr, ch); + } + } + } + + /// Number of currently-ejected channels. + #[cfg(test)] + pub(crate) fn ejected(&self) -> &HashMap> { + &self.ejected + } +} + /// Return true with probability `pct / 100` (clamped at 100 ⇒ always). 
fn roll(rng: &dyn Rng, pct: u8) -> bool { if pct >= 100 { From be41f3fd1fa03365b6457509d4b628827531a111 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 11 May 2026 14:58:54 -0700 Subject: [PATCH 24/39] fix(tonic-xds): preserve outlier-detection state across re-insert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Match grpc-go (`internal/xds/balancer/outlierdetection`) and Envoy (`BaseDynamicClusterImpl::updateDynamicHostList` reusing existing `HostSharedPtr`s): outlier-detection state is keyed by stable endpoint identity and survives a transient discovery flap. Previously, every `Change::Insert` ran the same purge path as `Change::Remove`, wiping the registry entry along with the connecting / ready / ejected slots — a brief disappearance lost the channel's counters and ejection multiplier. Split the path: - `purge_endpoint` (Remove) — cancels connecting, clears ready, and drops all outlier bookkeeping including the registry entry. - `reset_active_slots` (Insert) — cancels connecting, clears ready, and drops the obsolete `ReadyChannel` from the detector's ejected pool, but leaves the registry entry and ejection-signal subscription intact. `OutlierDetector::register` now only inserts a new signal subscription when one is not already present, so a pending ejection transition is not dropped by a redundant resubscribe. `poll_connecting` checks the preserved `state.is_ejected()` and routes a re-discovered ejected channel directly into the ejected pool via the new `place_ejected`, avoiding any window where traffic could be routed to a logically ejected channel. Adds two regression tests: - `test_outlier_detection_reinsert_preserves_state` — counters survive Insert for an existing address; same `Arc` is returned. - `test_outlier_detection_reinsert_while_ejected_stays_ejected` — re-discovered ejected channel lands in the ejected pool, not `ready`. 
--- .../src/client/loadbalance/loadbalancer.rs | 160 +++++++++++++++++- .../client/loadbalance/outlier_detection.rs | 43 ++++- 2 files changed, 191 insertions(+), 12 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 90effb476..7465ea997 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -118,9 +118,11 @@ where } } - /// Forget all per-endpoint state for `addr`: the connecting - /// future, the ready slot, and any outlier bookkeeping. - fn forget_endpoint(&mut self, addr: &EndpointAddress) { + /// Purge all per-endpoint state for `addr`: the connecting + /// future, the ready slot, and **all** outlier bookkeeping + /// (registry entry, ejection-signal subscription, ejected slot). + /// Used when discovery says the endpoint is gone from the cluster. + fn purge_endpoint(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); if let Some(o) = self.outlier.as_mut() { @@ -128,6 +130,24 @@ where } } + /// Clear stale slots that held the old service (in-flight + /// connecting future, ready entry, ejected entry) but **preserve** + /// the outlier-detection registry entry — counters, ejection + /// multiplier, and ejection flag carry across the reconnect. + /// Used when discovery re-inserts an endpoint we already track. + /// + /// This matches grpc-go and Envoy: outlier state is keyed by + /// stable endpoint identity and survives a transient discovery + /// flap, so a brief disappearance does not wipe what we already + /// know about the endpoint's health. + fn reset_active_slots(&mut self, addr: &EndpointAddress) { + let _ = self.connecting.cancel(addr); + self.ready.swap_remove(addr); + if let Some(o) = self.outlier.as_mut() { + o.clear_active_slots(addr); + } + } + /// Drain pending discovery events. 
Either resolves to an error /// ([`LbError::DiscoverClosed`] or [`LbError::DiscoverError`]) or stays /// pending — there is no success outcome since the loop only exits on @@ -144,13 +164,13 @@ where Some(Err(e)) => return Poll::Ready(LbError::DiscoverError(e.into())), Some(Ok(Change::Insert(addr, idle))) => { tracing::trace!("discovery: insert {addr}"); - self.forget_endpoint(&addr); + self.reset_active_slots(&addr); let connecting = idle.connect(self.connector.clone()); let _ = self.connecting.add(addr, connecting); } Some(Ok(Change::Remove(addr))) => { tracing::trace!("discovery: remove {addr}"); - self.forget_endpoint(&addr); + self.purge_endpoint(&addr); } } } @@ -160,14 +180,25 @@ where /// each bare service into a `ReadyChannel` using the outlier /// state from the detector (or a fresh state if outlier detection /// is disabled). + /// + /// If the preserved outlier state for a re-discovered endpoint + /// says it is still ejected, the new channel goes directly into + /// the detector's ejected pool — not the ready set — so no + /// traffic is routed to it until the housekeeping actor un-ejects. fn poll_connecting(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { let state = match self.outlier.as_mut() { Some(o) => o.register(addr.clone()), None => Arc::new(OutlierChannelState::new()), }; + let is_ejected = state.is_ejected(); let ready = ReadyChannel::new(addr.clone(), svc, state); - self.ready.insert(addr, ready); + match self.outlier.as_mut() { + Some(o) if is_ejected => o.place_ejected(addr, ready), + _ => { + self.ready.insert(addr, ready); + } + } } } @@ -869,4 +900,121 @@ mod tests { assert_eq!(registry.len(), 0); assert_eq!(lb.ready.len(), 0); } + + /// Re-discovering an endpoint (Insert for an address the LB + /// already tracks) must preserve its outlier-detection counters + /// and multiplier. Matches grpc-go / Envoy behavior. 
+ #[tokio::test] + async fn test_outlier_detection_reinsert_preserves_state() { + let (tx, discover) = new_discover(); + let (mut lb, connector, registry) = make_lb_with_outlier(discover, fp_config(50, 5, 3)); + + tx.send(Ok(Change::Insert(addr(8080), IdleChannel::new(addr(8080))))) + .await + .unwrap(); + drive_to_ready(&mut lb, &connector).await; + let state = registry.add_channel(addr(8080)); // idempotent — returns the existing state + // Drive some successes through the data path so the channel + // accumulates counter state worth preserving. + for _ in 0..3 { + lb.call("hello").await.unwrap(); + } + let (s_before, f_before) = state.counters(); + assert!( + s_before > 0, + "expected accumulated successes before re-insert" + ); + let registry_before = Arc::as_ptr(&state); + + // Re-insert the same address. State must survive. + tx.send(Ok(Change::Insert(addr(8080), IdleChannel::new(addr(8080))))) + .await + .unwrap(); + drive_to_ready(&mut lb, &connector).await; + + let state_after = registry.add_channel(addr(8080)); + assert_eq!( + Arc::as_ptr(&state_after), + registry_before, + "registry entry should be the same Arc — state continuity preserved", + ); + let (s_after, f_after) = state_after.counters(); + assert_eq!( + (s_after, f_after), + (s_before, f_before), + "counters must survive re-insert", + ); + assert_eq!(registry.len(), 1); + } + + /// A re-discovered endpoint whose preserved state says "ejected" + /// is placed directly into the ejected pool, not the ready set, so + /// no traffic is routed to it until the housekeeping actor + /// un-ejects it. + #[tokio::test] + async fn test_outlier_detection_reinsert_while_ejected_stays_ejected() { + let (tx, discover) = new_discover(); + let (mut lb, connector, registry) = make_lb_with_outlier(discover, fp_config(50, 5, 3)); + + // Bring up 5 endpoints; make 8084 fail enough to be ejected. 
+ for port in 8080..=8084 { + tx.send(Ok(Change::Insert(addr(port), IdleChannel::new(addr(port))))) + .await + .unwrap(); + } + drive_to_ready(&mut lb, &connector).await; + connector + .service(&addr(8084)) + .fail_call + .store(true, Ordering::Relaxed); + for _ in 0..100 { + let _ = lb.call("hello").await; + } + let _ = poll_ready_now(&mut lb); + let state_8084 = registry.add_channel(addr(8084)); + assert!( + state_8084.is_ejected(), + "8084 must be ejected before re-insert" + ); + assert!( + lb.outlier + .as_ref() + .unwrap() + .ejected() + .contains_key(&addr(8084)), + "8084 should be in the ejected pool" + ); + + // Re-insert 8084. The ejected slot's old ReadyChannel is + // dropped, but the registry entry (is_ejected=true) is + // preserved. The new channel should land in the ejected pool, + // not in `ready`. Drive the steps explicitly because + // `lb.ready` is non-empty throughout (8080..=8083), so + // `drive_to_ready` may return before the new 8084 connect + // resolves. + tx.send(Ok(Change::Insert(addr(8084), IdleChannel::new(addr(8084))))) + .await + .unwrap(); + // 1. Drain the Insert into `self.connecting`. + let _ = poll_ready_now(&mut lb); + // 2. Synchronously resolve the new connect future. + connector.resolve_all(); + // 3. Drain the now-ready connecting future; `poll_connecting` + // sees `state.is_ejected() == true` and calls `place_ejected`. 
+ let _ = poll_ready_now(&mut lb); + + assert!( + !lb.ready.contains_key(&addr(8084)), + "8084 must not be in ready while still logically ejected" + ); + assert!( + lb.outlier + .as_ref() + .unwrap() + .ejected() + .contains_key(&addr(8084)), + "8084 must remain in the ejected pool after re-insert" + ); + assert!(state_8084.is_ejected()); + } } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 6ccad8d77..e4fd685d2 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -274,24 +274,55 @@ impl OutlierDetector { &self.registry } - /// Register a newly-connected channel for tracking and subscribe - /// to its ejection signal. Returns the per-channel state for the - /// load balancer to wire into [`ReadyChannel`]. + /// Register a newly-connected channel for tracking and (on first + /// registration only) subscribe to its ejection signal. Returns + /// the per-channel state for the load balancer to wire into + /// [`ReadyChannel`]. + /// + /// When an endpoint is re-discovered (Insert for an address whose + /// registry entry was preserved), the existing signal subscription + /// is left in place so any pending ejection transition is not + /// dropped. pub(crate) fn register(&mut self, addr: EndpointAddress) -> Arc { let state = self.registry.add_channel(addr.clone()); - self.ejection_signals - .insert(addr, WatchStream::from_changes(state.subscribe())); + if !self.ejection_signals.contains_key(&addr) { + self.ejection_signals + .insert(addr, WatchStream::from_changes(state.subscribe())); + } state } /// Drop all bookkeeping for `addr`: ejection slot, signal stream, - /// registry entry. + /// registry entry. Used when the endpoint is removed from the + /// cluster. 
pub(crate) fn forget(&mut self, addr: &EndpointAddress) { self.ejected.remove(addr); self.ejection_signals.remove(addr); self.registry.remove_channel(addr); } + /// Drop the ejected-pool entry for `addr` (which holds an obsolete + /// `ReadyChannel`) but preserve the registry entry — counters, + /// ejection multiplier, and ejection flag carry across the + /// reconnect. Used when an endpoint is re-discovered. + /// + /// Matches grpc-go (`internal/xds/balancer/outlierdetection`) and + /// Envoy (`BaseDynamicClusterImpl::updateDynamicHostList` reusing + /// existing `HostSharedPtr`s): outlier state is keyed by stable + /// endpoint identity and survives transient discovery flaps. + pub(crate) fn clear_active_slots(&mut self, addr: &EndpointAddress) { + self.ejected.remove(addr); + } + + /// Place a freshly-connected channel directly into the ejected + /// pool. Used by the load balancer when the preserved state for a + /// re-discovered endpoint says it is still ejected; this avoids a + /// brief window of routing traffic to a logically-ejected channel + /// until the housekeeping actor un-ejects it. + pub(crate) fn place_ejected(&mut self, addr: EndpointAddress, ch: ReadyChannel) { + self.ejected.insert(addr, ch); + } + /// Drain ejection-signal transitions, moving channels between /// `ready` and the internal ejected pool. O(k) per call where k is /// the number of pending signal changes. 
From 6d5324bff73e6776b5eb5382accc4b61c5d12d49 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 10:54:07 -0700 Subject: [PATCH 25/39] refactor(tonic-xds): drive outlier ejection through the channel state machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the type-state machinery from `channel_state.rs` (`ReadyChannel::eject` → `EjectedChannel` → `UnejectedChannel`) as the primary mechanism for outlier-detection ejection, retiring the parallel `watch::Sender` + dual-map design. The compile-time invariant that ejected channels cannot be picked is now enforced by the type system: the picker takes `ReadyChannel`, ejected channels live in a `KeyedFutures<_, UnejectedChannel<_>>` mirroring the existing pattern for `ConnectingChannel`. This brings the outlier-detection LB integration in line with the project's existing idioms and gives the previously-unused channel state machine its first production caller. Architecture: - **Data path** still uses `OutlierStatsRegistry::record_outcome` to apply the failure-percentage algorithm per-RPC. On transition to ejected the registry sends the address through an mpsc `UnboundedSender` rather than flipping a watch flag. - **LoadBalancer** drains the mpsc in `poll_ready`, consumes the matching `ReadyChannel` via `.eject(EjectionConfig { timeout, .. })`, and tracks the resulting `EjectedChannel` in a second `KeyedFutures`. Each ejected channel's internal `Sleep` fires exactly at `base × multiplier` (capped at `max_ejection_time`), yielding `UnejectedChannel::Ready(svc)`; `poll_unejection` drains it on the next `poll_ready` and routes the channel back into `ready`. - **Housekeeping actor** simplifies: it resets counters and decrements multipliers on the `config.interval` boundary, but no longer un-ejects — un-ejection is timer-driven by `EjectedChannel`. 
`OutlierStatsRegistry` gains two methods: - `note_uneject(state)` — clears the `ejected_at_nanos` atomic on the channel state and decrements `ejected_count`. Called by the LB when an `EjectedChannel`'s timer fires. - `remaining_ejection(state, now)` — computes how much of the ejection window is left, capped by `max_ejection_time`. Used by the LB on initial ejection (full duration) and on re-discovery (remaining duration) to size the `EjectionConfig::timeout`. `OutlierChannelState` drops the `watch::Sender` field entirely; `is_ejected` / `try_eject` / `try_uneject` now use atomic CAS on `ejected_at_nanos` as the single source of truth. The `OutlierDetector` struct simplifies to `{ registry, eject_rx, _actor }` — no generic parameter, no internal `ejected` map, no signal-stream aggregator. Re-discovery while ejected (Insert for an address whose preserved state says `is_ejected`) re-ejects the new channel with the `remaining_ejection` duration so the original backoff is honored rather than restarted; if the deadline has already passed, the channel is un-ejected immediately. Behavior matches grpc-go and Envoy. Adds tests: - `OutlierStatsRegistry::{remaining_ejection,note_uneject}` — five new unit tests covering full duration, cap, mid-eject subtraction, past-deadline, and not-ejected cases. - `ejection_dispatches_address_through_mpsc` — verifies the data path sends through the mpsc on transition. - `housekeeping_leaves_ejected_multipliers_alone` — guards the new invariant that the actor no longer touches ejected channels. - `test_outlier_detection_timer_driven_unejection` — end-to-end LB test that an ejected channel returns to `ready` after `base × multiplier` elapses (with `tokio::time::advance`). Adds `KeyedFutures::contains_key` for test access; no production caller depends on it. 
--- .../src/client/loadbalance/channel_state.rs | 95 ++-- .../src/client/loadbalance/keyed_futures.rs | 7 + .../src/client/loadbalance/loadbalancer.rs | 322 ++++++++++--- .../client/loadbalance/outlier_detection.rs | 445 ++++++++++-------- 4 files changed, 551 insertions(+), 318 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index b87414bc1..472ba95c8 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -31,7 +31,6 @@ use std::task::{Context, Poll}; use std::time::{Duration, Instant}; use pin_project_lite::pin_project; -use tokio::sync::watch; use tower::Service; use tower::load::Load; @@ -72,22 +71,21 @@ impl EndpointCounters { } /// Per-channel outlier-detection state, shared (via `Arc`) between -/// the data path (per-RPC outcome recording + threshold-based ejection) -/// and the outlier-detection actor (interval-based housekeeping). +/// the data path (per-RPC outcome recording + threshold-based ejection), +/// the outlier-detection actor (interval-based housekeeping), and the +/// load balancer (consults `is_ejected` / `ejected_duration` on +/// reconnect). /// -/// Ejection is edge-triggered: callers flip the flag via [`eject`] / -/// [`uneject`]; observers poll `Receiver::changed()` (typically inside -/// a `FuturesUnordered`) to react in O(1) on each transition. -/// -/// All fields are atomics or wrapped in lock-free primitives so the -/// data path can mutate them without locking. -/// -/// [`eject`]: Self::eject -/// [`uneject`]: Self::uneject +/// All fields are atomics so the data path can mutate them without +/// locking. Ejection state is encoded in [`Self::ejected_at_nanos`]: +/// zero means not ejected, non-zero is the nanos-since-epoch of the +/// ejection's start. 
[`Self::try_eject`] / [`Self::try_uneject`] use +/// CAS to flip the field atomically and report whether the transition +/// fired (so callers can update registry-level counters exactly once +/// per transition). #[derive(Debug)] pub(crate) struct OutlierChannelState { counters: EndpointCounters, - eject_tx: watch::Sender, /// Whether this channel currently contributes to the registry's /// `qualifying_count`. Set when `total` first reaches /// `request_volume` in the current interval; cleared on counter @@ -97,7 +95,8 @@ pub(crate) struct OutlierChannelState { /// ejection; decremented (saturating) on each healthy interval. ejection_multiplier: AtomicU32, /// `0` when not ejected. Otherwise nanos since [`Self::epoch`] of - /// the current ejection's start. + /// the current ejection's start. Single source of truth for + /// "is this channel ejected right now?". ejected_at_nanos: AtomicU64, /// Reference instant used as the origin for `ejected_at_nanos`. /// Established at construction and never changes. @@ -112,10 +111,8 @@ impl Default for OutlierChannelState { impl OutlierChannelState { pub(crate) fn new() -> Self { - let (eject_tx, _) = watch::channel(false); Self { counters: EndpointCounters::default(), - eject_tx, is_qualifying: AtomicBool::new(false), ejection_multiplier: AtomicU32::new(0), ejected_at_nanos: AtomicU64::new(0), @@ -158,53 +155,39 @@ impl OutlierChannelState { self.is_qualifying.swap(false, Ordering::AcqRel) } - /// Flip the ejection flag to `true`. Returns `true` if this call - /// performed the false → true transition (so callers can update - /// registry-level counters exactly once per ejection). - /// Records the ejection timestamp and bumps the multiplier. + /// Atomically mark this channel as ejected starting at `now`. + /// Returns `true` if this call performed the not-ejected → + /// ejected transition (so callers can update registry-level + /// counters exactly once per ejection). Bumps the multiplier on + /// transition. 
pub(crate) fn try_eject(&self, now: Instant) -> bool { - let won = self.eject_tx.send_if_modified(|state| { - if *state { - false - } else { - *state = true; - true - } - }); - if !won { - return false; - } let nanos = now .saturating_duration_since(self.epoch) .as_nanos() .min(u64::MAX as u128) as u64; - // Use 1 as a sentinel if the channel was created at exactly - // `now`, since 0 means "not ejected". - self.ejected_at_nanos.store(nanos.max(1), Ordering::Relaxed); + // 0 means "not ejected"; use 1 as a sentinel if the channel + // was created at exactly `now`. + let stamp = nanos.max(1); + if self + .ejected_at_nanos + .compare_exchange(0, stamp, Ordering::AcqRel, Ordering::Relaxed) + .is_err() + { + return false; + } self.ejection_multiplier.fetch_add(1, Ordering::Relaxed); true } - /// Flip the ejection flag back to `false`. Returns `true` if this - /// call performed the true → false transition. + /// Atomically clear the ejection. Returns `true` if this call + /// performed the ejected → not-ejected transition. pub(crate) fn try_uneject(&self) -> bool { - let won = self.eject_tx.send_if_modified(|state| { - if *state { - *state = false; - true - } else { - false - } - }); - if won { - self.ejected_at_nanos.store(0, Ordering::Relaxed); - } - won + self.ejected_at_nanos.swap(0, Ordering::AcqRel) != 0 } /// Current ejection state. pub(crate) fn is_ejected(&self) -> bool { - *self.eject_tx.borrow() + self.ejected_at_nanos.load(Ordering::Acquire) != 0 } /// Returns the elapsed time since this channel was ejected, or @@ -232,14 +215,6 @@ impl OutlierChannelState { } } - /// Subscribe to ejection-state changes. The returned receiver's - /// `changed()` future resolves on each transition; consumers - /// typically push it into a `FuturesUnordered`. - #[allow(dead_code)] // wired by the LoadBalancer in a follow-up PR. 
- pub(crate) fn subscribe(&self) -> watch::Receiver { - self.eject_tx.subscribe() - } - /// Test-only setter for the ejection multiplier; lets tests drive /// housekeeping behavior without going through `try_eject`. #[cfg(test)] @@ -360,11 +335,15 @@ impl ReadyChannel { } /// Per-channel outlier-detection state. Cloned cheaply via `Arc`. - #[allow(dead_code)] // consumed by the LoadBalancer in a follow-up PR. pub(crate) fn outlier(&self) -> &Arc { &self.outlier } + /// Endpoint address this channel was created for. + pub(crate) fn addr(&self) -> &EndpointAddress { + &self.addr + } + /// Eject this channel (e.g., due to outlier detection). Consumes /// self. The outlier state remains in the registry; only the /// service and address are passed into [`EjectedChannel`] (which diff --git a/tonic-xds/src/client/loadbalance/keyed_futures.rs b/tonic-xds/src/client/loadbalance/keyed_futures.rs index 74319c6f3..c7f48aeaf 100644 --- a/tonic-xds/src/client/loadbalance/keyed_futures.rs +++ b/tonic-xds/src/client/loadbalance/keyed_futures.rs @@ -89,6 +89,13 @@ where self.futures.len() } + /// Returns true if a future is currently tracked for `key`. + /// Cancelled-but-not-yet-drained futures do not count, since their + /// cancellation token entry is removed eagerly by [`Self::cancel`]. + pub(crate) fn contains_key(&self, key: &K) -> bool { + self.cancellations.contains_key(key) + } + /// Advance the internal futures. Yields `(K, T)` when a future completes, /// skipping cancelled futures silently. /// diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 7465ea997..4cedff7dd 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -4,27 +4,35 @@ //! [`IdleChannel`]s), manages the connection lifecycle via the channel state //! machine, and routes requests to ready endpoints via a [`ChannelPicker`]. //! -//! 
Outlier detection is integrated via an optional -//! [`OutlierDetector`], which bundles the shared -//! [`OutlierStatsRegistry`], the ejected-channel pool, the per-channel -//! ejection-signal streams, and the housekeeping actor handle. -//! Ejection decisions are made on the data path (per-RPC) and surfaced -//! to `poll_ready` via per-channel `watch::Receiver` streams -//! aggregated in a `StreamMap`. The LB then moves the corresponding -//! [`ReadyChannel`] between its `ready` map and the detector's ejected -//! pool in O(1) per transition. +//! Outlier detection is integrated via an optional [`OutlierDetector`]. +//! Ejection decisions originate on the data path (per-RPC) and are +//! signaled to the LB via an mpsc channel. The LB consumes the named +//! [`ReadyChannel`] via [`ReadyChannel::eject`], obtaining an +//! [`EjectedChannel`] whose internal sleep fires exactly at +//! `base × multiplier` (capped by `max_ejection_time`); ejected +//! channels live in a second [`KeyedFutures`] (mirroring the existing +//! pattern for `ConnectingChannel`) until their timer yields +//! [`UnejectedChannel`], at which point the channel is routed back +//! into `ready` (`UnejectedChannel::Ready`) or `connecting` +//! (`UnejectedChannel::Connecting`). +//! +//! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel +//! 
[`UnejectedChannel`]: crate::client::loadbalance::channel_state::UnejectedChannel use std::future::Future; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll, ready}; +use std::time::{Duration, Instant}; use indexmap::IndexMap; use tower::Service; use tower::discover::{Change, Discover}; use crate::client::endpoint::{Connector, EndpointAddress}; -use crate::client::loadbalance::channel_state::{IdleChannel, OutlierChannelState, ReadyChannel}; +use crate::client::loadbalance::channel_state::{ + EjectionConfig, IdleChannel, OutlierChannelState, ReadyChannel, UnejectedChannel, +}; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; use crate::client::loadbalance::outlier_detection::{OutlierDetector, OutlierStatsRegistry}; @@ -75,10 +83,16 @@ pub(crate) struct LoadBalancer { connecting: KeyedFutures, /// Ready-to-serve channels, keyed by endpoint address. ready: IndexMap>, - /// All per-LB outlier-detection state — the shared registry, the - /// ejected pool, the ejection-signal streams, and the - /// housekeeping actor handle. `None` disables outlier detection. - outlier: Option>, + /// Channels currently ejected by outlier detection. Each entry is + /// an [`EjectedChannel`] whose `Sleep` fires when the ejection + /// window expires; the resolved [`UnejectedChannel`] is drained in + /// `poll_ready` and routed back into `ready` (or `connecting` if + /// the underlying connection needs replacing). + ejected: KeyedFutures>, + /// Outlier-detection plumbing: shared registry, eject-signal + /// receiver, and the housekeeping actor handle. `None` disables + /// outlier detection. + outlier: Option, /// Channel picker for load balancing. picker: Arc, Req> + Send + Sync>, } @@ -88,7 +102,7 @@ where D: Discover + Unpin, D::Error: Into, C: Connector + Send + Sync + 'static, - C::Service: Send + 'static, + C::Service: Clone + Send + 'static, { /// Create a load balancer with no outlier detection. 
pub(crate) fn new( @@ -113,28 +127,31 @@ where connector, connecting: KeyedFutures::new(), ready: IndexMap::new(), + ejected: KeyedFutures::new(), outlier: outlier.map(OutlierDetector::new), picker, } } /// Purge all per-endpoint state for `addr`: the connecting - /// future, the ready slot, and **all** outlier bookkeeping - /// (registry entry, ejection-signal subscription, ejected slot). - /// Used when discovery says the endpoint is gone from the cluster. + /// future, the ready slot, the ejected channel (if any), and the + /// outlier-detection registry entry. Used when discovery says the + /// endpoint is gone from the cluster. fn purge_endpoint(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); - if let Some(o) = self.outlier.as_mut() { - o.forget(addr); + let _ = self.ejected.cancel(addr); + if let Some(o) = self.outlier.as_ref() { + o.registry().remove_channel(addr); } } /// Clear stale slots that held the old service (in-flight - /// connecting future, ready entry, ejected entry) but **preserve** - /// the outlier-detection registry entry — counters, ejection - /// multiplier, and ejection flag carry across the reconnect. - /// Used when discovery re-inserts an endpoint we already track. + /// connecting future, ready entry, ejected channel) but + /// **preserve** the outlier-detection registry entry — counters, + /// ejection multiplier, and ejection flag carry across the + /// reconnect. Used when discovery re-inserts an endpoint we + /// already track. /// /// This matches grpc-go and Envoy: outlier state is keyed by /// stable endpoint identity and survives a transient discovery @@ -143,9 +160,7 @@ where fn reset_active_slots(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); - if let Some(o) = self.outlier.as_mut() { - o.clear_active_slots(addr); - } + let _ = self.ejected.cancel(addr); } /// Drain pending discovery events. 
Either resolves to an error @@ -176,37 +191,140 @@ where } } - /// Drain completed connection futures into the ready set. Wraps - /// each bare service into a `ReadyChannel` using the outlier - /// state from the detector (or a fresh state if outlier detection - /// is disabled). + /// Drain completed connection futures. Wraps each bare service + /// into a `ReadyChannel` using the outlier state from the + /// registry (or a fresh state if outlier detection is disabled). /// /// If the preserved outlier state for a re-discovered endpoint - /// says it is still ejected, the new channel goes directly into - /// the detector's ejected pool — not the ready set — so no - /// traffic is routed to it until the housekeeping actor un-ejects. + /// says it is still ejected, the new channel is re-ejected with + /// the *remaining* ejection time so the ongoing backoff is + /// honored. If the deadline has already passed, the channel is + /// un-ejected immediately and routed to `ready`. fn poll_connecting(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { - let state = match self.outlier.as_mut() { - Some(o) => o.register(addr.clone()), + let state = match self.outlier.as_ref() { + Some(o) => o.registry().add_channel(addr.clone()), None => Arc::new(OutlierChannelState::new()), }; - let is_ejected = state.is_ejected(); - let ready = ReadyChannel::new(addr.clone(), svc, state); - match self.outlier.as_mut() { - Some(o) if is_ejected => o.place_ejected(addr, ready), - _ => { - self.ready.insert(addr, ready); + let ready = ReadyChannel::new(addr.clone(), svc, state.clone()); + let remaining = self + .outlier + .as_ref() + .and_then(|o| o.registry().remaining_ejection(&state, Instant::now())); + self.place_after_connect(addr, ready, remaining); + } + } + + /// Route a freshly-connected `ReadyChannel` into the right pool + /// based on the preserved outlier state's `remaining` ejection + /// duration. 
Factored out so `poll_connecting` stays terse and + /// the three cases (fresh, mid-eject, past-deadline) are visible. + fn place_after_connect( + &mut self, + addr: EndpointAddress, + ready: ReadyChannel, + remaining: Option, + ) { + match remaining { + None => { + self.ready.insert(addr, ready); + } + Some(d) if d.is_zero() => { + if let Some(o) = self.outlier.as_ref() { + o.registry().note_uneject(ready.outlier()); + } + self.ready.insert(addr, ready); + } + Some(d) => { + let ejected = ready.eject( + EjectionConfig { + timeout: d, + needs_reconnect: false, + }, + self.connector.clone(), + ); + tracing::debug!("outlier detection: re-eject {addr} for {d:?}"); + let _ = self.ejected.add(addr, ejected); + } + } + } + + /// Drain eject requests from the outlier detector's mpsc and + /// transition the named `ReadyChannel`s into ejected ones. The + /// per-channel ejection state has already been flipped by + /// `record_outcome`; this step is the visible transition on the + /// LB side. + fn poll_eject_requests(&mut self, cx: &mut Context<'_>) { + loop { + let Some(o) = self.outlier.as_mut() else { + return; + }; + let addr = match o.poll_eject_request(cx) { + Poll::Ready(Some(a)) => a, + _ => return, + }; + let registry = o.registry().clone(); + // The eject signal arrives once `try_eject` has flipped + // the channel's state and the cluster-wide + // `ejected_count`. If the channel is no longer in `ready` + // (e.g. discovery removed it), there's nothing to do. + let Some(ch) = self.ready.swap_remove(&addr) else { + continue; + }; + let state = ch.outlier().clone(); + match registry.remaining_ejection(&state, Instant::now()) { + Some(d) if !d.is_zero() => { + let ejected = ch.eject( + EjectionConfig { + timeout: d, + needs_reconnect: false, + }, + self.connector.clone(), + ); + tracing::debug!("outlier detection: eject {addr} for {d:?}"); + let _ = self.ejected.add(addr, ejected); + } + Some(_) => { + // Deadline already past — un-eject immediately. 
+ registry.note_uneject(&state); + self.ready.insert(addr, ch); + } + None => { + // State is no longer ejected (concurrent uneject?) — restore. + self.ready.insert(addr, ch); } } } } - /// Drain outlier ejection-signal transitions, moving channels - /// between `ready` and the detector's ejected pool. - fn poll_outlier(&mut self, cx: &mut Context<'_>) { - if let Some(o) = self.outlier.as_mut() { - o.poll_signals(cx, &mut self.ready); + /// Drain completed `EjectedChannel` timers. Each yields either an + /// `UnejectedChannel::Ready(svc)` (timer expired, reuse the + /// connection) or `UnejectedChannel::Connecting(future)` (timer + /// expired but a fresh connect was requested). The address's + /// outlier state is cleared and the channel is routed back into + /// `ready` or `connecting` accordingly. + fn poll_unejection(&mut self, cx: &mut Context<'_>) { + while let Poll::Ready(Some((addr, unejected))) = self.ejected.poll_next(cx) { + let state = match self.outlier.as_ref() { + Some(o) => o.registry().add_channel(addr.clone()), + None => Arc::new(OutlierChannelState::new()), + }; + if let Some(o) = self.outlier.as_ref() { + o.registry().note_uneject(&state); + } + match unejected { + UnejectedChannel::Ready(svc) => { + tracing::debug!("outlier detection: uneject {addr}"); + let ready = ReadyChannel::new(addr.clone(), svc, state); + self.ready.insert(addr, ready); + } + UnejectedChannel::Connecting(future) => { + // `needs_reconnect = false` for A50, so this arm + // is unused today; handle it for completeness in + // case a future policy sets it. + let _ = self.connecting.add(addr, future); + } + } } } } @@ -228,8 +346,13 @@ where fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { let discover_result = self.poll_discover(cx); + // Drain un-ejection completions BEFORE servicing eject requests + // so a freshly un-ejected channel can immediately serve traffic + // (and so cluster-wide `ejected_count` is current when the next + // eject is evaluated). 
+ self.poll_unejection(cx); self.poll_connecting(cx); - self.poll_outlier(cx); + self.poll_eject_requests(cx); if !self.ready.is_empty() { return Poll::Ready(Ok(())); @@ -265,6 +388,7 @@ where // an owned service and outlier handle for the async block; both // are `Arc`-shared, so cloning is cheap. let mut svc = picked.clone(); + let addr = picked.addr().clone(); let outlier_state = picked.outlier().clone(); let registry = self.outlier.as_ref().map(|o| o.registry().clone()); LbFuture::Pending(Box::pin(async move { @@ -275,10 +399,9 @@ where if let Some(registry) = registry.as_ref() { // Per-RPC outlier detection: bump the channel's // counter and (inside `record_outcome`) possibly - // eject if the failure-percentage threshold is - // crossed. Treat any `Err` outcome as a failure for - // outlier purposes. - registry.record_outcome(&outlier_state, result.is_ok()); + // dispatch an eject request to the LB. Treat any + // `Err` outcome as a failure for outlier purposes. + registry.record_outcome(&addr, &outlier_state, result.is_ok()); } result.map_err(|e| LbError::LbChannelCallError(e.into())) })) @@ -850,13 +973,13 @@ mod tests { let _ = lb.call("hello").await; } - // poll_ready drains the ejection signal and moves 8084. + // poll_ready drains the eject mpsc and transitions 8084 into + // `self.ejected` via `ReadyChannel::eject`. 
let _ = poll_ready_now(&mut lb); - let ejected = lb.outlier.as_ref().unwrap().ejected(); assert!( - ejected.contains_key(&addr(8084)), - "8084 should be ejected; ejected map: {:?}, ready keys: {:?}", - ejected.keys().collect::>(), + lb.ejected.contains_key(&addr(8084)), + "8084 should be ejected; ejected.len()={}, ready keys: {:?}", + lb.ejected.len(), lb.ready.keys().collect::>(), ); assert!(!lb.ready.contains_key(&addr(8084))); @@ -880,7 +1003,7 @@ mod tests { call_each(&mut lb, 50).await; let _ = poll_ready_now(&mut lb); - assert_eq!(lb.outlier.as_ref().unwrap().ejected().len(), 0); + assert_eq!(lb.ejected.len(), 0); assert_eq!(lb.ready.len(), 5); } @@ -977,21 +1100,17 @@ mod tests { "8084 must be ejected before re-insert" ); assert!( - lb.outlier - .as_ref() - .unwrap() - .ejected() - .contains_key(&addr(8084)), + lb.ejected.contains_key(&addr(8084)), "8084 should be in the ejected pool" ); - // Re-insert 8084. The ejected slot's old ReadyChannel is - // dropped, but the registry entry (is_ejected=true) is - // preserved. The new channel should land in the ejected pool, - // not in `ready`. Drive the steps explicitly because - // `lb.ready` is non-empty throughout (8080..=8083), so - // `drive_to_ready` may return before the new 8084 connect - // resolves. + // Re-insert 8084. The ejected slot's old EjectedChannel is + // cancelled, but the registry entry (is_ejected=true, + // ejected_at_nanos preserved) survives. The new channel + // should be re-ejected with the *remaining* ejection time. + // Drive the steps explicitly because `lb.ready` is non-empty + // throughout (8080..=8083), so `drive_to_ready` may return + // before the new 8084 connect resolves. tx.send(Ok(Change::Insert(addr(8084), IdleChannel::new(addr(8084))))) .await .unwrap(); @@ -1000,7 +1119,7 @@ mod tests { // 2. Synchronously resolve the new connect future. connector.resolve_all(); // 3. 
Drain the now-ready connecting future; `poll_connecting` - // sees `state.is_ejected() == true` and calls `place_ejected`. + // sees `state.is_ejected() == true` and re-ejects. let _ = poll_ready_now(&mut lb); assert!( @@ -1008,13 +1127,64 @@ mod tests { "8084 must not be in ready while still logically ejected" ); assert!( - lb.outlier - .as_ref() - .unwrap() - .ejected() - .contains_key(&addr(8084)), + lb.ejected.contains_key(&addr(8084)), "8084 must remain in the ejected pool after re-insert" ); assert!(state_8084.is_ejected()); } + + /// Once `base × multiplier` time elapses on an ejected channel, + /// the [`EjectedChannel`]'s timer fires and the LB's + /// `poll_unejection` should move the channel back to `ready`. + #[tokio::test(start_paused = true)] + async fn test_outlier_detection_timer_driven_unejection() { + let mut config = fp_config(50, 5, 3); + // Short base for fast test; multiplier is 1 on first eject. + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(60); + + let (tx, discover) = new_discover(); + let (mut lb, connector, registry) = make_lb_with_outlier(discover, config); + + for port in 8080..=8084 { + tx.send(Ok(Change::Insert(addr(port), IdleChannel::new(addr(port))))) + .await + .unwrap(); + } + drive_to_ready(&mut lb, &connector).await; + connector + .service(&addr(8084)) + .fail_call + .store(true, Ordering::Relaxed); + for _ in 0..100 { + let _ = lb.call("hello").await; + } + let _ = poll_ready_now(&mut lb); + assert!( + lb.ejected.contains_key(&addr(8084)), + "8084 must be ejected before the timer fires" + ); + assert!(registry.add_channel(addr(8084)).is_ejected()); + + // Stop 8084 from failing so it can serve again, then advance + // past `base × multiplier = 10s`. 
+ connector + .service(&addr(8084)) + .fail_call + .store(false, Ordering::Relaxed); + tokio::time::advance(Duration::from_secs(11)).await; + // Drive poll_ready; `EjectedChannel`'s timer fires and + // `poll_unejection` routes 8084 back to ready. + let _ = poll_ready_now(&mut lb); + + assert!( + !lb.ejected.contains_key(&addr(8084)), + "8084 must leave the ejected pool once the timer fires" + ); + assert!( + lb.ready.contains_key(&addr(8084)), + "8084 must be back in ready after un-ejection" + ); + assert!(!registry.add_channel(addr(8084)).is_ejected()); + } } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index e4fd685d2..80666cb1f 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,46 +1,51 @@ //! gRFC A50 outlier detection. //! -//! The algorithm is split between the data path and a spawned actor: +//! The algorithm is split between the data path, the load balancer, +//! and a spawned actor: //! //! - **Per-RPC detection** runs inline on each call completion via //! [`OutlierStatsRegistry::record_outcome`]. The wrapper records the //! outcome on the channel's [`OutlierChannelState`], evaluates the -//! failure-percentage threshold against the channel's local -//! counters, and ejects the channel directly by flipping its -//! `watch::Sender`. Cluster-wide gates (`minimum_hosts`, -//! `max_ejection_percent`) are enforced via two atomic counters on -//! the registry, kept in sync as channels cross thresholds. +//! failure-percentage threshold, and on transition to ejected sends +//! the address through an mpsc channel for the LB to consume. +//! Cluster-wide gates (`minimum_hosts`, `max_ejection_percent`) are +//! enforced via two atomic counters on the registry, kept in sync +//! as channels cross thresholds. +//! - **The load balancer** drains the eject mpsc in `poll_ready`, +//! 
consumes the matching [`ReadyChannel`] via +//! [`ReadyChannel::eject`], and tracks the resulting +//! [`EjectedChannel`] in a `KeyedFutures`. Each ejected channel's +//! internal sleep fires at exactly `base × multiplier` (capped by +//! `max_ejection_time`) after ejection, yielding +//! [`UnejectedChannel::Ready`]; the LB drains it on the next +//! `poll_ready` and routes the channel back to the ready set. //! - **Interval-based housekeeping** runs in a spawned actor (see //! [`spawn_actor`]). It resets per-channel counters at the -//! `config.interval` boundary, un-ejects channels whose -//! `base × multiplier` backoff has elapsed, and decrements -//! multipliers for non-ejected channels. The actor never makes -//! ejection decisions. -//! -//! `LoadBalancer::poll_ready` observes ejections in O(1) per -//! transition by polling a `FuturesUnordered` -//! over each channel's signal. +//! `config.interval` boundary and decrements multipliers for +//! non-ejected channels. Un-ejection is timer-driven by +//! [`EjectedChannel`] — the actor never un-ejects. //! //! Only the failure-percentage algorithm is dispatched. The //! success-rate algorithm (cross-endpoint mean/stdev) is left to a //! follow-up. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md +//! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel +//! [`ReadyChannel::eject`]: crate::client::loadbalance::channel_state::ReadyChannel::eject +//! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel +//! 
[`UnejectedChannel::Ready`]: crate::client::loadbalance::channel_state::UnejectedChannel::Ready -use std::collections::HashMap; -use std::pin::Pin; use std::sync::Arc; +use std::sync::Mutex; use std::sync::atomic::{AtomicU64, Ordering}; use std::task::{Context, Poll}; -use std::time::Instant; +use std::time::{Duration, Instant}; use dashmap::DashMap; -use indexmap::IndexMap; -use tokio_stream::StreamMap; -use tokio_stream::wrappers::WatchStream; +use tokio::sync::mpsc; use crate::client::endpoint::EndpointAddress; -use crate::client::loadbalance::channel_state::{OutlierChannelState, ReadyChannel}; +use crate::client::loadbalance::channel_state::OutlierChannelState; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::OutlierDetectionConfig; @@ -65,8 +70,9 @@ impl Rng for FastRandRng { /// [`Self::record_outcome`] after each RPC completion. /// - The spawned actor task, which calls [`Self::run_housekeeping`] /// on every `config.interval` tick. -/// - The load balancer's `poll_ready`, which subscribes to per-channel -/// ejection signals via [`OutlierChannelState::subscribe`]. +/// - The load balancer's `poll_ready`, which drains the eject mpsc +/// (via [`OutlierDetector::poll_eject_request`]) and calls +/// [`Self::note_uneject`] when an `EjectedChannel`'s timer fires. pub(crate) struct OutlierStatsRegistry { /// Per-endpoint state, keyed by address. Inserted by the LB on /// channel creation and removed on disconnect. @@ -79,6 +85,15 @@ pub(crate) struct OutlierStatsRegistry { ejected_count: AtomicU64, config: OutlierDetectionConfig, rng: Box, + /// Sender half of the eject signal. `record_outcome` pushes an + /// address through on transition to ejected; the LB's + /// [`OutlierDetector`] drains the receiver in `poll_ready` and + /// consumes the matching `ReadyChannel`. + eject_tx: mpsc::UnboundedSender, + /// Receiver half, handed to the LB at construction time. 
Wrapped + /// in a `Mutex>` so [`Self::take_eject_rx`] can move it + /// out exactly once. Outside that hand-off there is no contention. + eject_rx: Mutex>>, } impl OutlierStatsRegistry { @@ -89,15 +104,28 @@ impl OutlierStatsRegistry { /// Build a registry with a custom [`Rng`]. pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Arc { + let (eject_tx, eject_rx) = mpsc::unbounded_channel(); Arc::new(Self { channels: DashMap::new(), qualifying_count: AtomicU64::new(0), ejected_count: AtomicU64::new(0), config, rng, + eject_tx, + eject_rx: Mutex::new(Some(eject_rx)), }) } + /// Take the eject-signal receiver. Called exactly once by + /// [`OutlierDetector::new`]. + fn take_eject_rx(&self) -> mpsc::UnboundedReceiver { + self.eject_rx + .lock() + .expect("eject_rx mutex poisoned") + .take() + .expect("OutlierStatsRegistry::take_eject_rx called more than once") + } + /// Register a channel and return the `Arc` /// the load balancer wires into the channel; the same `Arc` is /// retained in the registry so the actor can iterate it. If a @@ -132,8 +160,15 @@ impl OutlierStatsRegistry { /// Per-RPC entry point. Called by the load balancer's call wrapper /// after each RPC completion. Increments the channel's success or /// failure counter and then evaluates the failure-percentage - /// threshold; if all gates pass, ejects the channel inline. - pub(crate) fn record_outcome(&self, state: &OutlierChannelState, success: bool) { + /// threshold; if all gates pass and the channel was not already + /// ejected, marks it ejected and sends the address through the + /// eject mpsc for the LB to consume. 
+ pub(crate) fn record_outcome( + &self, + addr: &EndpointAddress, + state: &OutlierChannelState, + success: bool, + ) { if success { state.record_success(); } else { @@ -179,20 +214,60 @@ impl OutlierStatsRegistry { if state.try_eject(Instant::now()) { self.ejected_count.fetch_add(1, Ordering::Relaxed); + // The LB drains this in `poll_ready` and consumes the + // `ReadyChannel` via `ReadyChannel::eject`. If the LB has + // dropped its receiver (shutdown), the send fails silently + // — the channel will be cleaned up by `forget`. + let _ = self.eject_tx.send(addr.clone()); + } + } + + /// Clear the ejection on `state` and decrement the cluster-wide + /// `ejected_count`. Returns whether the transition fired (so + /// callers can guard against double-counting). Called by the LB + /// when an `EjectedChannel`'s timer fires and yields + /// `UnejectedChannel::Ready`. + pub(crate) fn note_uneject(&self, state: &OutlierChannelState) -> bool { + if state.try_uneject() { + self.ejected_count.fetch_sub(1, Ordering::Relaxed); + true + } else { + false } } - /// Interval-boundary housekeeping. Called by the spawned actor on - /// each `config.interval` tick. Resets counters, un-ejects - /// channels whose backoff has elapsed, and decrements multipliers - /// for non-ejected channels. - pub(crate) fn run_housekeeping(&self, now: Instant) { - // Cap the un-ejection backoff at `max(base, max_ejection_time)`. + /// Compute how long `state` still has to remain ejected, or + /// `None` if it is not currently ejected. Returns + /// `Some(Duration::ZERO)` if the deadline has already passed + /// (caller should un-eject immediately rather than starting a + /// fresh sleep). Used by the LB on initial ejection and on + /// re-discovery to size the `EjectionConfig::timeout`. 
+ pub(crate) fn remaining_ejection( + &self, + state: &OutlierChannelState, + now: Instant, + ) -> Option { + let elapsed = state.ejected_duration(now)?; + let multiplier = state.ejection_multiplier(); let cap = self .config .base_ejection_time .max(self.config.max_ejection_time); + let target = self + .config + .base_ejection_time + .checked_mul(multiplier) + .unwrap_or(cap) + .min(cap); + Some(target.checked_sub(elapsed).unwrap_or_default()) + } + /// Interval-boundary housekeeping. Called by the spawned actor on + /// each `config.interval` tick. Resets counters and decrements + /// multipliers for non-ejected channels. Does **not** un-eject — + /// un-ejection is timer-driven by each `EjectedChannel` and + /// handled by the LB when the channel resolves. + pub(crate) fn run_housekeeping(&self) { for entry in self.channels.iter() { let state = entry.value(); @@ -203,16 +278,7 @@ impl OutlierStatsRegistry { self.qualifying_count.fetch_sub(1, Ordering::Relaxed); } - if state.is_ejected() { - let multiplier = state.ejection_multiplier(); - let elapsed = state.ejected_duration(now).unwrap_or_default(); - if let Some(scaled) = self.config.base_ejection_time.checked_mul(multiplier) - && elapsed >= scaled.min(cap) - && state.try_uneject() - { - self.ejected_count.fetch_sub(1, Ordering::Relaxed); - } - } else { + if !state.is_ejected() { state.decrement_multiplier(); } } @@ -236,121 +302,57 @@ pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); loop { ticker.tick().await; - registry.run_housekeeping(Instant::now()); + registry.run_housekeeping(); } }); AbortOnDrop(task) } -/// All per-LB outlier-detection state: the shared registry, the pool -/// of currently-ejected channels (whose connections are kept alive -/// across ejection), the per-channel ejection-signal streams -/// aggregated for O(1) observation in `poll_ready`, and the handle to -/// the housekeeping actor (dropped with the LB). 
+/// Per-LB outlier-detection plumbing: the shared registry, the +/// receiver half of the eject signal mpsc, and the handle to the +/// housekeeping actor (dropped with the LB). /// -/// `LoadBalancer` holds this as `Option>`: `None` -/// when outlier detection is disabled, `Some` when enabled. -pub(crate) struct OutlierDetector { +/// `LoadBalancer` holds this as `Option`: `None` +/// when outlier detection is disabled, `Some` when enabled. The +/// pool of ejected channels themselves lives directly on the LB in a +/// `KeyedFutures<_, UnejectedChannel<_>>` — see the channel state +/// machine in [`channel_state`] for the type-state transitions. +/// +/// [`channel_state`]: crate::client::loadbalance::channel_state +pub(crate) struct OutlierDetector { registry: Arc, - ejected: HashMap>, - ejection_signals: StreamMap>, + eject_rx: mpsc::UnboundedReceiver, _actor: AbortOnDrop, } -impl OutlierDetector { - /// Build from a registry, spawning the housekeeping actor. +impl OutlierDetector { + /// Build from a registry, spawning the housekeeping actor and + /// taking ownership of the eject-signal receiver. pub(crate) fn new(registry: Arc) -> Self { + let eject_rx = registry.take_eject_rx(); let _actor = spawn_actor(registry.clone()); Self { registry, - ejected: HashMap::new(), - ejection_signals: StreamMap::new(), + eject_rx, _actor, } } - /// Shared registry handle — clone to hand to the data path. + /// Shared registry handle. pub(crate) fn registry(&self) -> &Arc { &self.registry } - /// Register a newly-connected channel for tracking and (on first - /// registration only) subscribe to its ejection signal. Returns - /// the per-channel state for the load balancer to wire into - /// [`ReadyChannel`]. - /// - /// When an endpoint is re-discovered (Insert for an address whose - /// registry entry was preserved), the existing signal subscription - /// is left in place so any pending ejection transition is not - /// dropped. 
- pub(crate) fn register(&mut self, addr: EndpointAddress) -> Arc { - let state = self.registry.add_channel(addr.clone()); - if !self.ejection_signals.contains_key(&addr) { - self.ejection_signals - .insert(addr, WatchStream::from_changes(state.subscribe())); - } - state - } - - /// Drop all bookkeeping for `addr`: ejection slot, signal stream, - /// registry entry. Used when the endpoint is removed from the - /// cluster. - pub(crate) fn forget(&mut self, addr: &EndpointAddress) { - self.ejected.remove(addr); - self.ejection_signals.remove(addr); - self.registry.remove_channel(addr); - } - - /// Drop the ejected-pool entry for `addr` (which holds an obsolete - /// `ReadyChannel`) but preserve the registry entry — counters, - /// ejection multiplier, and ejection flag carry across the - /// reconnect. Used when an endpoint is re-discovered. - /// - /// Matches grpc-go (`internal/xds/balancer/outlierdetection`) and - /// Envoy (`BaseDynamicClusterImpl::updateDynamicHostList` reusing - /// existing `HostSharedPtr`s): outlier state is keyed by stable - /// endpoint identity and survives transient discovery flaps. - pub(crate) fn clear_active_slots(&mut self, addr: &EndpointAddress) { - self.ejected.remove(addr); - } - - /// Place a freshly-connected channel directly into the ejected - /// pool. Used by the load balancer when the preserved state for a - /// re-discovered endpoint says it is still ejected; this avoids a - /// brief window of routing traffic to a logically-ejected channel - /// until the housekeeping actor un-ejects it. - pub(crate) fn place_ejected(&mut self, addr: EndpointAddress, ch: ReadyChannel) { - self.ejected.insert(addr, ch); - } - - /// Drain ejection-signal transitions, moving channels between - /// `ready` and the internal ejected pool. O(k) per call where k is - /// the number of pending signal changes. - pub(crate) fn poll_signals( + /// Poll for the next address whose data path has decided to + /// eject. 
Returns `Poll::Pending` when no eject decision is + /// queued; returns `Poll::Ready(None)` only if the registry has + /// been dropped (which can't happen while this detector holds an + /// `Arc`). + pub(crate) fn poll_eject_request( &mut self, cx: &mut Context<'_>, - ready: &mut IndexMap>, - ) { - use futures_core::Stream; - while let Poll::Ready(Some((addr, ejected))) = - Pin::new(&mut self.ejection_signals).poll_next(cx) - { - if ejected { - if let Some(ch) = ready.swap_remove(&addr) { - tracing::debug!("outlier detection: eject {addr}"); - self.ejected.insert(addr, ch); - } - } else if let Some(ch) = self.ejected.remove(&addr) { - tracing::debug!("outlier detection: uneject {addr}"); - ready.insert(addr, ch); - } - } - } - - /// Number of currently-ejected channels. - #[cfg(test)] - pub(crate) fn ejected(&self) -> &HashMap> { - &self.ejected + ) -> Poll> { + self.eject_rx.poll_recv(cx) } } @@ -426,15 +428,16 @@ mod tests { /// Drive `n` outcomes through `record_outcome` for one channel. fn drive( registry: &OutlierStatsRegistry, + a: &EndpointAddress, state: &OutlierChannelState, successes: u64, failures: u64, ) { for _ in 0..successes { - registry.record_outcome(state, true); + registry.record_outcome(a, state, true); } for _ in 0..failures { - registry.record_outcome(state, false); + registry.record_outcome(a, state, false); } } @@ -446,9 +449,9 @@ mod tests { let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(&registry, &s, 100, 0); + drive(&registry, &addr(port), &s, 100, 0); } - drive(&registry, &bad, 10, 90); + drive(&registry, &addr(8084), &bad, 10, 90); assert!(bad.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); } @@ -460,7 +463,7 @@ mod tests { for port in 8080..=8084 { let s = registry.add_channel(addr(port)); // 30% failure → below 50% threshold.
- drive(&registry, &s, 70, 30); + drive(&registry, &addr(port), &s, 70, 30); all.push(s); } for s in &all { @@ -475,7 +478,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); - drive(&registry, &s, 50, 50); + drive(&registry, &addr(port), &s, 50, 50); all.push(s); } for s in &all { @@ -490,7 +493,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8081 { let s = registry.add_channel(addr(port)); - drive(&registry, &s, 0, 100); + drive(&registry, &addr(port), &s, 0, 100); all.push(s); } for s in &all { @@ -502,10 +505,10 @@ mod tests { fn request_volume_filters_low_traffic() { let registry = OutlierStatsRegistry::with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); let bad = registry.add_channel(addr(8080)); - drive(&registry, &bad, 0, 5); + drive(&registry, &addr(8080), &bad, 0, 5); for port in 8081..=8084 { let s = registry.add_channel(addr(port)); - drive(&registry, &s, 200, 0); + drive(&registry, &addr(port), &s, 200, 0); } assert!(!bad.is_ejected()); } @@ -522,7 +525,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); - drive(&registry, &s, 0, 100); + drive(&registry, &addr(port), &s, 0, 100); all.push(s); } for s in &all { @@ -538,15 +541,16 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { - let s = registry.add_channel(addr(port)); - all.push(s); + let a = addr(port); + let s = registry.add_channel(a.clone()); + all.push((a, s)); } // Drive all hosts to bad state in parallel pseudo-order. - for s in &all { - drive(&registry, s, 0, 100); + for (a, s) in &all { + drive(&registry, a, s, 0, 100); } - let ejected = all.iter().filter(|s| s.is_ejected()).count(); + let ejected = all.iter().filter(|(_, s)| s.is_ejected()).count(); // 5 hosts × 20% = 1 max ejection.
assert_eq!(ejected, 1); } @@ -557,11 +561,11 @@ mod tests { let mut all = vec![]; for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(&registry, &s, 100, 0); + drive(&registry, &addr(port), &s, 100, 0); all.push(s); } let bad = registry.add_channel(addr(8084)); - drive(&registry, &bad, 0, 100); + drive(&registry, &addr(8084), &bad, 0, 100); assert!(bad.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); // Each healthy host crossed request_volume; bad too. So @@ -573,6 +577,25 @@ mod tests { assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); } + #[test] + fn ejection_dispatches_address_through_mpsc() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let mut rx = registry.take_eject_rx(); + let bad = registry.add_channel(addr(8084)); + for port in 8080..=8083 { + let s = registry.add_channel(addr(port)); + drive(&registry, &addr(port), &s, 100, 0); + } + drive(&registry, &addr(8084), &bad, 10, 90); + + // Eject dispatched exactly once via the mpsc.
+ assert_eq!(rx.try_recv(), Ok(addr(8084))); + assert!(matches!( + rx.try_recv(), + Err(mpsc::error::TryRecvError::Empty) + )); + } + // ----- Housekeeping ----- #[test] @@ -580,11 +603,11 @@ mod tests { let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(&registry, &s, 100, 0); + drive(&registry, &addr(port), &s, 100, 0); } assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); - registry.run_housekeeping(Instant::now()); + registry.run_housekeeping(); assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 0); for port in 8080..=8083 { let s = registry.channels.get(&addr(port)).unwrap(); @@ -593,59 +616,113 @@ mod tests { } #[test] - fn housekeeping_unejects_after_base_time() { + fn housekeeping_decrements_multiplier_on_healthy_interval() { + let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + // Force multiplier to 3 directly (no traffic, no eject). + s.set_ejection_multiplier(3); + + registry.run_housekeeping(); + assert_eq!(s.ejection_multiplier(), 2); + } + + #[test] + fn housekeeping_leaves_ejected_multipliers_alone() { + let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + s.try_eject(Instant::now()); + s.set_ejection_multiplier(3); + + registry.run_housekeeping(); + // Ejected channels keep their multiplier; un-ejection is the + // LB's job (timer-driven via EjectedChannel).
+ assert_eq!(s.ejection_multiplier(), 3); + assert!(s.is_ejected()); + } + + // ----- remaining_ejection / note_uneject ----- + + #[test] + fn remaining_ejection_returns_full_duration_for_fresh_eject() { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); - - let bad = registry.add_channel(addr(8084)); - for port in 8080..=8083 { - let s = registry.add_channel(addr(port)); - drive(&registry, &s, 100, 0); - } - drive(&registry, &bad, 0, 100); - assert!(bad.is_ejected()); - - // Advance fewer than base_ejection_time ⇒ stays ejected. + let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); - registry.run_housekeeping(t0 + Duration::from_secs(9)); - assert!(bad.is_ejected()); - - // After base_ejection_time × 1 elapsed ⇒ uneject. - registry.run_housekeeping(t0 + Duration::from_secs(20)); - assert!(!bad.is_ejected()); - assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 0); + s.try_eject(t0); + // Multiplier is 1 after the first eject, so target = 10s. + let remaining = registry.remaining_ejection(&s, t0).unwrap(); + assert_eq!(remaining, Duration::from_secs(10)); } #[test] - fn housekeeping_decrements_multiplier_on_healthy_interval() { - let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + fn remaining_ejection_capped_at_max_ejection_time() { + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(15); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); let s = registry.add_channel(addr(8080)); - // Force multiplier to 3 directly (no traffic, no eject). - s.set_ejection_multiplier(3); + let t0 = Instant::now(); + s.try_eject(t0); + s.set_ejection_multiplier(10); // base * 10 = 100s, but cap = 15s.
+ let remaining = registry.remaining_ejection(&s, t0).unwrap(); + assert_eq!(remaining, Duration::from_secs(15)); + } - registry.run_housekeeping(Instant::now()); - assert_eq!(s.ejection_multiplier(), 2); + #[test] + fn remaining_ejection_subtracts_elapsed_for_re_discovery() { + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(30); + config.max_ejection_time = Duration::from_secs(60); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + let t0 = Instant::now(); + s.try_eject(t0); + // Re-discovered 10s into the ejection — should still have 20s left. + let remaining = registry + .remaining_ejection(&s, t0 + Duration::from_secs(10)) + .unwrap(); + assert_eq!(remaining, Duration::from_secs(20)); } #[test] - fn housekeeping_caps_ejection_at_max_ejection_time() { + fn remaining_ejection_zero_past_deadline() { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); - config.max_ejection_time = Duration::from_secs(15); + config.max_ejection_time = Duration::from_secs(60); let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + let t0 = Instant::now(); + s.try_eject(t0); + // 60s have passed but target is 10s — caller should un-eject. 
+ let remaining = registry + .remaining_ejection(&s, t0 + Duration::from_secs(60)) + .unwrap(); + assert_eq!(remaining, Duration::ZERO); + } + #[test] + fn remaining_ejection_none_when_not_ejected() { + let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + assert!(registry.remaining_ejection(&s, Instant::now()).is_none()); + } + + #[test] + fn note_uneject_clears_state_and_decrements_counter() { + let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); let s = registry.add_channel(addr(8080)); - // Pretend 8080 was ejected long ago with a huge multiplier. s.try_eject(Instant::now()); - s.set_ejection_multiplier(10); - registry.ejected_count.fetch_add(0, Ordering::Relaxed); // try_eject already added 1 + registry.ejected_count.fetch_add(1, Ordering::Relaxed); + assert!(s.is_ejected()); - // base * multiplier = 100s, but cap = 15s. Sweep at 16s ⇒ uneject. - let t0 = Instant::now(); - registry.run_housekeeping(t0 + Duration::from_secs(16)); + assert!(registry.note_uneject(&s)); assert!(!s.is_ejected()); + assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 0); + + // Second call is a no-op. 
+ assert!(!registry.note_uneject(&s)); } // ----- Spawned actor ----- @@ -687,7 +764,7 @@ mod tests { } #[test] - fn channel_state_try_eject_uneject_flips_signal() { + fn channel_state_try_eject_uneject_transitions_atomically() { let s = OutlierChannelState::new(); assert!(!s.is_ejected()); assert!(s.try_eject(Instant::now())); From 7cf9053857a0644f06bc02eb21b3a20041183970 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 11:01:33 -0700 Subject: [PATCH 26/39] fix(tonic-xds): decrement multiplier on un-eject to match A50 step 6.b MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A50 step 6 runs once per interval and (a) un-ejects hosts whose backoff has elapsed, then (b) decrements the multiplier for every non-ejected host — in the same sweep. Envoy implements this exactly, so a host un-ejected at sweep N has its multiplier decremented at sweep N. In this PR's design un-ejection is timer-driven (each EjectedChannel holds its own Sleep), decoupled from the housekeeping sweep. With the previous note_uneject, the multiplier was only decremented at the next housekeeping interval — leaving a window where a re-eject during that window would see a stale (one-higher) multiplier and back off too aggressively relative to the spec. Apply the decrement inside note_uneject so it happens atomically with the transition. The actor's housekeeping decrement still runs at each interval; saturating arithmetic keeps the eventual decrement-to-zero correct. Adds a focused test (`re_eject_after_uneject_uses_fresh_multiplier`) verifying that a re-ejection immediately after un-ejection sizes the remaining-ejection duration with the fresh multiplier (base × 1), not the stale one (base × 2). 
--- .../client/loadbalance/outlier_detection.rs | 63 +++++++++++++++++-- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 80666cb1f..bd19fc6bb 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -222,14 +222,25 @@ impl OutlierStatsRegistry { } } - /// Clear the ejection on `state` and decrement the cluster-wide - /// `ejected_count`. Returns whether the transition fired (so - /// callers can guard against double-counting). Called by the LB - /// when an `EjectedChannel`'s timer fires and yields - /// `UnejectedChannel::Ready`. + /// Clear the ejection on `state`, decrement the cluster-wide + /// `ejected_count`, and decrement the channel's ejection + /// multiplier (matching gRFC A50 step 6.b, which decrements + /// multiplier in the same sweep that un-ejects). Returns whether + /// the transition fired (so callers can guard against + /// double-counting). Called by the LB when an `EjectedChannel`'s + /// timer fires and yields `UnejectedChannel::Ready`. pub(crate) fn note_uneject(&self, state: &OutlierChannelState) -> bool { if state.try_uneject() { self.ejected_count.fetch_sub(1, Ordering::Relaxed); + // Per A50, the same sweep that un-ejects also decrements + // the multiplier. Since our un-ejection is timer-driven + // (decoupled from the housekeeping sweep), we apply the + // decrement here to avoid a window where a re-eject would + // see a stale (one-higher) multiplier and back off too + // aggressively. The actor's housekeeping decrement still + // runs at each interval; saturating arithmetic ensures + // the eventual decrement to zero stays correct. 
+ state.decrement_multiplier(); true } else { false @@ -713,16 +724,56 @@ mod tests { fn note_uneject_clears_state_and_decrements_counter() { let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); let s = registry.add_channel(addr(8080)); - s.try_eject(Instant::now()); + s.try_eject(Instant::now()); // bumps multiplier 0 → 1 registry.ejected_count.fetch_add(1, Ordering::Relaxed); assert!(s.is_ejected()); + assert_eq!(s.ejection_multiplier(), 1); assert!(registry.note_uneject(&s)); assert!(!s.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 0); + // A50 step 6.b: same sweep that un-ejects also decrements + // the multiplier. + assert_eq!(s.ejection_multiplier(), 0); // Second call is a no-op. assert!(!registry.note_uneject(&s)); + assert_eq!(s.ejection_multiplier(), 0); + } + + /// Re-ejecting a channel immediately after un-ejection should + /// produce a backoff sized for multiplier=1, not multiplier=2 — + /// i.e. it should *not* punish the channel for the previous + /// ejection that has just finished serving its cooldown. This is + /// what gRFC A50 prescribes and what Envoy does (un-eject and + /// decrement happen at the same sweep). + #[test] + fn re_eject_after_uneject_uses_fresh_multiplier() { + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(300); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + + let t0 = Instant::now(); + s.try_eject(t0); // multiplier 0 → 1 + registry.ejected_count.fetch_add(1, Ordering::Relaxed); + assert_eq!(s.ejection_multiplier(), 1); + + // Backoff elapses; LB calls note_uneject. + registry.note_uneject(&s); + assert_eq!(s.ejection_multiplier(), 0); + + // Channel immediately misbehaves again and gets re-ejected. 
+ let t1 = t0 + Duration::from_secs(11); + s.try_eject(t1); // multiplier 0 → 1, not 1 → 2 + assert_eq!(s.ejection_multiplier(), 1); + // Remaining ejection duration should be `base * 1 = 10s`, + // not `base * 2 = 20s`. + assert_eq!( + registry.remaining_ejection(&s, t1).unwrap(), + Duration::from_secs(10), + ); } // ----- Spawned actor ----- From 3ef3748fcbcd14b60840a59e64a45db2561e43d4 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 11:04:28 -0700 Subject: [PATCH 27/39] fix(tonic-xds): make decrement_multiplier atomic via fetch_update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The load-then-store implementation could lose decrements when: - actor housekeeping decrements concurrently with note_uneject (both call decrement_multiplier); - or either of those races a data-path try_eject (which does fetch_add on the same atomic). Swap to fetch_update with a saturating closure so the read-modify- write is atomic. Bias was bounded at ±1 before, so this is not a correctness fix per se — just closes a small race window cleanly. --- tonic-xds/src/client/loadbalance/channel_state.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 472ba95c8..3e6354e8d 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -207,12 +207,15 @@ impl OutlierChannelState { } /// Decrement the multiplier saturating at zero. Called by the - /// actor on healthy intervals. + /// actor on healthy intervals and by `note_uneject` on un-ejection. + /// Uses `fetch_update` so the load-and-store is atomic against + /// concurrent `try_eject` (`fetch_add`) and other decrements. 
pub(crate) fn decrement_multiplier(&self) { - let prev = self.ejection_multiplier.load(Ordering::Relaxed); - if prev > 0 { - self.ejection_multiplier.store(prev - 1, Ordering::Relaxed); - } + let _ = self + .ejection_multiplier + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |v| { + if v > 0 { Some(v - 1) } else { None } + }); } /// Test-only setter for the ejection multiplier; lets tests drive From 66d2d6e8e9e8d670a613845250b4bb0eec63d20d Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 11:08:28 -0700 Subject: [PATCH 28/39] fix(tonic-xds): error instead of panic when an OutlierStatsRegistry is wired twice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The registry's eject-signal mpsc receiver is one-shot — a registry can drive at most one LoadBalancer. The previous implementation panicked at runtime if a misuse handed the same registry to two `with_outlier` calls. Return a typed error (`RegistryAlreadyWired`) from `OutlierStatsRegistry::take_eject_rx`, propagated through `OutlierDetector::new` and `LoadBalancer::with_outlier`. `LoadBalancer::new` stays infallible because the `outlier=None` path does not invoke the registry hand-off. Adds `test_outlier_registry_cannot_be_wired_twice` to lock the contract. 
--- .../src/client/loadbalance/loadbalancer.rs | 54 ++++++++++++++++--- .../client/loadbalance/outlier_detection.rs | 33 ++++++++---- 2 files changed, 71 insertions(+), 16 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 4cedff7dd..63c4fedaf 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -35,7 +35,9 @@ use crate::client::loadbalance::channel_state::{ }; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; -use crate::client::loadbalance::outlier_detection::{OutlierDetector, OutlierStatsRegistry}; +use crate::client::loadbalance::outlier_detection::{ + OutlierDetector, OutlierStatsRegistry, RegistryAlreadyWired, +}; use crate::client::loadbalance::pickers::ChannelPicker; /// Future returned by [`LoadBalancer::call`]. @@ -110,27 +112,36 @@ where connector: Arc, picker: Arc, Req> + Send + Sync>, ) -> Self { - Self::with_outlier(discovery, connector, picker, None) + // Infallible: `with_outlier(_, _, _, None)` never touches the + // outlier-detection construction path. + match Self::with_outlier(discovery, connector, picker, None) { + Ok(lb) => lb, + Err(_) => unreachable!("with_outlier(.., None) cannot wire a registry"), + } } /// Create a load balancer, optionally enabling outlier detection. /// When `outlier` is `Some`, the registry's housekeeping actor is /// spawned and its lifetime is bound to the load balancer. + /// Returns [`RegistryAlreadyWired`] if the provided registry has + /// already been wired to another load balancer — a registry's + /// eject-signal receiver is one-shot. 
pub(crate) fn with_outlier( discovery: D, connector: Arc, picker: Arc, Req> + Send + Sync>, outlier: Option>, - ) -> Self { - Self { + ) -> Result { + let outlier = outlier.map(OutlierDetector::new).transpose()?; + Ok(Self { discovery, connector, connecting: KeyedFutures::new(), ready: IndexMap::new(), ejected: KeyedFutures::new(), - outlier: outlier.map(OutlierDetector::new), + outlier, picker, - } + }) } /// Purge all per-endpoint state for `addr`: the connecting @@ -928,7 +939,8 @@ mod tests { Arc::new(P2cPicker); let registry = OutlierStatsRegistry::with_rng(config, Box::new(AlwaysFireRng)); let lb = - LoadBalancer::with_outlier(discover, connector.clone(), picker, Some(registry.clone())); + LoadBalancer::with_outlier(discover, connector.clone(), picker, Some(registry.clone())) + .expect("registry not yet wired"); (lb, connector, registry) } @@ -1187,4 +1199,32 @@ mod tests { ); assert!(!registry.add_channel(addr(8084)).is_ejected()); } + + /// Sharing one `OutlierStatsRegistry` across two `LoadBalancer`s is + /// not supported — the eject-signal receiver is one-shot. The + /// second `with_outlier` call must return an error rather than + /// panic. + #[tokio::test] + async fn test_outlier_registry_cannot_be_wired_twice() { + let (_tx1, discover1) = new_discover(); + let (_tx2, discover2) = new_discover(); + let connector = Arc::new(MockConnector::new()); + let picker: Arc, &'static str> + Send + Sync> = + Arc::new(P2cPicker); + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 5, 3), Box::new(AlwaysFireRng)); + + // First wiring succeeds. + LoadBalancer::with_outlier( + discover1, + connector.clone(), + picker.clone(), + Some(registry.clone()), + ) + .expect("first wire"); + + // Second wiring of the same registry must error, not panic. 
+ let result = + LoadBalancer::with_outlier(discover2, connector, picker, Some(registry.clone())); + assert!(result.is_err()); + } } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index bd19fc6bb..d3066b600 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -49,6 +49,14 @@ use crate::client::loadbalance::channel_state::OutlierChannelState; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::OutlierDetectionConfig; +/// Construction-time error returned when a single +/// [`OutlierStatsRegistry`] is wired to more than one load balancer. +/// The registry's eject-signal receiver is one-shot; reuse is not +/// supported. +#[derive(Debug, thiserror::Error)] +#[error("OutlierStatsRegistry is already wired to a LoadBalancer")] +pub(crate) struct RegistryAlreadyWired; + /// Probability source for `enforcing_*` rolls. pub(crate) trait Rng: Send + Sync + 'static { /// Return a uniform random `u32` in `0..100`. @@ -117,13 +125,17 @@ impl OutlierStatsRegistry { } /// Take the eject-signal receiver. Called exactly once by - /// [`OutlierDetector::new`]. - fn take_eject_rx(&self) -> mpsc::UnboundedReceiver { + /// [`OutlierDetector::new`]. Returns + /// [`RegistryAlreadyWired`] if a previous call has already taken + /// the receiver — a registry can drive at most one load balancer. + fn take_eject_rx( + &self, + ) -> Result, RegistryAlreadyWired> { self.eject_rx .lock() .expect("eject_rx mutex poisoned") .take() - .expect("OutlierStatsRegistry::take_eject_rx called more than once") + .ok_or(RegistryAlreadyWired) } /// Register a channel and return the `Arc` @@ -338,15 +350,18 @@ pub(crate) struct OutlierDetector { impl OutlierDetector { /// Build from a registry, spawning the housekeeping actor and - /// taking ownership of the eject-signal receiver. 
- pub(crate) fn new(registry: Arc) -> Self { - let eject_rx = registry.take_eject_rx(); + /// taking ownership of the eject-signal receiver. Returns + /// [`RegistryAlreadyWired`] if the registry's receiver has + /// already been taken (i.e. this registry is already driving + /// another load balancer); a registry can drive at most one LB. + pub(crate) fn new(registry: Arc) -> Result { + let eject_rx = registry.take_eject_rx()?; let _actor = spawn_actor(registry.clone()); - Self { + Ok(Self { registry, eject_rx, _actor, - } + }) } /// Shared registry handle. @@ -591,7 +606,7 @@ mod tests { #[test] fn ejection_dispatches_address_through_mpsc() { let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); - let mut rx = registry.take_eject_rx(); + let mut rx = registry.take_eject_rx().expect("receiver available"); let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); From 48ae8985461226a32c9edbe3e2651f7837e49c0e Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 11:14:23 -0700 Subject: [PATCH 29/39] refactor(tonic-xds): give OutlierChannelState its own address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Store the endpoint address directly on `OutlierChannelState` at construction time and expose `state.addr()` so downstream callers don't need to thread `(addr, state)` pairs alongside the state. API impact: - `OutlierChannelState::new(addr)` now takes the address explicitly. - `OutlierStatsRegistry::record_outcome(state, success)` drops its `addr` parameter; the mpsc dispatch reads `state.addr()`. - `ReadyChannel::addr()` (added earlier this PR only for the `record_outcome` thread-through) is removed — no remaining caller. The data path now passes just the `Arc` to `record_outcome`, which is cleaner and removes the awkwardness of two parameters that always travel together. 
--- .../src/client/loadbalance/channel_state.rs | 44 +++++++------ .../src/client/loadbalance/loadbalancer.rs | 7 +- .../client/loadbalance/outlier_detection.rs | 65 +++++++++---------- 3 files changed, 59 insertions(+), 57 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 3e6354e8d..29db913c8 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -76,15 +76,23 @@ impl EndpointCounters { /// load balancer (consults `is_ejected` / `ejected_duration` on /// reconnect). /// -/// All fields are atomics so the data path can mutate them without -/// locking. Ejection state is encoded in [`Self::ejected_at_nanos`]: -/// zero means not ejected, non-zero is the nanos-since-epoch of the -/// ejection's start. [`Self::try_eject`] / [`Self::try_uneject`] use -/// CAS to flip the field atomically and report whether the transition -/// fired (so callers can update registry-level counters exactly once -/// per transition). +/// All mutable fields are atomics so the data path can mutate them +/// without locking. Ejection state is encoded in +/// [`Self::ejected_at_nanos`]: zero means not ejected, non-zero is the +/// nanos-since-epoch of the ejection's start. [`Self::try_eject`] / +/// [`Self::try_uneject`] use CAS to flip the field atomically and +/// report whether the transition fired (so callers can update +/// registry-level counters exactly once per transition). +/// +/// The `addr` field is set at construction and never changes, so +/// downstream callers (the registry's eject-mpsc dispatch in +/// particular) can recover the address from the state alone — no +/// need to thread `(addr, state)` pairs through the data path. #[derive(Debug)] pub(crate) struct OutlierChannelState { + /// Endpoint address this state belongs to. Immutable for the + /// lifetime of the state object. 
+ addr: EndpointAddress, counters: EndpointCounters, /// Whether this channel currently contributes to the registry's /// `qualifying_count`. Set when `total` first reaches @@ -103,15 +111,10 @@ pub(crate) struct OutlierChannelState { epoch: Instant, } -impl Default for OutlierChannelState { - fn default() -> Self { - Self::new() - } -} - impl OutlierChannelState { - pub(crate) fn new() -> Self { + pub(crate) fn new(addr: EndpointAddress) -> Self { Self { + addr, counters: EndpointCounters::default(), is_qualifying: AtomicBool::new(false), ejection_multiplier: AtomicU32::new(0), @@ -120,6 +123,11 @@ impl OutlierChannelState { } } + /// Endpoint address this state belongs to. + pub(crate) fn addr(&self) -> &EndpointAddress { + &self.addr + } + pub(crate) fn record_success(&self) { self.counters.record_success(); } @@ -342,11 +350,6 @@ impl ReadyChannel { &self.outlier } - /// Endpoint address this channel was created for. - pub(crate) fn addr(&self) -> &EndpointAddress { - &self.addr - } - /// Eject this channel (e.g., due to outlier detection). Consumes /// self. 
The outlier state remains in the registry; only the /// service and address are passed into [`EjectedChannel`] (which @@ -515,7 +518,8 @@ mod tests { } fn wrap_ready(addr: EndpointAddress, svc: MockService) -> ReadyChannel { - ReadyChannel::new(addr, svc, Arc::new(OutlierChannelState::new())) + let state = Arc::new(OutlierChannelState::new(addr.clone())); + ReadyChannel::new(addr, svc, state) } #[tokio::test] diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 63c4fedaf..f37218623 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -215,7 +215,7 @@ where while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { let state = match self.outlier.as_ref() { Some(o) => o.registry().add_channel(addr.clone()), - None => Arc::new(OutlierChannelState::new()), + None => Arc::new(OutlierChannelState::new(addr.clone())), }; let ready = ReadyChannel::new(addr.clone(), svc, state.clone()); let remaining = self @@ -318,7 +318,7 @@ where while let Poll::Ready(Some((addr, unejected))) = self.ejected.poll_next(cx) { let state = match self.outlier.as_ref() { Some(o) => o.registry().add_channel(addr.clone()), - None => Arc::new(OutlierChannelState::new()), + None => Arc::new(OutlierChannelState::new(addr.clone())), }; if let Some(o) = self.outlier.as_ref() { o.registry().note_uneject(&state); @@ -399,7 +399,6 @@ where // an owned service and outlier handle for the async block; both // are `Arc`-shared, so cloning is cheap. let mut svc = picked.clone(); - let addr = picked.addr().clone(); let outlier_state = picked.outlier().clone(); let registry = self.outlier.as_ref().map(|o| o.registry().clone()); LbFuture::Pending(Box::pin(async move { @@ -412,7 +411,7 @@ where // counter and (inside `record_outcome`) possibly // dispatch an eject request to the LB. Treat any // `Err` outcome as a failure for outlier purposes. 
- registry.record_outcome(&addr, &outlier_state, result.is_ok()); + registry.record_outcome(&outlier_state, result.is_ok()); } result.map_err(|e| LbError::LbChannelCallError(e.into())) })) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index d3066b600..6e1bed53b 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -145,8 +145,8 @@ impl OutlierStatsRegistry { /// state continuity across reconnect cycles is preserved. pub(crate) fn add_channel(&self, addr: EndpointAddress) -> Arc { self.channels - .entry(addr) - .or_insert_with(|| Arc::new(OutlierChannelState::new())) + .entry(addr.clone()) + .or_insert_with(|| Arc::new(OutlierChannelState::new(addr))) .clone() } @@ -175,12 +175,7 @@ impl OutlierStatsRegistry { /// threshold; if all gates pass and the channel was not already /// ejected, marks it ejected and sends the address through the /// eject mpsc for the LB to consume. - pub(crate) fn record_outcome( - &self, - addr: &EndpointAddress, - state: &OutlierChannelState, - success: bool, - ) { + pub(crate) fn record_outcome(&self, state: &OutlierChannelState, success: bool) { if success { state.record_success(); } else { @@ -230,7 +225,7 @@ impl OutlierStatsRegistry { // `ReadyChannel` via `ReadyChannel::eject`. If the LB has // dropped its receiver (shutdown), the send fails silently // — the channel will be cleaned up by `forget`. - let _ = self.eject_tx.send(addr.clone()); + let _ = self.eject_tx.send(state.addr().clone()); } } @@ -454,16 +449,15 @@ mod tests { /// Drive `n` outcomes through `record_outcome` for one channel. 
fn drive( registry: &OutlierStatsRegistry, - a: &EndpointAddress, state: &OutlierChannelState, successes: u64, failures: u64, ) { for _ in 0..successes { - registry.record_outcome(a, state, true); + registry.record_outcome(state, true); } for _ in 0..failures { - registry.record_outcome(a, state, false); + registry.record_outcome(state, false); } } @@ -475,9 +469,9 @@ mod tests { let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(®istry, &addr(port), &s, 100, 0); + drive(®istry, &s, 100, 0); } - drive(®istry, &addr(8084), &bad, 10, 90); + drive(®istry, &bad, 10, 90); assert!(bad.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); } @@ -489,7 +483,7 @@ mod tests { for port in 8080..=8084 { let s = registry.add_channel(addr(port)); // 30% failure → below 50% threshold. - drive(®istry, &addr(port), &s, 70, 30); + drive(®istry, &s, 70, 30); all.push(s); } for s in &all { @@ -504,7 +498,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); - drive(®istry, &addr(port), &s, 50, 50); + drive(®istry, &s, 50, 50); all.push(s); } for s in &all { @@ -519,7 +513,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8081 { let s = registry.add_channel(addr(port)); - drive(®istry, &addr(port), &s, 0, 100); + drive(®istry, &s, 0, 100); all.push(s); } for s in &all { @@ -531,10 +525,10 @@ mod tests { fn request_volume_filters_low_traffic() { let registry = OutlierStatsRegistry::with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); let bad = registry.add_channel(addr(8080)); - drive(®istry, &addr(8080), &bad, 0, 5); + drive(®istry, &bad, 0, 5); for port in 8081..=8084 { let s = registry.add_channel(addr(port)); - drive(®istry, &addr(port), &s, 200, 0); + drive(®istry, &s, 200, 0); } assert!(!bad.is_ejected()); } @@ -551,7 +545,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); - 
drive(®istry, &addr(port), &s, 0, 100); + drive(®istry, &s, 0, 100); all.push(s); } for s in &all { @@ -567,16 +561,15 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { - let a = addr(port); - let s = registry.add_channel(a.clone()); - all.push((a, s)); + let s = registry.add_channel(addr(port)); + all.push(s); } // Drive all hosts to bad state in parallel pseudo-order. - for (a, s) in &all { - drive(®istry, a, s, 0, 100); + for s in &all { + drive(®istry, s, 0, 100); } - let ejected = all.iter().filter(|(_, s)| s.is_ejected()).count(); + let ejected = all.iter().filter(|s| s.is_ejected()).count(); // 5 hosts × 20% = 1 max ejection. assert_eq!(ejected, 1); } @@ -587,11 +580,11 @@ mod tests { let mut all = vec![]; for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(®istry, &addr(port), &s, 100, 0); + drive(®istry, &s, 100, 0); all.push(s); } let bad = registry.add_channel(addr(8084)); - drive(®istry, &addr(8084), &bad, 0, 100); + drive(®istry, &bad, 0, 100); assert!(bad.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); // Each healthy host crossed request_volume; bad too. So @@ -610,9 +603,9 @@ mod tests { let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(®istry, &addr(port), &s, 100, 0); + drive(®istry, &s, 100, 0); } - drive(®istry, &addr(8084), &bad, 10, 90); + drive(®istry, &bad, 10, 90); // Eject dispatched exactly once via the mpsc. 
assert_eq!(rx.try_recv(), Ok(addr(8084))); @@ -629,7 +622,7 @@ mod tests { let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(®istry, &addr(port), &s, 100, 0); + drive(®istry, &s, 100, 0); } assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); @@ -821,7 +814,7 @@ mod tests { #[test] fn channel_state_records_and_resets() { - let s = OutlierChannelState::new(); + let s = OutlierChannelState::new(addr(8080)); s.record_success(); s.record_success(); s.record_failure(); @@ -831,7 +824,7 @@ mod tests { #[test] fn channel_state_try_eject_uneject_transitions_atomically() { - let s = OutlierChannelState::new(); + let s = OutlierChannelState::new(addr(8080)); assert!(!s.is_ejected()); assert!(s.try_eject(Instant::now())); assert!(s.is_ejected()); @@ -841,4 +834,10 @@ mod tests { assert!(!s.is_ejected()); assert!(!s.try_uneject()); } + + #[test] + fn channel_state_remembers_its_address() { + let s = OutlierChannelState::new(addr(9090)); + assert_eq!(s.addr(), &addr(9090)); + } } From 7c903eb9b5936aaab1623d4e6d735ac200dab4bc Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 11:36:23 -0700 Subject: [PATCH 30/39] docs(tonic-xds): trim outlier-detection doc comments Tighten doc comments across the outlier-detection module, the channel state machine, and the LB. Remove rationale-style narrative and references to past designs; keep API contracts, gRFC references, and non-obvious invariants. No code changes. 
--- .../src/client/loadbalance/channel_state.rs | 140 ++++-------- .../src/client/loadbalance/keyed_futures.rs | 4 +- .../src/client/loadbalance/loadbalancer.rs | 170 +++++---------- .../client/loadbalance/outlier_detection.rs | 205 ++++++------------ .../src/xds/resource/outlier_detection.rs | 32 +-- 5 files changed, 183 insertions(+), 368 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 29db913c8..a4a57d4bb 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -60,9 +60,8 @@ impl EndpointCounters { } /// Read and zero both counters. The two swaps are not atomic against - /// each other — RPCs landing between them may bias the snapshot by - /// a small number of events, well below the precision of the - /// failure-percentage threshold. + /// each other; bias from in-flight RPCs is bounded and well below + /// the precision of the failure-percentage threshold. pub(crate) fn snapshot_and_reset(&self) -> (u64, u64) { let s = self.success.swap(0, Ordering::Relaxed); let f = self.failure.swap(0, Ordering::Relaxed); @@ -70,44 +69,29 @@ impl EndpointCounters { } } -/// Per-channel outlier-detection state, shared (via `Arc`) between -/// the data path (per-RPC outcome recording + threshold-based ejection), -/// the outlier-detection actor (interval-based housekeeping), and the -/// load balancer (consults `is_ejected` / `ejected_duration` on -/// reconnect). +/// Per-channel outlier-detection state, shared via `Arc` between the +/// data path (per-RPC outcome recording + threshold-based ejection), +/// the housekeeping actor, and the load balancer. /// -/// All mutable fields are atomics so the data path can mutate them -/// without locking. Ejection state is encoded in -/// [`Self::ejected_at_nanos`]: zero means not ejected, non-zero is the -/// nanos-since-epoch of the ejection's start. 
[`Self::try_eject`] / -/// [`Self::try_uneject`] use CAS to flip the field atomically and -/// report whether the transition fired (so callers can update -/// registry-level counters exactly once per transition). -/// -/// The `addr` field is set at construction and never changes, so -/// downstream callers (the registry's eject-mpsc dispatch in -/// particular) can recover the address from the state alone — no -/// need to thread `(addr, state)` pairs through the data path. +/// Ejection state is encoded in [`Self::ejected_at_nanos`]: zero means +/// not ejected, non-zero is the nanos-since-epoch of the ejection's +/// start. [`Self::try_eject`] / [`Self::try_uneject`] use CAS so callers +/// can update registry-level counters exactly once per transition. #[derive(Debug)] pub(crate) struct OutlierChannelState { - /// Endpoint address this state belongs to. Immutable for the - /// lifetime of the state object. addr: EndpointAddress, counters: EndpointCounters, - /// Whether this channel currently contributes to the registry's - /// `qualifying_count`. Set when `total` first reaches - /// `request_volume` in the current interval; cleared on counter - /// reset. + /// `true` while this channel is counted in the registry's + /// `qualifying_count` (i.e. has hit `request_volume` in the + /// current interval). is_qualifying: AtomicBool, - /// Number of times this channel has been ejected. Bumped on each - /// ejection; decremented (saturating) on each healthy interval. + /// Bumped on each ejection; decremented (saturating) on each + /// healthy interval. ejection_multiplier: AtomicU32, - /// `0` when not ejected. Otherwise nanos since [`Self::epoch`] of - /// the current ejection's start. Single source of truth for - /// "is this channel ejected right now?". + /// `0` when not ejected; otherwise nanos since [`Self::epoch`] of + /// the current ejection's start. ejected_at_nanos: AtomicU64, - /// Reference instant used as the origin for `ejected_at_nanos`. 
- /// Established at construction and never changes. + /// Origin for `ejected_at_nanos`. Set at construction. epoch: Instant, } @@ -136,10 +120,8 @@ impl OutlierChannelState { self.counters.record_failure(); } - /// Read the current counter values without resetting. Returns - /// `(success, failure)`. The two reads are not atomic against - /// each other but the difference is bounded by concurrent in-flight - /// RPCs and is below the precision of the failure-percentage check. + /// Returns `(success, failure)` without resetting. The two reads + /// are not atomic together; bias is bounded by in-flight RPCs. pub(crate) fn counters(&self) -> (u64, u64) { let s = self.counters.success.load(Ordering::Relaxed); let f = self.counters.failure.load(Ordering::Relaxed); @@ -151,9 +133,9 @@ impl OutlierChannelState { self.counters.snapshot_and_reset() } - /// Try to set `is_qualifying` to `true`. Returns `true` if this - /// call performed the false → true transition, so callers can - /// increment a registry-level counter exactly once per crossing. + /// Set `is_qualifying` to `true`. Returns `true` if this call + /// performed the false → true transition (so the caller can bump + /// the registry counter exactly once per crossing). pub(crate) fn mark_qualifying(&self) -> bool { !self.is_qualifying.swap(true, Ordering::AcqRel) } @@ -164,10 +146,8 @@ impl OutlierChannelState { } /// Atomically mark this channel as ejected starting at `now`. - /// Returns `true` if this call performed the not-ejected → - /// ejected transition (so callers can update registry-level - /// counters exactly once per ejection). Bumps the multiplier on - /// transition. + /// Returns `true` on the not-ejected → ejected transition and + /// bumps the multiplier; `false` if already ejected. pub(crate) fn try_eject(&self, now: Instant) -> bool { let nanos = now .saturating_duration_since(self.epoch) @@ -187,8 +167,8 @@ impl OutlierChannelState { true } - /// Atomically clear the ejection. 
Returns `true` if this call - /// performed the ejected → not-ejected transition. + /// Atomically clear the ejection. Returns `true` on the + /// ejected → not-ejected transition. pub(crate) fn try_uneject(&self) -> bool { self.ejected_at_nanos.swap(0, Ordering::AcqRel) != 0 } @@ -214,10 +194,8 @@ impl OutlierChannelState { self.ejection_multiplier.load(Ordering::Relaxed) } - /// Decrement the multiplier saturating at zero. Called by the - /// actor on healthy intervals and by `note_uneject` on un-ejection. - /// Uses `fetch_update` so the load-and-store is atomic against - /// concurrent `try_eject` (`fetch_add`) and other decrements. + /// Decrement the multiplier, saturating at zero. Atomic against + /// concurrent `try_eject` and other decrements. pub(crate) fn decrement_multiplier(&self) { let _ = self .ejection_multiplier @@ -226,8 +204,8 @@ impl OutlierChannelState { }); } - /// Test-only setter for the ejection multiplier; lets tests drive - /// housekeeping behavior without going through `try_eject`. + /// Test-only multiplier setter for driving housekeeping without + /// going through `try_eject`. #[cfg(test)] pub(crate) fn set_ejection_multiplier(&self, value: u32) { self.ejection_multiplier.store(value, Ordering::Relaxed); @@ -245,10 +223,8 @@ pub(crate) struct EjectionConfig { /// Result of an ejection expiring. pub(crate) enum UnejectedChannel { - /// The channel is ready to serve again (ejection expired, no - /// reconnect needed). The consumer wraps the bare service into a - /// [`ReadyChannel`] using the registry-supplied - /// [`OutlierChannelState`]. + /// Connection reused; the caller wraps the service back into a + /// [`ReadyChannel`]. Ready(S), /// A fresh connection has been started. Connecting(ConnectingChannel), @@ -284,14 +260,10 @@ impl IdleChannel { /// A channel that is in the process of connecting. /// -/// Implements [`Future`] -- resolves to the connected service `S` -/// when the connection completes. 
The consumer wraps that into a -/// [`ReadyChannel`] (attaching its [`OutlierChannelState`]). -/// Cancellation is handled externally via [`KeyedFutures::cancel`]. -/// -/// `ConnectingChannel` deliberately does not carry an -/// [`OutlierChannelState`]: it does not serve traffic, so it has -/// nothing to count or signal. +/// `impl Future` — resolves to the connected service when +/// the connection completes. The caller wraps the resolved service +/// into a [`ReadyChannel`]. Cancellation is handled externally via +/// [`KeyedFutures::cancel`]. /// /// [`KeyedFutures::cancel`]: crate::client::loadbalance::keyed_futures::KeyedFutures::cancel pub(crate) struct ConnectingChannel { @@ -299,9 +271,6 @@ pub(crate) struct ConnectingChannel { } impl ConnectingChannel { - /// Start a connection. The address is kept by the caller (it is - /// typically the key in a `KeyedFutures` map); only the future is - /// stored here. pub(crate) fn new(fut: BoxFuture, _addr: EndpointAddress) -> Self { Self { inner: fut } } @@ -322,11 +291,8 @@ impl Future for ConnectingChannel { /// A channel that is connected and ready to serve requests. /// /// Holds the raw service `S` and delegates [`Service`] calls directly, -/// preserving `S::Future` and `S::Error` with no wrapping or type -/// erasure. The `Arc` is shared with the outlier- -/// detection actor for stats accumulation and edge-triggered ejection; -/// because only `ReadyChannel` serves traffic, only `ReadyChannel` -/// carries this state. +/// preserving `S::Future` and `S::Error`. Shares +/// [`OutlierChannelState`] with the outlier-detection actor via `Arc`. #[derive(Clone)] pub(crate) struct ReadyChannel { addr: EndpointAddress, @@ -335,8 +301,6 @@ pub(crate) struct ReadyChannel { } impl ReadyChannel { - /// Wrap a connected service `S` into a [`ReadyChannel`] using the - /// caller-supplied outlier state. 
pub(crate) fn new(addr: EndpointAddress, inner: S, outlier: Arc) -> Self { Self { addr, @@ -350,10 +314,8 @@ impl ReadyChannel { &self.outlier } - /// Eject this channel (e.g., due to outlier detection). Consumes - /// self. The outlier state remains in the registry; only the - /// service and address are passed into [`EjectedChannel`] (which - /// just times the cooldown). + /// Eject this channel. Consumes self; the outlier state remains + /// in the registry. pub(crate) fn eject(self, config: EjectionConfig, connector: Arc) -> EjectedChannel where C: Connector + Send + Sync + 'static, @@ -368,9 +330,8 @@ impl ReadyChannel { } } - /// Start reconnecting. Consumes self, dropping the old connection. - /// The outlier state remains in the registry; the consumer - /// re-attaches it when the new [`ReadyChannel`] is constructed. + /// Drop the connection and start a fresh connect for the same + /// address. The outlier state remains in the registry. pub(crate) fn reconnect>( self, connector: Arc, @@ -412,18 +373,13 @@ impl Load for ReadyChannel { // --------------------------------------------------------------------------- pin_project! { - /// A channel that has been ejected and is cooling down. - /// - /// The underlying connection is kept alive but cannot serve - /// requests. Implements [`Future`] -- resolves once the ejection - /// timer expires to either: - /// - [`UnejectedChannel::Ready`] if no reconnect is needed - /// - [`UnejectedChannel::Connecting`] if a fresh connection is required + /// A channel that has been ejected and is cooling down. The + /// underlying connection is kept alive but cannot serve requests. /// - /// `EjectedChannel` deliberately does not carry an - /// [`OutlierChannelState`]: the state lives in the registry, keyed - /// by address, and the consumer re-attaches it when the channel - /// transitions back to [`ReadyChannel`]. 
+ /// `impl Future<Output = UnejectedChannel<S>>` — resolves when + /// `config.timeout` elapses, to [`UnejectedChannel::Ready`] if + /// `needs_reconnect` is false, otherwise + /// [`UnejectedChannel::Connecting`]. pub(crate) struct EjectedChannel { addr: EndpointAddress, inner: S, diff --git a/tonic-xds/src/client/loadbalance/keyed_futures.rs b/tonic-xds/src/client/loadbalance/keyed_futures.rs index c7f48aeaf..701ff865f 100644 --- a/tonic-xds/src/client/loadbalance/keyed_futures.rs +++ b/tonic-xds/src/client/loadbalance/keyed_futures.rs @@ -89,9 +89,7 @@ where self.futures.len() } - /// Returns true if a future is currently tracked for `key`. - /// Cancelled-but-not-yet-drained futures still count, since their - /// cancellation token entry is removed eagerly by [`Self::cancel`]. + /// True if a live (non-cancelled) future is tracked for `key`. pub(crate) fn contains_key(&self, key: &K) -> bool { self.cancellations.contains_key(key) } diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index f37218623..75c6dffb0 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -1,20 +1,16 @@ //! Load balancer tower service. //! -//! Receives endpoint updates via [`tower::discover::Discover`] (yielding -//! [`IdleChannel`]s), manages the connection lifecycle via the channel state -//! machine, and routes requests to ready endpoints via a [`ChannelPicker`]. +//! Receives endpoint updates via [`tower::discover::Discover`], +//! manages the connection lifecycle via the channel state machine, +//! and routes requests to ready endpoints via a [`ChannelPicker`]. //! -//! Outlier detection is integrated via an optional [`OutlierDetector`]. -//! Ejection decisions originate on the data path (per-RPC) and are -//! signaled to the LB via an mpsc channel. The LB consumes the named -//! [`ReadyChannel`] via [`ReadyChannel::eject`], obtaining an -//!
[`EjectedChannel`] whose internal sleep fires exactly at -//! `base × multiplier` (capped by `max_ejection_time`); ejected -//! channels live in a second [`KeyedFutures`] (mirroring the existing -//! pattern for `ConnectingChannel`) until their timer yields -//! [`UnejectedChannel`], at which point the channel is routed back -//! into `ready` (`UnejectedChannel::Ready`) or `connecting` -//! (`UnejectedChannel::Connecting`). +//! Outlier detection (gRFC A50) is integrated via an optional +//! [`OutlierDetector`]. Eject requests arrive on an mpsc channel from +//! the data path; the LB consumes the matching [`ReadyChannel`] via +//! [`ReadyChannel::eject`] and tracks the resulting +//! [`EjectedChannel`] in `LoadBalancer::ejected`. When the timer fires, the +//! resolved [`UnejectedChannel`] is routed back into `ready` or +//! `connecting`. //! //! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel //! [`UnejectedChannel`]: crate::client::loadbalance::channel_state::UnejectedChannel @@ -40,10 +36,8 @@ use crate::client::loadbalance::outlier_detection::{ }; use crate::client::loadbalance::pickers::ChannelPicker; -/// Future returned by [`LoadBalancer::call`]. -/// -/// Either resolves immediately with an [`LbError`], or drives `poll_ready` + -/// `call` on the selected channel asynchronously. +/// Future returned by [`LoadBalancer::call`]. Either resolves +/// immediately with an [`LbError`] or drives the selected channel. pub(crate) enum LbFuture { Error(Option), Pending(Pin> + Send>>), @@ -74,28 +68,18 @@ impl Future for LbFuture { /// `C::Service` is the underlying service type held in ready channels. /// - `Req`: The request type. pub(crate) struct LoadBalancer { - /// Discovery stream providing endpoint additions/removals. discovery: D, - /// Connector for creating connections from idle channels. connector: Arc, - /// In-flight connection attempts, keyed by endpoint address.
- /// `ConnectingChannel` resolves to the bare service; the LB wraps - /// it into a `ReadyChannel` with an outlier state when it - /// transitions to ready. + /// In-flight connection attempts. connecting: KeyedFutures, - /// Ready-to-serve channels, keyed by endpoint address. + /// Ready-to-serve channels. ready: IndexMap>, - /// Channels currently ejected by outlier detection. Each entry is - /// an [`EjectedChannel`] whose `Sleep` fires when the ejection - /// window expires; the resolved [`UnejectedChannel`] is drained in - /// `poll_ready` and routed back into `ready` (or `connecting` if - /// the underlying connection needs replacing). + /// Currently-ejected channels. Each entry is an + /// [`EjectedChannel`] whose `Sleep` fires when the ejection + /// window expires. ejected: KeyedFutures>, - /// Outlier-detection plumbing: shared registry, eject-signal - /// receiver, and the housekeeping actor handle. `None` disables - /// outlier detection. + /// `None` disables outlier detection. outlier: Option, - /// Channel picker for load balancing. picker: Arc, Req> + Send + Sync>, } @@ -112,20 +96,16 @@ where connector: Arc, picker: Arc, Req> + Send + Sync>, ) -> Self { - // Infallible: `with_outlier(_, _, _, None)` never touches the - // outlier-detection construction path. - match Self::with_outlier(discovery, connector, picker, None) { - Ok(lb) => lb, - Err(_) => unreachable!("with_outlier(.., None) cannot wire a registry"), - } + // Infallible: `with_outlier(.., None)` never wires a registry. + Self::with_outlier(discovery, connector, picker, None) + .expect("with_outlier(.., None) is infallible") } /// Create a load balancer, optionally enabling outlier detection. /// When `outlier` is `Some`, the registry's housekeeping actor is - /// spawned and its lifetime is bound to the load balancer. 
- /// Returns [`RegistryAlreadyWired`] if the provided registry has - /// already been wired to another load balancer — a registry's - /// eject-signal receiver is one-shot. + /// spawned and bound to this LB. Returns + /// [`RegistryAlreadyWired`] if the registry already drives + /// another LB. pub(crate) fn with_outlier( discovery: D, connector: Arc, @@ -144,10 +124,8 @@ where }) } - /// Purge all per-endpoint state for `addr`: the connecting - /// future, the ready slot, the ejected channel (if any), and the - /// outlier-detection registry entry. Used when discovery says the - /// endpoint is gone from the cluster. + /// Purge all state for `addr`, including the outlier-detection + /// registry entry. Called on `Change::Remove`. fn purge_endpoint(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); @@ -157,33 +135,23 @@ where } } - /// Clear stale slots that held the old service (in-flight - /// connecting future, ready entry, ejected channel) but - /// **preserve** the outlier-detection registry entry — counters, - /// ejection multiplier, and ejection flag carry across the - /// reconnect. Used when discovery re-inserts an endpoint we - /// already track. - /// - /// This matches grpc-go and Envoy: outlier state is keyed by - /// stable endpoint identity and survives a transient discovery - /// flap, so a brief disappearance does not wipe what we already - /// know about the endpoint's health. + /// Clear stale connecting/ready/ejected slots for `addr` but + /// preserve the outlier-detection registry entry. Called on + /// `Change::Insert` so transient discovery flaps don't lose + /// counters or ejection state, matching grpc-go and Envoy. fn reset_active_slots(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); let _ = self.ejected.cancel(addr); } - /// Drain pending discovery events. 
Either resolves to an error - /// ([`LbError::DiscoverClosed`] or [`LbError::DiscoverError`]) or stays - /// pending — there is no success outcome since the loop only exits on - /// pending or error. + /// Drain pending discovery events. Resolves to an error + /// ([`LbError::DiscoverClosed`] or [`LbError::DiscoverError`]) + /// or stays pending — there is no success outcome. fn poll_discover(&mut self, cx: &mut Context<'_>) -> Poll { loop { match ready!(Pin::new(&mut self.discovery).poll_discover(cx)) { None => { - // tower::discover::Discover::poll_discover() returns Ready(None) when the - // discover object is closed, as indicated by Stream trait. tracing::error!("discover object is closed"); return Poll::Ready(LbError::DiscoverClosed); } @@ -202,15 +170,10 @@ where } } - /// Drain completed connection futures. Wraps each bare service - /// into a `ReadyChannel` using the outlier state from the - /// registry (or a fresh state if outlier detection is disabled). - /// - /// If the preserved outlier state for a re-discovered endpoint - /// says it is still ejected, the new channel is re-ejected with - /// the *remaining* ejection time so the ongoing backoff is - /// honored. If the deadline has already passed, the channel is - /// un-ejected immediately and routed to `ready`. + /// Drain completed connection futures. If the outlier state for + /// a re-discovered endpoint is still ejected, the new channel is + /// re-ejected for the *remaining* duration; if the deadline has + /// already passed, it is un-ejected and routed to `ready`. fn poll_connecting(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { let state = match self.outlier.as_ref() { @@ -226,10 +189,9 @@ where } } - /// Route a freshly-connected `ReadyChannel` into the right pool - /// based on the preserved outlier state's `remaining` ejection - /// duration. 
Factored out so `poll_connecting` stays terse and - /// the three cases (fresh, mid-eject, past-deadline) are visible. + /// Route a freshly-connected `ReadyChannel` based on its + /// preserved outlier state: `None` → ready; `Some(0)` → un-eject + /// then ready; `Some(d)` → ejected for `d`. fn place_after_connect( &mut self, addr: EndpointAddress, @@ -261,10 +223,9 @@ where } /// Drain eject requests from the outlier detector's mpsc and - /// transition the named `ReadyChannel`s into ejected ones. The - /// per-channel ejection state has already been flipped by - /// `record_outcome`; this step is the visible transition on the - /// LB side. + /// move each named `ReadyChannel` into [`Self::ejected`]. The + /// per-channel ejection flag has already been set by + /// `record_outcome`. fn poll_eject_requests(&mut self, cx: &mut Context<'_>) { loop { let Some(o) = self.outlier.as_mut() else { @@ -275,10 +236,8 @@ where _ => return, }; let registry = o.registry().clone(); - // The eject signal arrives once `try_eject` has flipped - // the channel's state and the cluster-wide - // `ejected_count`. If the channel is no longer in `ready` - // (e.g. discovery removed it), there's nothing to do. + // Channel may have been removed by discovery in the + // meantime; if so, nothing to eject. let Some(ch) = self.ready.swap_remove(&addr) else { continue; }; @@ -296,24 +255,21 @@ where let _ = self.ejected.add(addr, ejected); } Some(_) => { - // Deadline already past — un-eject immediately. + // Deadline already past — un-eject. registry.note_uneject(&state); self.ready.insert(addr, ch); } None => { - // State is no longer ejected (concurrent uneject?) — restore. + // No longer ejected (raced with un-eject). self.ready.insert(addr, ch); } } } } - /// Drain completed `EjectedChannel` timers. 
Each yields either an - /// `UnejectedChannel::Ready(svc)` (timer expired, reuse the - /// connection) or `UnejectedChannel::Connecting(future)` (timer - /// expired but a fresh connect was requested). The address's - /// outlier state is cleared and the channel is routed back into - /// `ready` or `connecting` accordingly. + /// Drain completed `EjectedChannel` timers. Clears the + /// outlier state and routes the resolved channel back into + /// `ready` or `connecting`. fn poll_unejection(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, unejected))) = self.ejected.poll_next(cx) { let state = match self.outlier.as_ref() { @@ -329,10 +285,9 @@ where let ready = ReadyChannel::new(addr.clone(), svc, state); self.ready.insert(addr, ready); } + // `needs_reconnect = false` for A50; this arm is + // reserved for future policies. UnejectedChannel::Connecting(future) => { - // `needs_reconnect = false` for A50, so this arm - // is unused today; handle it for completeness in - // case a future policy sets it. let _ = self.connecting.add(addr, future); } } @@ -357,10 +312,8 @@ where fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { let discover_result = self.poll_discover(cx); - // Drain un-ejection completions BEFORE servicing eject requests - // so a freshly un-ejected channel can immediately serve traffic - // (and so cluster-wide `ejected_count` is current when the next - // eject is evaluated). + // Un-ejections before ejections so `ejected_count` is current + // when the next eject is evaluated. self.poll_unejection(cx); self.poll_connecting(cx); self.poll_eject_requests(cx); @@ -369,15 +322,13 @@ where return Poll::Ready(Ok(())); } - // No ready endpoints. Check if we should fail fast. + // No ready endpoints. Fail fast iff discovery is closed and + // `connecting` is empty (ejected channels are not considered).
match discover_result { Poll::Ready(LbError::DiscoverClosed) if self.connecting.len() == 0 => { - // Discovery is closed and nothing is connecting — no progress is possible. Poll::Ready(Err(LbError::Stagnation)) } Poll::Ready(e) => { - // Other discovery errors (or DiscoverClosed with connecting in flight) - // are non-fatal — log and stay pending. tracing::warn!("discovery yielded error: {e}"); Poll::Pending } @@ -395,9 +346,8 @@ where let Some(picked) = self.picker.pick(&req, &self.ready) else { return LbFuture::Error(Some(LbError::Unavailable)); }; - // `picked` is a read-only borrow into `self.ready`. Clone to get - // an owned service and outlier handle for the async block; both - // are `Arc`-shared, so cloning is cheap. + // Cheap clones (all Arc-shared internals) so the async block + // can take ownership without holding the picker borrow. let mut svc = picked.clone(); let outlier_state = picked.outlier().clone(); let registry = self.outlier.as_ref().map(|o| o.registry().clone()); @@ -407,10 +357,6 @@ where .map_err(|e| LbError::LbChannelPollReadyError(e.into()))?; let result = svc.call(req).await; if let Some(registry) = registry.as_ref() { - // Per-RPC outlier detection: bump the channel's - // counter and (inside `record_outcome`) possibly - // dispatch an eject request to the LB. Treat any - // `Err` outcome as a failure for outlier purposes. registry.record_outcome(&outlier_state, result.is_ok()); } result.map_err(|e| LbError::LbChannelCallError(e.into())) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 6e1bed53b..df3e78796 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,39 +1,31 @@ -//! gRFC A50 outlier detection. +//! [gRFC A50] outlier detection. //! -//! The algorithm is split between the data path, the load balancer, -//! and a spawned actor: +//! Work is split across three sites: //! 
-//! - **Per-RPC detection** runs inline on each call completion via -//! [`OutlierStatsRegistry::record_outcome`]. The wrapper records the -//! outcome on the channel's [`OutlierChannelState`], evaluates the -//! failure-percentage threshold, and on transition to ejected sends -//! the address through an mpsc channel for the LB to consume. -//! Cluster-wide gates (`minimum_hosts`, `max_ejection_percent`) are -//! enforced via two atomic counters on the registry, kept in sync -//! as channels cross thresholds. -//! - **The load balancer** drains the eject mpsc in `poll_ready`, +//! - **Data path** ([`OutlierStatsRegistry::record_outcome`]): runs +//! inline per RPC. Updates per-channel counters, applies the +//! failure-percentage gate, and on transition to ejected sends the +//! address through an mpsc channel. +//! - **Load balancer**: drains the eject mpsc in `poll_ready`, //! consumes the matching [`ReadyChannel`] via //! [`ReadyChannel::eject`], and tracks the resulting //! [`EjectedChannel`] in a `KeyedFutures`. Each ejected channel's -//! internal sleep fires at exactly `base × multiplier` (capped by -//! `max_ejection_time`) after ejection, yielding -//! [`UnejectedChannel::Ready`]; the LB drains it on the next -//! `poll_ready` and routes the channel back to the ready set. -//! - **Interval-based housekeeping** runs in a spawned actor (see -//! [`spawn_actor`]). It resets per-channel counters at the -//! `config.interval` boundary and decrements multipliers for -//! non-ejected channels. Un-ejection is timer-driven by -//! [`EjectedChannel`] — the actor never un-ejects. +//! sleep fires at `base × multiplier` (capped by +//! `max_ejection_time`); the LB then routes the resolved +//! [`UnejectedChannel`] back into the ready set. +//! - **Housekeeping actor** ([`spawn_actor`]): on each +//! `config.interval` tick, resets counters and decrements +//! multipliers for non-ejected channels. The actor never ejects or +//! un-ejects. //! -//! 
Only the failure-percentage algorithm is dispatched. The -//! success-rate algorithm (cross-endpoint mean/stdev) is left to a -//! follow-up. +//! Only the failure-percentage algorithm is implemented; success-rate +//! (cross-endpoint mean/stdev) is left to a follow-up. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md //! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel //! [`ReadyChannel::eject`]: crate::client::loadbalance::channel_state::ReadyChannel::eject //! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel -//! [`UnejectedChannel::Ready`]: crate::client::loadbalance::channel_state::UnejectedChannel::Ready +//! [`UnejectedChannel`]: crate::client::loadbalance::channel_state::UnejectedChannel use std::sync::Arc; use std::sync::Mutex; @@ -49,10 +41,8 @@ use crate::client::loadbalance::channel_state::OutlierChannelState; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::OutlierDetectionConfig; -/// Construction-time error returned when a single -/// [`OutlierStatsRegistry`] is wired to more than one load balancer. -/// The registry's eject-signal receiver is one-shot; reuse is not -/// supported. +/// Returned when an [`OutlierStatsRegistry`] is handed to a second +/// load balancer. The eject-signal receiver is one-shot. #[derive(Debug, thiserror::Error)] #[error("OutlierStatsRegistry is already wired to a LoadBalancer")] pub(crate) struct RegistryAlreadyWired; @@ -73,34 +63,23 @@ impl Rng for FastRandRng { } /// Shared outlier-detection state, owned by `Arc` and accessed -/// concurrently by: -/// - The load balancer's call wrapper, which calls -/// [`Self::record_outcome`] after each RPC completion. -/// - The spawned actor task, which calls [`Self::run_housekeeping`] -/// on every `config.interval` tick. 
-/// - The load balancer's `poll_ready`, which drains the eject mpsc -/// (via [`OutlierDetector::poll_eject_request`]) and calls -/// [`Self::note_uneject`] when an `EjectedChannel`'s timer fires. +/// concurrently by the data path ([`Self::record_outcome`]), the +/// housekeeping actor ([`Self::run_housekeeping`]), and the load +/// balancer ([`Self::note_uneject`], [`Self::remaining_ejection`]). pub(crate) struct OutlierStatsRegistry { - /// Per-endpoint state, keyed by address. Inserted by the LB on - /// channel creation and removed on disconnect. channels: DashMap>, - /// Number of channels currently with `total >= request_volume` in - /// the active interval. Drives the `minimum_hosts` gate. + /// Channels with `total >= request_volume` in the active + /// interval. Drives the `minimum_hosts` gate. qualifying_count: AtomicU64, - /// Number of channels currently ejected. Drives the + /// Channels currently ejected. Drives the /// `max_ejection_percent` cap. ejected_count: AtomicU64, config: OutlierDetectionConfig, rng: Box, - /// Sender half of the eject signal. `record_outcome` pushes an - /// address through on transition to ejected; the LB's - /// [`OutlierDetector`] drains the receiver in `poll_ready` and - /// consumes the matching `ReadyChannel`. + /// Sender half of the eject signal. The receiver is owned by the + /// LB's [`OutlierDetector`]. eject_tx: mpsc::UnboundedSender, - /// Receiver half, handed to the LB at construction time. Wrapped - /// in a `Mutex>` so [`Self::take_eject_rx`] can move it - /// out exactly once. Outside that hand-off there is no contention. + /// Receiver moved out exactly once by [`Self::take_eject_rx`]. eject_rx: Mutex>>, } @@ -124,10 +103,9 @@ impl OutlierStatsRegistry { }) } - /// Take the eject-signal receiver. Called exactly once by - /// [`OutlierDetector::new`]. Returns - /// [`RegistryAlreadyWired`] if a previous call has already taken - /// the receiver — a registry can drive at most one load balancer. 
+ /// Take the eject-signal receiver. Returns + /// [`RegistryAlreadyWired`] on a second call — a registry can + /// drive at most one load balancer. fn take_eject_rx( &self, ) -> Result, RegistryAlreadyWired> { @@ -138,11 +116,8 @@ impl OutlierStatsRegistry { .ok_or(RegistryAlreadyWired) } - /// Register a channel and return the `Arc` - /// the load balancer wires into the channel; the same `Arc` is - /// retained in the registry so the actor can iterate it. If a - /// state for this address already exists, returns it untouched — - /// state continuity across reconnect cycles is preserved. + /// Get or create the state for `addr`. Idempotent — existing + /// state is preserved across reconnect. pub(crate) fn add_channel(&self, addr: EndpointAddress) -> Arc { self.channels .entry(addr.clone()) @@ -150,9 +125,8 @@ impl OutlierStatsRegistry { .clone() } - /// Forget a channel. Drops the registry's reference; cluster-wide - /// counters are decremented if the channel was qualifying or - /// ejected. + /// Drop the state for `addr`, decrementing cluster-wide counters + /// (`qualifying_count`, `ejected_count`) if it was contributing. pub(crate) fn remove_channel(&self, addr: &EndpointAddress) { if let Some((_, state)) = self.channels.remove(addr) { if state.clear_qualifying() { @@ -169,12 +143,9 @@ impl OutlierStatsRegistry { self.channels.len() } - /// Per-RPC entry point. Called by the load balancer's call wrapper - /// after each RPC completion. Increments the channel's success or - /// failure counter and then evaluates the failure-percentage - /// threshold; if all gates pass and the channel was not already - /// ejected, marks it ejected and sends the address through the - /// eject mpsc for the LB to consume. + /// Per-RPC entry point. Records the outcome and, if all gates + /// pass, transitions the channel to ejected and dispatches the + /// address on the eject mpsc. 
pub(crate) fn record_outcome(&self, state: &OutlierChannelState, success: bool) { if success { state.record_success(); @@ -190,9 +161,8 @@ impl OutlierStatsRegistry { let total = s + f; let request_volume = u64::from(fp.request_volume); - // Track when each channel first qualifies in the current - // interval, so the `minimum_hosts` gate can be checked with a - // single atomic load. + // Bump `qualifying_count` exactly once per channel per + // interval so the `minimum_hosts` gate is a single atomic load. if total >= request_volume && state.mark_qualifying() { self.qualifying_count.fetch_add(1, Ordering::Relaxed); } @@ -221,32 +191,19 @@ impl OutlierStatsRegistry { if state.try_eject(Instant::now()) { self.ejected_count.fetch_add(1, Ordering::Relaxed); - // The LB drains this in `poll_ready` and consumes the - // `ReadyChannel` via `ReadyChannel::eject`. If the LB has - // dropped its receiver (shutdown), the send fails silently - // — the channel will be cleaned up by `forget`. + // Send failure (LB receiver dropped during shutdown) is + // ignored; the registry will be torn down momentarily. let _ = self.eject_tx.send(state.addr().clone()); } } - /// Clear the ejection on `state`, decrement the cluster-wide - /// `ejected_count`, and decrement the channel's ejection - /// multiplier (matching gRFC A50 step 6.b, which decrements - /// multiplier in the same sweep that un-ejects). Returns whether - /// the transition fired (so callers can guard against - /// double-counting). Called by the LB when an `EjectedChannel`'s - /// timer fires and yields `UnejectedChannel::Ready`. + /// Clear the ejection: flip the state, decrement + /// `ejected_count`, and decrement the multiplier (gRFC A50 + /// step 6.b: same sweep that un-ejects also decrements). Returns + /// `true` on the ejected → not-ejected transition. 
pub(crate) fn note_uneject(&self, state: &OutlierChannelState) -> bool { if state.try_uneject() { self.ejected_count.fetch_sub(1, Ordering::Relaxed); - // Per A50, the same sweep that un-ejects also decrements - // the multiplier. Since our un-ejection is timer-driven - // (decoupled from the housekeeping sweep), we apply the - // decrement here to avoid a window where a re-eject would - // see a stale (one-higher) multiplier and back off too - // aggressively. The actor's housekeeping decrement still - // runs at each interval; saturating arithmetic ensures - // the eventual decrement to zero stays correct. state.decrement_multiplier(); true } else { @@ -254,12 +211,10 @@ impl OutlierStatsRegistry { } } - /// Compute how long `state` still has to remain ejected, or - /// `None` if it is not currently ejected. Returns - /// `Some(Duration::ZERO)` if the deadline has already passed - /// (caller should un-eject immediately rather than starting a - /// fresh sleep). Used by the LB on initial ejection and on - /// re-discovery to size the `EjectionConfig::timeout`. + /// Time remaining on `state`'s ejection (capped by + /// `max_ejection_time`). `None` if not ejected; + /// `Some(Duration::ZERO)` if the deadline has passed (caller + /// should un-eject rather than start a fresh sleep). pub(crate) fn remaining_ejection( &self, state: &OutlierChannelState, @@ -280,39 +235,31 @@ impl OutlierStatsRegistry { Some(target.checked_sub(elapsed).unwrap_or_default()) } - /// Interval-boundary housekeeping. Called by the spawned actor on - /// each `config.interval` tick. Resets counters and decrements - /// multipliers for non-ejected channels. Does **not** un-eject — - /// un-ejection is timer-driven by each `EjectedChannel` and - /// handled by the LB when the channel resolves. + /// Interval-boundary housekeeping. Resets counters and + /// decrements multipliers for non-ejected channels. Does not + /// un-eject — that is driven by each `EjectedChannel`'s timer. 
pub(crate) fn run_housekeeping(&self) { for entry in self.channels.iter() { let state = entry.value(); - - // Reset counters; clear `is_qualifying` and adjust the - // registry-level counter in lockstep. state.snapshot_and_reset(); if state.clear_qualifying() { self.qualifying_count.fetch_sub(1, Ordering::Relaxed); } - if !state.is_ejected() { state.decrement_multiplier(); } } } - /// `max_ejection_percent` resolved against the current channel - /// count. Updated as channels come and go. + /// Resolve `max_ejection_percent` against the current channel count. fn max_ejections(&self) -> u64 { self.channels.len() as u64 * u64::from(self.config.max_ejection_percent.get()) / 100 } } -/// Spawn the housekeeping actor. The task ticks every -/// `config.interval` and calls -/// [`OutlierStatsRegistry::run_housekeeping`]. Dropping the returned -/// [`AbortOnDrop`] stops the task. +/// Spawn the housekeeping actor. Ticks every `config.interval` and +/// calls [`OutlierStatsRegistry::run_housekeeping`]. Dropping the +/// returned [`AbortOnDrop`] stops the task. pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { let interval = registry.config.interval; let task = tokio::spawn(async move { @@ -326,17 +273,9 @@ pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { AbortOnDrop(task) } -/// Per-LB outlier-detection plumbing: the shared registry, the -/// receiver half of the eject signal mpsc, and the handle to the -/// housekeeping actor (dropped with the LB). -/// -/// `LoadBalancer` holds this as `Option`: `None` -/// when outlier detection is disabled, `Some` when enabled. The -/// pool of ejected channels themselves lives directly on the LB in a -/// `KeyedFutures<_, UnejectedChannel<_>>` — see the channel state -/// machine in [`channel_state`] for the type-state transitions. 
-/// -/// [`channel_state`]: crate::client::loadbalance::channel_state +/// Per-LB outlier-detection plumbing: shared registry, eject-signal +/// receiver, and the housekeeping actor handle (aborted on drop). The +/// LB holds this as `Option`. pub(crate) struct OutlierDetector { registry: Arc, eject_rx: mpsc::UnboundedReceiver, @@ -344,11 +283,10 @@ pub(crate) struct OutlierDetector { } impl OutlierDetector { - /// Build from a registry, spawning the housekeeping actor and - /// taking ownership of the eject-signal receiver. Returns - /// [`RegistryAlreadyWired`] if the registry's receiver has - /// already been taken (i.e. this registry is already driving - /// another load balancer); a registry can drive at most one LB. + /// Take ownership of the registry's eject-signal receiver and + /// spawn the housekeeping actor. Returns + /// [`RegistryAlreadyWired`] if the registry is already wired to + /// another LB. pub(crate) fn new(registry: Arc) -> Result { let eject_rx = registry.take_eject_rx()?; let _actor = spawn_actor(registry.clone()); @@ -364,11 +302,7 @@ impl OutlierDetector { &self.registry } - /// Poll for the next address whose data path has decided to - /// eject. Returns `Poll::Pending` when no eject decision is - /// queued; returns `Poll::Ready(None)` only if the registry has - /// been dropped (which can't happen while this detector holds an - /// `Arc`). + /// Poll for the next address the data path has decided to eject. pub(crate) fn poll_eject_request( &mut self, cx: &mut Context<'_>, @@ -749,12 +683,9 @@ mod tests { assert_eq!(s.ejection_multiplier(), 0); } - /// Re-ejecting a channel immediately after un-ejection should - /// produce a backoff sized for multiplier=1, not multiplier=2 — - /// i.e. it should *not* punish the channel for the previous - /// ejection that has just finished serving its cooldown. This is - /// what gRFC A50 prescribes and what Envoy does (un-eject and - /// decrement happen at the same sweep). 
+ /// A50 step 6.b: un-eject and multiplier decrement happen at the + /// same sweep. Re-eject right after un-eject must size the + /// backoff with the *decremented* multiplier. #[test] fn re_eject_after_uneject_uses_fresh_multiplier() { let mut config = fp_config(50, 10, 3); diff --git a/tonic-xds/src/xds/resource/outlier_detection.rs b/tonic-xds/src/xds/resource/outlier_detection.rs index 159ff7735..970232bea 100644 --- a/tonic-xds/src/xds/resource/outlier_detection.rs +++ b/tonic-xds/src/xds/resource/outlier_detection.rs @@ -2,32 +2,19 @@ //! //! [`OutlierDetectionConfig`] is the input to the outlier-detection //! algorithm. The two sub-configs gate which ejection algorithms run. -//! -//! Note: A50 specifies outlier detection as a load-balancing policy -//! wrapping a `child_policy`. `tonic-xds` currently runs P2C as its -//! only load balancer, so there is no `child_policy` field here yet — -//! it will be added when more balancers are supported. Integration -//! with the data path is via an mpsc channel of ejection decisions -//! polled by the [`LoadBalancer`] tower service, which marks the -//! corresponding [`ReadyChannel`] as ejected via [`EjectedChannel`]. +//! The `child_policy` field from A50 is not modeled — `tonic-xds` +//! currently runs P2C as its only load balancer. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md -//! [`LoadBalancer`]: crate::client::loadbalance::loadbalancer::LoadBalancer -//! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel -//! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel use std::time::Duration; -/// A 0–100 percentage. Construction is fallible; once held, every -/// `Percentage` is guaranteed to be in range, so the algorithm never -/// has to re-validate. +/// A 0–100 percentage, validated at construction. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub(crate) struct Percentage(u8); impl Percentage { /// Construct from a raw value, returning `Err` if it exceeds 100. - /// Accepts `u32` to match the proto wire type without forcing callers - /// to cast at every site. pub(crate) fn new(value: u32) -> Result { if value > 100 { Err(PercentageError(value)) @@ -73,9 +60,8 @@ pub(crate) struct SuccessRateConfig { /// An endpoint is a candidate for ejection when its success rate falls /// below `mean - stdev * (stdev_factor / 1000.0)`. pub stdev_factor: u32, - /// Probability that a flagged candidate is actually ejected — *not* - /// the success-rate threshold (which is derived from `stdev_factor`). - /// Set to 0 to disable enforcement while still computing statistics. + /// Probability that a flagged candidate is actually ejected. + /// Set to 0 to compute statistics without enforcing. pub enforcing_success_rate: Percentage, /// Minimum number of candidate endpoints required to run the algorithm. pub minimum_hosts: u32, @@ -90,9 +76,8 @@ pub(crate) struct FailurePercentageConfig { /// Failure rate at or above which an endpoint is a candidate for /// ejection. pub threshold: Percentage, - /// Probability that a flagged candidate is actually ejected — *not* - /// the failure-rate threshold (that is `threshold` above). Set to 0 - /// to disable enforcement while still computing statistics. + /// Probability that a flagged candidate is actually ejected. + /// Set to 0 to compute statistics without enforcing. pub enforcing_failure_percentage: Percentage, /// Minimum number of candidate endpoints required to run the algorithm. pub minimum_hosts: u32, @@ -102,8 +87,7 @@ pub(crate) struct FailurePercentageConfig { } impl OutlierDetectionConfig { - /// True when at least one ejection algorithm is enabled and the detector - /// should do work. If false, the cluster can skip instantiating detection. + /// True when at least one ejection algorithm is enabled. 
pub(crate) fn is_enabled(&self) -> bool { self.success_rate.is_some() || self.failure_percentage.is_some() } From f4e1e8c5ca12d77ebf6382f2db920a82d6e77c53 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 13:49:37 -0700 Subject: [PATCH 31/39] refactor(tonic-xds): UnejectedChannel::Ready carries a full ReadyChannel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass the outlier state through `ReadyChannel::eject` → `EjectedChannel` so its `Future::poll` can yield `UnejectedChannel::Ready(ReadyChannel)` with the state already reattached, instead of `Ready(S)` and asking the LB to rebuild. Symmetric ends for the `Ready ↔ Ejected` transition (both speak `ReadyChannel`), and `poll_unejection`'s ready arm drops its state-lookup + `ReadyChannel::new` rebuild — it just calls `note_uneject(ready.outlier())` and inserts. The `Connecting` arm stays asymmetric since the fresh connect produces a bare service. --- .../src/client/loadbalance/channel_state.rs | 20 ++++++++++++----- .../src/client/loadbalance/loadbalancer.rs | 22 +++++++++---------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index a4a57d4bb..398ef13a4 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -223,9 +223,9 @@ pub(crate) struct EjectionConfig { /// Result of an ejection expiring. pub(crate) enum UnejectedChannel { - /// Connection reused; the caller wraps the service back into a - /// [`ReadyChannel`]. - Ready(S), + /// Cooldown elapsed; the original connection is reused with its + /// outlier state reattached. + Ready(ReadyChannel), /// A fresh connection has been started. Connecting(ConnectingChannel), } @@ -314,8 +314,9 @@ impl ReadyChannel { &self.outlier } - /// Eject this channel. 
Consumes self; the outlier state remains - /// in the registry. + /// Eject this channel. Consumes self; the outlier state is moved + /// into the [`EjectedChannel`] so it can be reattached to the + /// [`ReadyChannel`] produced when the cooldown elapses. pub(crate) fn eject(self, config: EjectionConfig, connector: Arc) -> EjectedChannel where C: Connector + Send + Sync + 'static, @@ -324,6 +325,7 @@ impl ReadyChannel { EjectedChannel { addr: self.addr, inner: self.inner, + outlier: self.outlier, config, connector, ejection_timer, @@ -383,6 +385,7 @@ pin_project! { pub(crate) struct EjectedChannel { addr: EndpointAddress, inner: S, + outlier: Arc, config: EjectionConfig, connector: Arc + Send + Sync>, #[pin] @@ -404,7 +407,12 @@ impl Future for EjectedChannel { this.addr.clone(), ))) } else { - Poll::Ready(UnejectedChannel::Ready(this.inner.clone())) + let ready = ReadyChannel::new( + this.addr.clone(), + this.inner.clone(), + this.outlier.clone(), + ); + Poll::Ready(UnejectedChannel::Ready(ready)) } } Poll::Pending => Poll::Pending, diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 75c6dffb0..1b995cb74 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -268,26 +268,26 @@ where } /// Drain completed `EjectedChannel` timers. Clears the - /// outlier state and routes the resolved channel back into - /// `ready` or `connecting`. + /// registry-level ejection counter and routes the resolved + /// channel back into `ready` (with its outlier state already + /// reattached) or `connecting`. 
fn poll_unejection(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, unejected))) = self.ejected.poll_next(cx) { - let state = match self.outlier.as_ref() { - Some(o) => o.registry().add_channel(addr.clone()), - None => Arc::new(OutlierChannelState::new(addr.clone())), - }; - if let Some(o) = self.outlier.as_ref() { - o.registry().note_uneject(&state); - } match unejected { - UnejectedChannel::Ready(svc) => { + UnejectedChannel::Ready(ready) => { + if let Some(o) = self.outlier.as_ref() { + o.registry().note_uneject(ready.outlier()); + } tracing::debug!("outlier detection: uneject {addr}"); - let ready = ReadyChannel::new(addr.clone(), svc, state); self.ready.insert(addr, ready); } // `needs_reconnect = false` for A50; this arm is // reserved for future policies. UnejectedChannel::Connecting(future) => { + if let Some(o) = self.outlier.as_ref() { + let state = o.registry().add_channel(addr.clone()); + o.registry().note_uneject(&state); + } let _ = self.connecting.add(addr, future); } } From fa12110612378959b4a251d4d8d01f1880184b93 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 13:56:06 -0700 Subject: [PATCH 32/39] refactor(tonic-xds): drop unused addr parameter from ConnectingChannel::new `ConnectingChannel` stopped constructing `ReadyChannel` internally when outlier state was added (the registry-supplied `Arc` has to come from the LB, not the channel-state type), and the second parameter `_addr` has been ignored ever since. Drop it; callers already hand `KeyedFutures` the canonical address as the key. 
--- .../src/client/loadbalance/channel_state.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 398ef13a4..159838991 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -250,7 +250,7 @@ impl IdleChannel { where C::Service: Send + 'static, { - ConnectingChannel::new(connector.connect(&self.addr), self.addr) + ConnectingChannel::new(connector.connect(&self.addr)) } } @@ -271,7 +271,7 @@ pub(crate) struct ConnectingChannel { } impl ConnectingChannel { - pub(crate) fn new(fut: BoxFuture, _addr: EndpointAddress) -> Self { + pub(crate) fn new(fut: BoxFuture) -> Self { Self { inner: fut } } } @@ -341,7 +341,7 @@ impl ReadyChannel { where S: Send + 'static, { - ConnectingChannel::new(connector.connect(&self.addr), self.addr) + ConnectingChannel::new(connector.connect(&self.addr)) } } @@ -402,10 +402,7 @@ impl Future for EjectedChannel { Poll::Ready(()) => { if this.config.needs_reconnect { let fut = this.connector.connect(this.addr); - Poll::Ready(UnejectedChannel::Connecting(ConnectingChannel::new( - fut, - this.addr.clone(), - ))) + Poll::Ready(UnejectedChannel::Connecting(ConnectingChannel::new(fut))) } else { let ready = ReadyChannel::new( this.addr.clone(), @@ -519,8 +516,7 @@ mod tests { #[tokio::test] async fn test_connecting_in_keyed_futures() { let (tx, rx) = tokio::sync::oneshot::channel::(); - let connecting = - ConnectingChannel::new(Box::pin(async move { rx.await.unwrap() }), test_addr()); + let connecting = ConnectingChannel::new(Box::pin(async move { rx.await.unwrap() })); let mut set: KeyedFutures = KeyedFutures::new(); set.add(test_addr(), connecting).unwrap(); @@ -537,8 +533,7 @@ mod tests { #[tokio::test] async fn test_connecting_cancelled_via_keyed_futures() { - let connecting = - ConnectingChannel::new(Box::pin(future::pending::()), 
test_addr()); + let connecting = ConnectingChannel::new(Box::pin(future::pending::())); let mut set: KeyedFutures = KeyedFutures::new(); set.add(test_addr(), connecting).unwrap(); From b55d41969216ccf11cd9cb708972b8e18eb691c4 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 14:04:51 -0700 Subject: [PATCH 33/39] refactor(tonic-xds): drop the Rng trait, call fastrand directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `Rng` trait existed only as a test seam for the `enforcing_failure_percentage` probability roll, but every caller in both the algorithm tests and the LB integration tests uses `enforcing = 100` or `enforcing = 0` — values for which `roll` short- circuits without consulting the RNG. The trait, the `FastRandRng` default, and the test-side `FixedRng` / `AlwaysFireRng` impls were all bookkeeping for a code path none of them exercised. Inline the `fastrand::u32(0..100)` call into `roll` and remove the trait. `OutlierStatsRegistry::with_rng` collapses into `new`. 
--- .../src/client/loadbalance/loadbalancer.rs | 13 +-- .../client/loadbalance/outlier_detection.rs | 86 ++++++------------- 2 files changed, 27 insertions(+), 72 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 1b995cb74..ac58b6080 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -837,7 +837,7 @@ mod tests { // -- Outlier-detection integration tests -- - use crate::client::loadbalance::outlier_detection::{OutlierStatsRegistry, Rng}; + use crate::client::loadbalance::outlier_detection::OutlierStatsRegistry; use crate::xds::resource::outlier_detection::{ FailurePercentageConfig, OutlierDetectionConfig, Percentage, }; @@ -847,13 +847,6 @@ mod tests { Percentage::new(v).unwrap() } - struct AlwaysFireRng; - impl Rng for AlwaysFireRng { - fn pct_roll(&self) -> u32 { - 0 - } - } - fn fp_config( threshold: u32, request_volume: u32, @@ -882,7 +875,7 @@ mod tests { let connector = Arc::new(MockConnector::new()); let picker: Arc, &'static str> + Send + Sync> = Arc::new(P2cPicker); - let registry = OutlierStatsRegistry::with_rng(config, Box::new(AlwaysFireRng)); + let registry = OutlierStatsRegistry::new(config); let lb = LoadBalancer::with_outlier(discover, connector.clone(), picker, Some(registry.clone())) .expect("registry not yet wired"); @@ -1156,7 +1149,7 @@ mod tests { let connector = Arc::new(MockConnector::new()); let picker: Arc, &'static str> + Send + Sync> = Arc::new(P2cPicker); - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 5, 3), Box::new(AlwaysFireRng)); + let registry = OutlierStatsRegistry::new(fp_config(50, 5, 3)); // First wiring succeeds. 
LoadBalancer::with_outlier( diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index df3e78796..8cff6dce9 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -47,21 +47,6 @@ use crate::xds::resource::outlier_detection::OutlierDetectionConfig; #[error("OutlierStatsRegistry is already wired to a LoadBalancer")] pub(crate) struct RegistryAlreadyWired; -/// Probability source for `enforcing_*` rolls. -pub(crate) trait Rng: Send + Sync + 'static { - /// Return a uniform random `u32` in `0..100`. - fn pct_roll(&self) -> u32; -} - -/// Default RNG backed by `fastrand`. -struct FastRandRng; - -impl Rng for FastRandRng { - fn pct_roll(&self) -> u32 { - fastrand::u32(0..100) - } -} - /// Shared outlier-detection state, owned by `Arc` and accessed /// concurrently by the data path ([`Self::record_outcome`]), the /// housekeeping actor ([`Self::run_housekeeping`]), and the load @@ -75,7 +60,6 @@ pub(crate) struct OutlierStatsRegistry { /// `max_ejection_percent` cap. ejected_count: AtomicU64, config: OutlierDetectionConfig, - rng: Box, /// Sender half of the eject signal. The receiver is owned by the /// LB's [`OutlierDetector`]. eject_tx: mpsc::UnboundedSender, @@ -84,20 +68,13 @@ pub(crate) struct OutlierStatsRegistry { } impl OutlierStatsRegistry { - /// Build a registry with the default RNG. pub(crate) fn new(config: OutlierDetectionConfig) -> Arc { - Self::with_rng(config, Box::new(FastRandRng)) - } - - /// Build a registry with a custom [`Rng`]. 
- pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Arc { let (eject_tx, eject_rx) = mpsc::unbounded_channel(); Arc::new(Self { channels: DashMap::new(), qualifying_count: AtomicU64::new(0), ejected_count: AtomicU64::new(0), config, - rng, eject_tx, eject_rx: Mutex::new(Some(eject_rx)), }) @@ -185,7 +162,7 @@ impl OutlierStatsRegistry { if failure_pct <= u64::from(fp.threshold.get()) { return; } - if !roll(&*self.rng, fp.enforcing_failure_percentage.get()) { + if !roll(fp.enforcing_failure_percentage.get()) { return; } @@ -312,14 +289,14 @@ impl OutlierDetector { } /// Return true with probability `pct / 100` (clamped at 100 ⇒ always). -fn roll(rng: &dyn Rng, pct: u8) -> bool { +fn roll(pct: u8) -> bool { if pct >= 100 { return true; } if pct == 0 { return false; } - rng.pct_roll() < u32::from(pct) + fastrand::u32(0..100) < u32::from(pct) } #[cfg(test)] @@ -328,7 +305,7 @@ mod tests { use crate::xds::resource::outlier_detection::{ FailurePercentageConfig, OutlierDetectionConfig, Percentage, }; - use std::sync::atomic::{AtomicU32, Ordering}; + use std::sync::atomic::Ordering; use std::time::Duration; fn addr(port: u16) -> EndpointAddress { @@ -365,21 +342,6 @@ mod tests { c } - /// Deterministic RNG: `pct_roll()` returns a fixed value. - struct FixedRng(AtomicU32); - - impl FixedRng { - fn boxed(value: u32) -> Box { - Box::new(Self(AtomicU32::new(value))) - } - } - - impl Rng for FixedRng { - fn pct_roll(&self) -> u32 { - self.0.load(Ordering::Relaxed) - } - } - /// Drive `n` outcomes through `record_outcome` for one channel. 
fn drive( registry: &OutlierStatsRegistry, @@ -399,7 +361,7 @@ mod tests { #[test] fn ejects_above_threshold_inline() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); @@ -412,7 +374,7 @@ mod tests { #[test] fn skips_below_threshold() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); @@ -428,7 +390,7 @@ mod tests { #[test] fn at_threshold_does_not_eject() { // A50 specifies a strict "greater than" comparison. - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(0)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); @@ -442,7 +404,7 @@ mod tests { #[test] fn minimum_hosts_gates_ejection() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 5), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 5)); // Only 2 hosts have request_volume ≥ 10; minimum_hosts is 5 ⇒ skip. 
let mut all = vec![]; for port in 8080..=8081 { @@ -457,7 +419,7 @@ mod tests { #[test] fn request_volume_filters_low_traffic() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 100, 3)); let bad = registry.add_channel(addr(8080)); drive(&registry, &bad, 0, 5); for port in 8081..=8084 { @@ -475,7 +437,7 @@ mod tests { .as_mut() .unwrap() .enforcing_failure_percentage = pct(0); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(0)); + let registry = OutlierStatsRegistry::new(config); let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); @@ -491,7 +453,7 @@ mod tests { fn max_ejection_percent_caps_concurrent_ejections() { let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(20); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let mut all = vec![]; for port in 8080..=8084 { @@ -510,7 +472,7 @@ mod tests { #[test] fn remove_channel_decrements_counters() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); let mut all = vec![]; for port in 8080..=8083 { let s = registry.add_channel(addr(port)); @@ -532,7 +494,7 @@ mod tests { #[test] fn ejection_dispatches_address_through_mpsc() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); let mut rx = registry.take_eject_rx().expect("receiver available"); let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { @@ -553,7 +515,7 @@ mod tests { #[test] fn housekeeping_resets_counters_and_qualifying() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10,
3)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); drive(&registry, &s, 100, 0); @@ -570,7 +532,7 @@ mod tests { #[test] fn housekeeping_decrements_multiplier_on_healthy_interval() { - let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(base_config()); let s = registry.add_channel(addr(8080)); // Force multiplier to 3 directly (no traffic, no eject). s.set_ejection_multiplier(3); @@ -581,7 +543,7 @@ mod tests { #[test] fn housekeeping_leaves_ejected_multipliers_alone() { - let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(base_config()); let s = registry.add_channel(addr(8080)); s.try_eject(Instant::now()); s.set_ejection_multiplier(3); @@ -600,7 +562,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -614,7 +576,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(15); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -628,7 +590,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(30); config.max_ejection_time = Duration::from_secs(60); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -644,7
+606,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -657,14 +619,14 @@ mod tests { #[test] fn remaining_ejection_none_when_not_ejected() { - let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(base_config()); let s = registry.add_channel(addr(8080)); assert!(registry.remaining_ejection(&s, Instant::now()).is_none()); } #[test] fn note_uneject_clears_state_and_decrements_counter() { - let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(base_config()); let s = registry.add_channel(addr(8080)); s.try_eject(Instant::now()); // bumps multiplier 0 → 1 registry.ejected_count.fetch_add(1, Ordering::Relaxed); @@ -691,7 +653,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(300); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); @@ -725,7 +687,7 @@ mod tests { async fn dropping_abort_stops_actor() { let mut config = base_config(); config.interval = Duration::from_millis(50); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); s.set_ejection_multiplier(5); From 6eb6b11942d2171d72acd37dcdab60114f9b8861 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Wed, 27 May 2026 11:14:38 -0700 Subject: [PATCH 
34/39] refactor(tonic-xds): ConnectingChannel resolves to ReadyChannel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, ConnectingChannel::Future resolved to the raw service `S` and LoadBalancer::poll_connecting wrapped it in a ReadyChannel by hand, minting outlier state along the way. That coupled the LB's loop body to the registry-vs-no-registry distinction and forced callers to know the wrapping rules. Thread an Option<Arc<OutlierStatsRegistry>> through every transition (connect, eject, uneject, reconnect) instead. The resolved value is always ReadyChannel with outlier state attached — looked up from the registry when Some (idempotent), freshly minted otherwise. poll_connecting collapses to a plain forwarder and the LB exposes a single `registry()` accessor for threading. --- .../src/client/loadbalance/channel_state.rs | 133 ++++++++++++------ .../src/client/loadbalance/loadbalancer.rs | 48 ++++--- 2 files changed, 120 insertions(+), 61 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 159838991..7aedd7e17 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -35,8 +35,25 @@ use tower::Service; use tower::load::Load; use crate::client::endpoint::{Connector, EndpointAddress}; +use crate::client::loadbalance::outlier_detection::OutlierStatsRegistry; use crate::common::async_util::BoxFuture; +/// Returns the `Arc<OutlierChannelState>` that a freshly-resolved +/// [`ReadyChannel`] should carry for `addr`. When `registry` is +/// `Some`, the registry's `add_channel` returns the existing entry +/// (idempotent), so re-discovered addresses keep their counters and +/// ejection state. When `None`, the state is a throwaway that nothing +/// will ever read.
+fn outlier_state_for( + registry: Option<&Arc>, + addr: EndpointAddress, +) -> Arc { + match registry { + Some(r) => r.add_channel(addr), + None => Arc::new(OutlierChannelState::new(addr)), + } +} + // --------------------------------------------------------------------------- // EndpointCounters / OutlierChannelState // --------------------------------------------------------------------------- @@ -226,7 +243,9 @@ pub(crate) enum UnejectedChannel { /// Cooldown elapsed; the original connection is reused with its /// outlier state reattached. Ready(ReadyChannel), - /// A fresh connection has been started. + /// A fresh connection has been started. The LB looks the existing + /// outlier state back up via the registry to call `note_uneject` + /// — `add_channel` is idempotent, so no Arc needs to ride along. Connecting(ConnectingChannel), } @@ -246,11 +265,18 @@ impl IdleChannel { } /// Start connecting to the endpoint. Consumes the idle channel. - pub(crate) fn connect(self, connector: Arc) -> ConnectingChannel + /// The resolved [`ReadyChannel`] will carry the + /// `Arc` produced for `addr` — looked up in + /// `registry` when `Some`, freshly minted otherwise. + pub(crate) fn connect( + self, + connector: Arc, + registry: Option>, + ) -> ConnectingChannel where C::Service: Send + 'static, { - ConnectingChannel::new(connector.connect(&self.addr)) + ConnectingChannel::new(connector.connect(&self.addr), self.addr, registry) } } @@ -260,27 +286,40 @@ impl IdleChannel { /// A channel that is in the process of connecting. /// -/// `impl Future` — resolves to the connected service when -/// the connection completes. The caller wraps the resolved service -/// into a [`ReadyChannel`]. Cancellation is handled externally via -/// [`KeyedFutures::cancel`]. +/// `impl Future>` — resolves to a fully-formed +/// `ReadyChannel` whose outlier state is looked up from `registry` (or +/// freshly minted when `registry` is `None`) at resolve time. 
+/// Cancellation is handled externally via [`KeyedFutures::cancel`]. /// /// [`KeyedFutures::cancel`]: crate::client::loadbalance::keyed_futures::KeyedFutures::cancel pub(crate) struct ConnectingChannel { + addr: EndpointAddress, + registry: Option>, inner: Pin + Send>>, } impl ConnectingChannel { - pub(crate) fn new(fut: BoxFuture) -> Self { - Self { inner: fut } + pub(crate) fn new( + fut: BoxFuture, + addr: EndpointAddress, + registry: Option>, + ) -> Self { + Self { + addr, + registry, + inner: fut, + } } } impl Future for ConnectingChannel { - type Output = S; + type Output = ReadyChannel; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - self.get_mut().inner.as_mut().poll(cx) + let this = self.get_mut(); + let svc = std::task::ready!(this.inner.as_mut().poll(cx)); + let outlier = outlier_state_for(this.registry.as_ref(), this.addr.clone()); + Poll::Ready(ReadyChannel::new(this.addr.clone(), svc, outlier)) } } @@ -317,7 +356,12 @@ impl ReadyChannel { /// Eject this channel. Consumes self; the outlier state is moved /// into the [`EjectedChannel`] so it can be reattached to the /// [`ReadyChannel`] produced when the cooldown elapses. - pub(crate) fn eject(self, config: EjectionConfig, connector: Arc) -> EjectedChannel + pub(crate) fn eject( + self, + config: EjectionConfig, + connector: Arc, + registry: Option>, + ) -> EjectedChannel where C: Connector + Send + Sync + 'static, { @@ -326,6 +370,7 @@ impl ReadyChannel { addr: self.addr, inner: self.inner, outlier: self.outlier, + registry, config, connector, ejection_timer, @@ -333,15 +378,17 @@ impl ReadyChannel { } /// Drop the connection and start a fresh connect for the same - /// address. The outlier state remains in the registry. + /// address. The outlier state is re-attached when the new + /// connect resolves (via `registry` lookup if `Some`). 
pub(crate) fn reconnect>( self, connector: Arc, + registry: Option>, ) -> ConnectingChannel where S: Send + 'static, { - ConnectingChannel::new(connector.connect(&self.addr)) + ConnectingChannel::new(connector.connect(&self.addr), self.addr, registry) } } @@ -386,6 +433,7 @@ pin_project! { addr: EndpointAddress, inner: S, outlier: Arc, + registry: Option>, config: EjectionConfig, connector: Arc + Send + Sync>, #[pin] @@ -402,7 +450,9 @@ impl Future for EjectedChannel { Poll::Ready(()) => { if this.config.needs_reconnect { let fut = this.connector.connect(this.addr); - Poll::Ready(UnejectedChannel::Connecting(ConnectingChannel::new(fut))) + let connecting = + ConnectingChannel::new(fut, this.addr.clone(), this.registry.clone()); + Poll::Ready(UnejectedChannel::Connecting(connecting)) } else { let ready = ReadyChannel::new( this.addr.clone(), @@ -474,28 +524,22 @@ mod tests { #[tokio::test] async fn test_idle_to_connecting() { let connector = MockConnector::new(); - let _connecting = IdleChannel::new(test_addr()).connect(connector.clone()); + let _connecting = IdleChannel::new(test_addr()).connect(connector.clone(), None); assert_eq!(connector.connect_count.load(Ordering::SeqCst), 1); } - fn wrap_ready(addr: EndpointAddress, svc: MockService) -> ReadyChannel { - let state = Arc::new(OutlierChannelState::new(addr.clone())); - ReadyChannel::new(addr, svc, state) - } - #[tokio::test] - async fn test_connecting_future_yields_service() { + async fn test_connecting_future_yields_ready_channel() { let connector = MockConnector::new(); - let svc: MockService = IdleChannel::new(test_addr()).connect(connector).await; - // The bare service is what `ConnectingChannel` resolves to. 
- let _ready = wrap_ready(test_addr(), svc); + let ready: ReadyChannel = + IdleChannel::new(test_addr()).connect(connector, None).await; + assert_eq!(ready.outlier().addr(), &test_addr()); } #[tokio::test] async fn test_ready_service_delegates() { let connector = MockConnector::new(); - let svc = IdleChannel::new(test_addr()).connect(connector).await; - let mut ready = wrap_ready(test_addr(), svc); + let mut ready = IdleChannel::new(test_addr()).connect(connector, None).await; let resp: &str = ready.call("hello").await.unwrap(); assert_eq!(resp, "ok"); } @@ -503,11 +547,10 @@ mod tests { #[tokio::test] async fn test_ready_to_connecting_via_reconnect() { let connector = MockConnector::new(); - let svc = IdleChannel::new(test_addr()) - .connect(connector.clone()) + let ready = IdleChannel::new(test_addr()) + .connect(connector.clone(), None) .await; - let ready = wrap_ready(test_addr(), svc); - let _reconnecting = ready.reconnect(connector.clone()); + let _reconnecting = ready.reconnect(connector.clone(), None); assert_eq!(connector.connect_count.load(Ordering::SeqCst), 2); } @@ -516,9 +559,13 @@ mod tests { #[tokio::test] async fn test_connecting_in_keyed_futures() { let (tx, rx) = tokio::sync::oneshot::channel::(); - let connecting = ConnectingChannel::new(Box::pin(async move { rx.await.unwrap() })); + let connecting = ConnectingChannel::new( + Box::pin(async move { rx.await.unwrap() }), + test_addr(), + None, + ); - let mut set: KeyedFutures = KeyedFutures::new(); + let mut set: KeyedFutures> = KeyedFutures::new(); set.add(test_addr(), connecting).unwrap(); assert!(matches!(set.poll_next(&mut noop_cx()), Poll::Pending)); @@ -533,9 +580,13 @@ mod tests { #[tokio::test] async fn test_connecting_cancelled_via_keyed_futures() { - let connecting = ConnectingChannel::new(Box::pin(future::pending::())); + let connecting = ConnectingChannel::new( + Box::pin(future::pending::()), + test_addr(), + None, + ); - let mut set: KeyedFutures = KeyedFutures::new(); + let mut set: 
KeyedFutures> = KeyedFutures::new(); set.add(test_addr(), connecting).unwrap(); assert!(matches!(set.poll_next(&mut noop_cx()), Poll::Pending)); @@ -547,16 +598,16 @@ mod tests { #[tokio::test(start_paused = true)] async fn test_ejected_in_keyed_futures_ready() { let connector = MockConnector::new(); - let svc = IdleChannel::new(test_addr()) - .connect(connector.clone()) + let ready = IdleChannel::new(test_addr()) + .connect(connector.clone(), None) .await; - let ready = wrap_ready(test_addr(), svc); let ejected = ready.eject( EjectionConfig { timeout: Duration::from_secs(5), needs_reconnect: false, }, connector, + None, ); let mut set: KeyedFutures> = @@ -573,16 +624,16 @@ mod tests { #[tokio::test(start_paused = true)] async fn test_ejected_in_keyed_futures_needs_reconnect() { let connector = MockConnector::new(); - let svc = IdleChannel::new(test_addr()) - .connect(connector.clone()) + let ready = IdleChannel::new(test_addr()) + .connect(connector.clone(), None) .await; - let ready = wrap_ready(test_addr(), svc); let ejected = ready.eject( EjectionConfig { timeout: Duration::from_secs(5), needs_reconnect: true, }, connector.clone(), + None, ); let mut set: KeyedFutures> = diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index ac58b6080..f3191b66f 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -27,7 +27,7 @@ use tower::discover::{Change, Discover}; use crate::client::endpoint::{Connector, EndpointAddress}; use crate::client::loadbalance::channel_state::{ - EjectionConfig, IdleChannel, OutlierChannelState, ReadyChannel, UnejectedChannel, + EjectionConfig, IdleChannel, ReadyChannel, UnejectedChannel, }; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; @@ -70,8 +70,10 @@ impl Future for LbFuture { pub(crate) struct LoadBalancer { discovery: D, connector: Arc, - /// 
In-flight connection attempts. - connecting: KeyedFutures, + /// In-flight connection attempts. Resolves directly to a + /// [`ReadyChannel`] with outlier state already attached (looked + /// up in [`Self::outlier`]'s registry when present). + connecting: KeyedFutures>, /// Ready-to-serve channels. ready: IndexMap>, /// Currently-ejected channels. Each entry is an @@ -124,6 +126,13 @@ where }) } + /// The `Arc` that fresh `ReadyChannel`s + /// should attach state from. `None` when outlier detection is + /// disabled. + fn registry(&self) -> Option> { + self.outlier.as_ref().map(|o| o.registry().clone()) + } + /// Purge all state for `addr`, including the outlier-detection /// registry entry. Called on `Change::Remove`. fn purge_endpoint(&mut self, addr: &EndpointAddress) { @@ -159,7 +168,7 @@ where Some(Ok(Change::Insert(addr, idle))) => { tracing::trace!("discovery: insert {addr}"); self.reset_active_slots(&addr); - let connecting = idle.connect(self.connector.clone()); + let connecting = idle.connect(self.connector.clone(), self.registry()); let _ = self.connecting.add(addr, connecting); } Some(Ok(Change::Remove(addr))) => { @@ -170,21 +179,18 @@ where } } - /// Drain completed connection futures. If the outlier state for - /// a re-discovered endpoint is still ejected, the new channel is - /// re-ejected for the *remaining* duration; if the deadline has - /// already passed, it is un-ejected and routed to `ready`. + /// Drain completed connection futures. Each yields a fully-formed + /// `ReadyChannel` with outlier state already attached. If the + /// preserved outlier state for a re-discovered endpoint is still + /// ejected, the new channel is re-ejected for the *remaining* + /// duration; if the deadline has already passed, it is un-ejected + /// and routed to `ready`. 
fn poll_connecting(&mut self, cx: &mut Context<'_>) { - while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { - let state = match self.outlier.as_ref() { - Some(o) => o.registry().add_channel(addr.clone()), - None => Arc::new(OutlierChannelState::new(addr.clone())), - }; - let ready = ReadyChannel::new(addr.clone(), svc, state.clone()); - let remaining = self - .outlier - .as_ref() - .and_then(|o| o.registry().remaining_ejection(&state, Instant::now())); + while let Poll::Ready(Some((addr, ready))) = self.connecting.poll_next(cx) { + let remaining = self.outlier.as_ref().and_then(|o| { + o.registry() + .remaining_ejection(ready.outlier(), Instant::now()) + }); self.place_after_connect(addr, ready, remaining); } } @@ -215,6 +221,7 @@ where needs_reconnect: false, }, self.connector.clone(), + self.registry(), ); tracing::debug!("outlier detection: re-eject {addr} for {d:?}"); let _ = self.ejected.add(addr, ejected); @@ -250,6 +257,7 @@ where needs_reconnect: false, }, self.connector.clone(), + Some(registry.clone()), ); tracing::debug!("outlier detection: eject {addr} for {d:?}"); let _ = self.ejected.add(addr, ejected); @@ -283,12 +291,12 @@ where } // `needs_reconnect = false` for A50; this arm is // reserved for future policies. 
- UnejectedChannel::Connecting(future) => { + UnejectedChannel::Connecting(connecting) => { if let Some(o) = self.outlier.as_ref() { let state = o.registry().add_channel(addr.clone()); o.registry().note_uneject(&state); } - let _ = self.connecting.add(addr, future); + let _ = self.connecting.add(addr, connecting); } } } From f3371b6268fc93e2b135c507b6d8e0f536f70fb9 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Wed, 27 May 2026 13:07:05 -0700 Subject: [PATCH 35/39] refactor(tonic-xds): merge LB constructors behind ArcSwap config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LoadBalancer used to expose new() (no outlier) plus with_outlier() (handed a pre-built registry, fallible). Replace both with a single new(.., Arc<ArcSwap<OutlierDetectionConfig>>) that internally builds the registry, pairs the eject mpsc, and conditionally spawns the housekeeping actor based on config.is_enabled(). OutlierStatsRegistry.config now lives behind ArcSwap so future xDS subscription updates can re-store the config without rebuilding the LB. Until that wiring lands, the value is effectively read-once at construction (and the actor's tokio::time::interval still captures its period at spawn — interval changes need an actor restart, owned by the future xDS layer). The registry now owns the eject_tx and hands the rx back at construction, removing the Mutex<Option<Receiver>> dance and the RegistryAlreadyWired error path entirely. The LB's outlier field becomes non-optional; the channel state machine's connect/eject/reconnect APIs drop Option<Arc<OutlierStatsRegistry>> and take the registry directly. OutlierDetectionConfig::default() is the disabled config (both algorithms None). 
--- .../src/client/loadbalance/channel_state.rs | 91 +++++----- .../src/client/loadbalance/loadbalancer.rs | 153 +++++++---------- .../client/loadbalance/outlier_detection.rs | 159 ++++++++++-------- .../src/xds/resource/outlier_detection.rs | 17 ++ 4 files changed, 206 insertions(+), 214 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 7aedd7e17..5f57ca2b0 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -38,22 +38,6 @@ use crate::client::endpoint::{Connector, EndpointAddress}; use crate::client::loadbalance::outlier_detection::OutlierStatsRegistry; use crate::common::async_util::BoxFuture; -/// Returns the `Arc` that a freshly-resolved -/// [`ReadyChannel`] should carry for `addr`. When `registry` is -/// `Some`, the registry's `add_channel` returns the existing entry -/// (idempotent), so re-discovered addresses keep their counters and -/// ejection state. When `None`, the state is a throwaway that nothing -/// will ever read. -fn outlier_state_for( - registry: Option<&Arc>, - addr: EndpointAddress, -) -> Arc { - match registry { - Some(r) => r.add_channel(addr), - None => Arc::new(OutlierChannelState::new(addr)), - } -} - // --------------------------------------------------------------------------- // EndpointCounters / OutlierChannelState // --------------------------------------------------------------------------- @@ -243,9 +227,10 @@ pub(crate) enum UnejectedChannel { /// Cooldown elapsed; the original connection is reused with its /// outlier state reattached. Ready(ReadyChannel), - /// A fresh connection has been started. The LB looks the existing - /// outlier state back up via the registry to call `note_uneject` - /// — `add_channel` is idempotent, so no Arc needs to ride along. + /// A fresh connection has been started. 
The LB looks the + /// existing outlier state back up via the registry to call + /// `note_uneject` — `add_channel` is idempotent, so no Arc needs + /// to ride along on the variant. Connecting(ConnectingChannel), } @@ -266,12 +251,13 @@ impl IdleChannel { /// Start connecting to the endpoint. Consumes the idle channel. /// The resolved [`ReadyChannel`] will carry the - /// `Arc` produced for `addr` — looked up in - /// `registry` when `Some`, freshly minted otherwise. + /// `Arc` from `registry.add_channel(addr)` — + /// idempotent, so a re-discovered or reconnected address keeps + /// its existing counters and ejection state. pub(crate) fn connect( self, connector: Arc, - registry: Option>, + registry: Arc, ) -> ConnectingChannel where C::Service: Send + 'static, @@ -287,14 +273,14 @@ impl IdleChannel { /// A channel that is in the process of connecting. /// /// `impl Future>` — resolves to a fully-formed -/// `ReadyChannel` whose outlier state is looked up from `registry` (or -/// freshly minted when `registry` is `None`) at resolve time. -/// Cancellation is handled externally via [`KeyedFutures::cancel`]. +/// `ReadyChannel` whose outlier state is looked up from `registry` +/// via `add_channel` at resolve time. Cancellation is handled +/// externally via [`KeyedFutures::cancel`]. 
/// /// [`KeyedFutures::cancel`]: crate::client::loadbalance::keyed_futures::KeyedFutures::cancel pub(crate) struct ConnectingChannel { addr: EndpointAddress, - registry: Option>, + registry: Arc, inner: Pin + Send>>, } @@ -302,7 +288,7 @@ impl ConnectingChannel { pub(crate) fn new( fut: BoxFuture, addr: EndpointAddress, - registry: Option>, + registry: Arc, ) -> Self { Self { addr, @@ -318,7 +304,7 @@ impl Future for ConnectingChannel { fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let this = self.get_mut(); let svc = std::task::ready!(this.inner.as_mut().poll(cx)); - let outlier = outlier_state_for(this.registry.as_ref(), this.addr.clone()); + let outlier = this.registry.add_channel(this.addr.clone()); Poll::Ready(ReadyChannel::new(this.addr.clone(), svc, outlier)) } } @@ -360,7 +346,7 @@ impl ReadyChannel { self, config: EjectionConfig, connector: Arc, - registry: Option>, + registry: Arc, ) -> EjectedChannel where C: Connector + Send + Sync + 'static, @@ -378,12 +364,12 @@ impl ReadyChannel { } /// Drop the connection and start a fresh connect for the same - /// address. The outlier state is re-attached when the new - /// connect resolves (via `registry` lookup if `Some`). + /// address. The outlier state is re-attached from `registry` + /// when the new connect resolves. pub(crate) fn reconnect>( self, connector: Arc, - registry: Option>, + registry: Arc, ) -> ConnectingChannel where S: Send + 'static, @@ -433,7 +419,7 @@ pin_project! { addr: EndpointAddress, inner: S, outlier: Arc, - registry: Option>, + registry: Arc, config: EjectionConfig, connector: Arc + Send + Sync>, #[pin] @@ -521,25 +507,40 @@ mod tests { Context::from_waker(Box::leak(Box::new(noop_waker()))) } + /// Throwaway registry for tests that don't observe ejection + /// state — defaults to the disabled config, so the housekeeping + /// actor would never run anyway. 
+ fn test_registry() -> Arc { + use crate::xds::resource::outlier_detection::OutlierDetectionConfig; + use arc_swap::ArcSwap; + OutlierStatsRegistry::new(Arc::new(ArcSwap::from_pointee( + OutlierDetectionConfig::default(), + ))) + .0 + } + #[tokio::test] async fn test_idle_to_connecting() { let connector = MockConnector::new(); - let _connecting = IdleChannel::new(test_addr()).connect(connector.clone(), None); + let _connecting = IdleChannel::new(test_addr()).connect(connector.clone(), test_registry()); assert_eq!(connector.connect_count.load(Ordering::SeqCst), 1); } #[tokio::test] async fn test_connecting_future_yields_ready_channel() { let connector = MockConnector::new(); - let ready: ReadyChannel = - IdleChannel::new(test_addr()).connect(connector, None).await; + let ready: ReadyChannel = IdleChannel::new(test_addr()) + .connect(connector, test_registry()) + .await; assert_eq!(ready.outlier().addr(), &test_addr()); } #[tokio::test] async fn test_ready_service_delegates() { let connector = MockConnector::new(); - let mut ready = IdleChannel::new(test_addr()).connect(connector, None).await; + let mut ready = IdleChannel::new(test_addr()) + .connect(connector, test_registry()) + .await; let resp: &str = ready.call("hello").await.unwrap(); assert_eq!(resp, "ok"); } @@ -548,9 +549,9 @@ mod tests { async fn test_ready_to_connecting_via_reconnect() { let connector = MockConnector::new(); let ready = IdleChannel::new(test_addr()) - .connect(connector.clone(), None) + .connect(connector.clone(), test_registry()) .await; - let _reconnecting = ready.reconnect(connector.clone(), None); + let _reconnecting = ready.reconnect(connector.clone(), test_registry()); assert_eq!(connector.connect_count.load(Ordering::SeqCst), 2); } @@ -562,7 +563,7 @@ mod tests { let connecting = ConnectingChannel::new( Box::pin(async move { rx.await.unwrap() }), test_addr(), - None, + test_registry(), ); let mut set: KeyedFutures> = KeyedFutures::new(); @@ -583,7 +584,7 @@ mod tests { let 
connecting = ConnectingChannel::new( Box::pin(future::pending::()), test_addr(), - None, + test_registry(), ); let mut set: KeyedFutures> = KeyedFutures::new(); @@ -599,7 +600,7 @@ mod tests { async fn test_ejected_in_keyed_futures_ready() { let connector = MockConnector::new(); let ready = IdleChannel::new(test_addr()) - .connect(connector.clone(), None) + .connect(connector.clone(), test_registry()) .await; let ejected = ready.eject( EjectionConfig { @@ -607,7 +608,7 @@ mod tests { needs_reconnect: false, }, connector, - None, + test_registry(), ); let mut set: KeyedFutures> = @@ -625,7 +626,7 @@ mod tests { async fn test_ejected_in_keyed_futures_needs_reconnect() { let connector = MockConnector::new(); let ready = IdleChannel::new(test_addr()) - .connect(connector.clone(), None) + .connect(connector.clone(), test_registry()) .await; let ejected = ready.eject( EjectionConfig { @@ -633,7 +634,7 @@ mod tests { needs_reconnect: true, }, connector.clone(), - None, + test_registry(), ); let mut set: KeyedFutures> = diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index f3191b66f..bdd013a53 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -4,9 +4,13 @@ //! manages the connection lifecycle via the channel state machine, //! and routes requests to ready endpoints via a [`ChannelPicker`]. //! -//! Outlier detection (gRFC A50) is integrated via an optional -//! [`OutlierDetector`]. Eject requests arrive on an mpsc channel from -//! the data path; the LB consumes the matching [`ReadyChannel`] via +//! Outlier detection (gRFC A50) is integrated via [`OutlierDetector`], +//! which is always present. The actor inside is conditionally spawned +//! based on the [`OutlierDetectionConfig`] passed at construction — +//! `OutlierDetectionConfig::default()` is the disabled config, in +//! 
which case `record_outcome` short-circuits and no eject signals +//! ever fire. Eject requests arrive on an mpsc channel from the data +//! path; the LB consumes the matching [`ReadyChannel`] via //! [`ReadyChannel::eject`] and tracks the resulting //! [`EjectedChannel`] in [`Self::ejected`]. When the timer fires, the //! resolved [`UnejectedChannel`] is routed back into `ready` or @@ -25,16 +29,17 @@ use indexmap::IndexMap; use tower::Service; use tower::discover::{Change, Discover}; +use arc_swap::ArcSwap; + use crate::client::endpoint::{Connector, EndpointAddress}; use crate::client::loadbalance::channel_state::{ EjectionConfig, IdleChannel, ReadyChannel, UnejectedChannel, }; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; -use crate::client::loadbalance::outlier_detection::{ - OutlierDetector, OutlierStatsRegistry, RegistryAlreadyWired, -}; +use crate::client::loadbalance::outlier_detection::{OutlierDetector, OutlierStatsRegistry}; use crate::client::loadbalance::pickers::ChannelPicker; +use crate::xds::resource::outlier_detection::OutlierDetectionConfig; /// Future returned by [`LoadBalancer::call`]. Either resolves /// immediately with an [`LbError`] or drives the selected channel. @@ -71,8 +76,7 @@ pub(crate) struct LoadBalancer { discovery: D, connector: Arc, /// In-flight connection attempts. Resolves directly to a - /// [`ReadyChannel`] with outlier state already attached (looked - /// up in [`Self::outlier`]'s registry when present). + /// [`ReadyChannel`] with outlier state already attached. connecting: KeyedFutures>, /// Ready-to-serve channels. ready: IndexMap>, @@ -80,8 +84,12 @@ pub(crate) struct LoadBalancer { /// [`EjectedChannel`] whose `Sleep` fires when the ejection /// window expires. ejected: KeyedFutures>, - /// `None` disables outlier detection. - outlier: Option, + /// Per-LB outlier-detection plumbing. 
Always present; the + /// housekeeping actor inside is conditionally spawned based on + /// whether the config had an algorithm enabled at construction. + /// When disabled, the data path's `record_outcome` short-circuits + /// and nothing reads from `eject_rx`. + outlier: OutlierDetector, picker: Arc, Req> + Send + Sync>, } @@ -92,30 +100,22 @@ where C: Connector + Send + Sync + 'static, C::Service: Clone + Send + 'static, { - /// Create a load balancer with no outlier detection. + /// Construct a load balancer driven by `config`. Wrapping the + /// config in `ArcSwap` lets future xDS subscription updates + /// reconfigure detection without rebuilding the LB; until that + /// wiring lands, the value is effectively read-once at + /// construction. `OutlierDetectionConfig::default()` is the + /// disabled config — both algorithms `None` ⇒ no actor, no + /// ejection. pub(crate) fn new( discovery: D, connector: Arc, picker: Arc, Req> + Send + Sync>, + config: Arc>, ) -> Self { - // Infallible: `with_outlier(.., None)` never wires a registry. - Self::with_outlier(discovery, connector, picker, None) - .expect("with_outlier(.., None) is infallible") - } - - /// Create a load balancer, optionally enabling outlier detection. - /// When `outlier` is `Some`, the registry's housekeeping actor is - /// spawned and bound to this LB. Returns - /// [`RegistryAlreadyWired`] if the registry already drives - /// another LB. - pub(crate) fn with_outlier( - discovery: D, - connector: Arc, - picker: Arc, Req> + Send + Sync>, - outlier: Option>, - ) -> Result { - let outlier = outlier.map(OutlierDetector::new).transpose()?; - Ok(Self { + let (registry, eject_rx) = OutlierStatsRegistry::new(config); + let outlier = OutlierDetector::new(registry, eject_rx); + Self { discovery, connector, connecting: KeyedFutures::new(), @@ -123,14 +123,13 @@ where ejected: KeyedFutures::new(), outlier, picker, - }) + } } - /// The `Arc` that fresh `ReadyChannel`s - /// should attach state from. 
`None` when outlier detection is - /// disabled. - fn registry(&self) -> Option> { - self.outlier.as_ref().map(|o| o.registry().clone()) + /// Shared `Arc` for attaching per-channel + /// state when a [`ReadyChannel`] is born. + fn registry(&self) -> Arc { + self.outlier.registry().clone() } /// Purge all state for `addr`, including the outlier-detection @@ -139,9 +138,7 @@ where let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); let _ = self.ejected.cancel(addr); - if let Some(o) = self.outlier.as_ref() { - o.registry().remove_channel(addr); - } + self.outlier.registry().remove_channel(addr); } /// Clear stale connecting/ready/ejected slots for `addr` but @@ -187,10 +184,10 @@ where /// and routed to `ready`. fn poll_connecting(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, ready))) = self.connecting.poll_next(cx) { - let remaining = self.outlier.as_ref().and_then(|o| { - o.registry() - .remaining_ejection(ready.outlier(), Instant::now()) - }); + let remaining = self + .outlier + .registry() + .remaining_ejection(ready.outlier(), Instant::now()); self.place_after_connect(addr, ready, remaining); } } @@ -209,9 +206,7 @@ where self.ready.insert(addr, ready); } Some(d) if d.is_zero() => { - if let Some(o) = self.outlier.as_ref() { - o.registry().note_uneject(ready.outlier()); - } + self.outlier.registry().note_uneject(ready.outlier()); self.ready.insert(addr, ready); } Some(d) => { @@ -235,14 +230,11 @@ where /// `record_outcome`. fn poll_eject_requests(&mut self, cx: &mut Context<'_>) { loop { - let Some(o) = self.outlier.as_mut() else { - return; - }; - let addr = match o.poll_eject_request(cx) { + let addr = match self.outlier.poll_eject_request(cx) { Poll::Ready(Some(a)) => a, _ => return, }; - let registry = o.registry().clone(); + let registry = self.outlier.registry().clone(); // Channel may have been removed by discovery in the // meantime; if so, nothing to eject. 
let Some(ch) = self.ready.swap_remove(&addr) else { @@ -257,7 +249,7 @@ where needs_reconnect: false, }, self.connector.clone(), - Some(registry.clone()), + registry.clone(), ); tracing::debug!("outlier detection: eject {addr} for {d:?}"); let _ = self.ejected.add(addr, ejected); @@ -281,21 +273,18 @@ where /// reattached) or `connecting`. fn poll_unejection(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, unejected))) = self.ejected.poll_next(cx) { + let registry = self.outlier.registry(); match unejected { UnejectedChannel::Ready(ready) => { - if let Some(o) = self.outlier.as_ref() { - o.registry().note_uneject(ready.outlier()); - } + registry.note_uneject(ready.outlier()); tracing::debug!("outlier detection: uneject {addr}"); self.ready.insert(addr, ready); } // `needs_reconnect = false` for A50; this arm is // reserved for future policies. UnejectedChannel::Connecting(connecting) => { - if let Some(o) = self.outlier.as_ref() { - let state = o.registry().add_channel(addr.clone()); - o.registry().note_uneject(&state); - } + let state = registry.add_channel(addr.clone()); + registry.note_uneject(&state); let _ = self.connecting.add(addr, connecting); } } @@ -358,15 +347,13 @@ where // can take ownership without holding the picker borrow. 
let mut svc = picked.clone(); let outlier_state = picked.outlier().clone(); - let registry = self.outlier.as_ref().map(|o| o.registry().clone()); + let registry = self.outlier.registry().clone(); LbFuture::Pending(Box::pin(async move { tower::ServiceExt::ready(&mut svc) .await .map_err(|e| LbError::LbChannelPollReadyError(e.into()))?; let result = svc.call(req).await; - if let Some(registry) = registry.as_ref() { - registry.record_outcome(&outlier_state, result.is_ok()); - } + registry.record_outcome(&outlier_state, result.is_ok()); result.map_err(|e| LbError::LbChannelCallError(e.into())) })) } @@ -519,7 +506,8 @@ mod tests { let connector = Arc::new(MockConnector::new()); let picker: Arc, &'static str> + Send + Sync> = Arc::new(P2cPicker); - let lb = LoadBalancer::new(discover, connector.clone(), picker); + let config = Arc::new(ArcSwap::from_pointee(OutlierDetectionConfig::default())); + let lb = LoadBalancer::new(discover, connector.clone(), picker, config); (lb, connector) } @@ -875,7 +863,9 @@ mod tests { } } - /// Build an LB with outlier detection enabled. + /// Build an LB with outlier detection enabled. The returned + /// registry is the same `Arc` the LB owns; tests use it to + /// inspect ejected_count and the like. 
fn make_lb_with_outlier( discover: MockDiscover, config: OutlierDetectionConfig, @@ -883,10 +873,9 @@ mod tests { let connector = Arc::new(MockConnector::new()); let picker: Arc, &'static str> + Send + Sync> = Arc::new(P2cPicker); - let registry = OutlierStatsRegistry::new(config); - let lb = - LoadBalancer::with_outlier(discover, connector.clone(), picker, Some(registry.clone())) - .expect("registry not yet wired"); + let config = Arc::new(ArcSwap::from_pointee(config)); + let lb = LoadBalancer::new(discover, connector.clone(), picker, config); + let registry = lb.outlier.registry().clone(); (lb, connector, registry) } @@ -1145,32 +1134,4 @@ mod tests { ); assert!(!registry.add_channel(addr(8084)).is_ejected()); } - - /// Sharing one `OutlierStatsRegistry` across two `LoadBalancer`s is - /// not supported — the eject-signal receiver is one-shot. The - /// second `with_outlier` call must return an error rather than - /// panic. - #[tokio::test] - async fn test_outlier_registry_cannot_be_wired_twice() { - let (_tx1, discover1) = new_discover(); - let (_tx2, discover2) = new_discover(); - let connector = Arc::new(MockConnector::new()); - let picker: Arc, &'static str> + Send + Sync> = - Arc::new(P2cPicker); - let registry = OutlierStatsRegistry::new(fp_config(50, 5, 3)); - - // First wiring succeeds. - LoadBalancer::with_outlier( - discover1, - connector.clone(), - picker.clone(), - Some(registry.clone()), - ) - .expect("first wire"); - - // Second wiring of the same registry must error, not panic. - let result = - LoadBalancer::with_outlier(discover2, connector, picker, Some(registry.clone())); - assert!(result.is_err()); - } } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 8cff6dce9..2e0e4c805 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -28,11 +28,11 @@ //! 
[`UnejectedChannel`]: crate::client::loadbalance::channel_state::UnejectedChannel use std::sync::Arc; -use std::sync::Mutex; use std::sync::atomic::{AtomicU64, Ordering}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; +use arc_swap::ArcSwap; use dashmap::DashMap; use tokio::sync::mpsc; @@ -41,12 +41,6 @@ use crate::client::loadbalance::channel_state::OutlierChannelState; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::OutlierDetectionConfig; -/// Returned when an [`OutlierStatsRegistry`] is handed to a second -/// load balancer. The eject-signal receiver is one-shot. -#[derive(Debug, thiserror::Error)] -#[error("OutlierStatsRegistry is already wired to a LoadBalancer")] -pub(crate) struct RegistryAlreadyWired; - /// Shared outlier-detection state, owned by `Arc` and accessed /// concurrently by the data path ([`Self::record_outcome`]), the /// housekeeping actor ([`Self::run_housekeeping`]), and the load @@ -59,38 +53,31 @@ pub(crate) struct OutlierStatsRegistry { /// Channels currently ejected. Drives the /// `max_ejection_percent` cap. ejected_count: AtomicU64, - config: OutlierDetectionConfig, - /// Sender half of the eject signal. The receiver is owned by the - /// LB's [`OutlierDetector`]. + /// Shared config, hot-swappable. Readers `.load()` per call; + /// future xDS integration `.store()`s new configs on cluster + /// updates. `interval` changes also require an actor restart — + /// see [`spawn_actor`]. + config: Arc>, + /// Sender half of the eject signal. The receiver is paired + /// off and handed to the LB at construction (see [`Self::new`]). eject_tx: mpsc::UnboundedSender, - /// Receiver moved out exactly once by [`Self::take_eject_rx`]. - eject_rx: Mutex>>, } impl OutlierStatsRegistry { - pub(crate) fn new(config: OutlierDetectionConfig) -> Arc { + /// Construct the registry and the paired eject-signal receiver. + /// The LB owns the receiver; the registry owns the sender. 
+ pub(crate) fn new( + config: Arc>, + ) -> (Arc, mpsc::UnboundedReceiver) { let (eject_tx, eject_rx) = mpsc::unbounded_channel(); - Arc::new(Self { + let registry = Arc::new(Self { channels: DashMap::new(), qualifying_count: AtomicU64::new(0), ejected_count: AtomicU64::new(0), config, eject_tx, - eject_rx: Mutex::new(Some(eject_rx)), - }) - } - - /// Take the eject-signal receiver. Returns - /// [`RegistryAlreadyWired`] on a second call — a registry can - /// drive at most one load balancer. - fn take_eject_rx( - &self, - ) -> Result, RegistryAlreadyWired> { - self.eject_rx - .lock() - .expect("eject_rx mutex poisoned") - .take() - .ok_or(RegistryAlreadyWired) + }); + (registry, eject_rx) } /// Get or create the state for `addr`. Idempotent — existing @@ -130,7 +117,8 @@ impl OutlierStatsRegistry { state.record_failure(); } - let Some(fp) = self.config.failure_percentage.as_ref() else { + let config = self.config.load(); + let Some(fp) = config.failure_percentage.as_ref() else { return; }; @@ -153,7 +141,7 @@ impl OutlierStatsRegistry { if self.qualifying_count.load(Ordering::Relaxed) < u64::from(fp.minimum_hosts) { return; } - if self.ejected_count.load(Ordering::Relaxed) >= self.max_ejections() { + if self.ejected_count.load(Ordering::Relaxed) >= self.max_ejections(&config) { return; } @@ -199,12 +187,9 @@ impl OutlierStatsRegistry { ) -> Option { let elapsed = state.ejected_duration(now)?; let multiplier = state.ejection_multiplier(); - let cap = self - .config - .base_ejection_time - .max(self.config.max_ejection_time); - let target = self - .config + let config = self.config.load(); + let cap = config.base_ejection_time.max(config.max_ejection_time); + let target = config .base_ejection_time .checked_mul(multiplier) .unwrap_or(cap) @@ -229,16 +214,20 @@ impl OutlierStatsRegistry { } /// Resolve `max_ejection_percent` against the current channel count. 
- fn max_ejections(&self) -> u64 { - self.channels.len() as u64 * u64::from(self.config.max_ejection_percent.get()) / 100 + fn max_ejections(&self, config: &OutlierDetectionConfig) -> u64 { + self.channels.len() as u64 * u64::from(config.max_ejection_percent.get()) / 100 } } /// Spawn the housekeeping actor. Ticks every `config.interval` and /// calls [`OutlierStatsRegistry::run_housekeeping`]. Dropping the /// returned [`AbortOnDrop`] stops the task. +/// +/// The `interval` is captured at spawn time; live updates require an +/// actor restart, which the xDS-integration layer will own. Other +/// config fields are re-read from the ArcSwap on each tick. pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { - let interval = registry.config.interval; + let interval = registry.config.load().interval; let task = tokio::spawn(async move { let mut ticker = tokio::time::interval(interval); ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); @@ -251,27 +240,36 @@ pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { } /// Per-LB outlier-detection plumbing: shared registry, eject-signal -/// receiver, and the housekeeping actor handle (aborted on drop). The -/// LB holds this as `Option`. +/// receiver, and (when enabled) the housekeeping actor handle +/// (aborted on drop). The LB always owns one of these; the actor is +/// conditional on the config being enabled at construction. pub(crate) struct OutlierDetector { registry: Arc, eject_rx: mpsc::UnboundedReceiver, - _actor: AbortOnDrop, + /// `None` while config is disabled — `record_outcome` short- + /// circuits and the data path never sends through `eject_tx`, so + /// nothing reads or writes outlier state. + _actor: Option, } impl OutlierDetector { - /// Take ownership of the registry's eject-signal receiver and - /// spawn the housekeeping actor. Returns - /// [`RegistryAlreadyWired`] if the registry is already wired to - /// another LB. 
- pub(crate) fn new(registry: Arc) -> Result { - let eject_rx = registry.take_eject_rx()?; - let _actor = spawn_actor(registry.clone()); - Ok(Self { + /// Pair the registry with the eject-signal receiver and (if the + /// config currently has an algorithm enabled) spawn the + /// housekeeping actor. + pub(crate) fn new( + registry: Arc, + eject_rx: mpsc::UnboundedReceiver, + ) -> Self { + let _actor = registry + .config + .load() + .is_enabled() + .then(|| spawn_actor(registry.clone())); + Self { registry, eject_rx, _actor, - }) + } } /// Shared registry handle. @@ -316,6 +314,22 @@ mod tests { Percentage::new(v).unwrap() } + /// Build a registry whose config will never be swapped — these + /// tests exercise algorithm correctness, not config live-update. + fn make_registry( + config: OutlierDetectionConfig, + ) -> ( + Arc, + mpsc::UnboundedReceiver, + ) { + OutlierStatsRegistry::new(Arc::new(ArcSwap::from_pointee(config))) + } + + /// Convenience wrapper for tests that don't observe ejections. + fn make_registry_only(config: OutlierDetectionConfig) -> Arc { + make_registry(config).0 + } + fn base_config() -> OutlierDetectionConfig { OutlierDetectionConfig { interval: Duration::from_secs(1), @@ -361,7 +375,7 @@ mod tests { #[test] fn ejects_above_threshold_inline() { - let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); + let registry = make_registry_only(fp_config(50, 10, 3)); let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); @@ -374,7 +388,7 @@ mod tests { #[test] fn skips_below_threshold() { - let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); + let registry = make_registry_only(fp_config(50, 10, 3)); let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); @@ -390,7 +404,7 @@ mod tests { #[test] fn at_threshold_does_not_eject() { // A50 specifies a strict "greater than" comparison. 
- let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); + let registry = make_registry_only(fp_config(50, 10, 3)); let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); @@ -404,7 +418,7 @@ mod tests { #[test] fn minimum_hosts_gates_ejection() { - let registry = OutlierStatsRegistry::new(fp_config(50, 10, 5)); + let registry = make_registry_only(fp_config(50, 10, 5)); // Only 2 hosts have request_volume ≥ 10; minimum_hosts is 5 ⇒ skip. let mut all = vec![]; for port in 8080..=8081 { @@ -419,7 +433,7 @@ mod tests { #[test] fn request_volume_filters_low_traffic() { - let registry = OutlierStatsRegistry::new(fp_config(50, 100, 3)); + let registry = make_registry_only(fp_config(50, 100, 3)); let bad = registry.add_channel(addr(8080)); drive(&registry, &bad, 0, 5); for port in 8081..=8084 { @@ -437,7 +451,7 @@ mod tests { .as_mut() .unwrap() .enforcing_failure_percentage = pct(0); - let registry = OutlierStatsRegistry::new(config); + let registry = make_registry_only(config); let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); @@ -453,7 +467,7 @@ mod tests { fn max_ejection_percent_caps_concurrent_ejections() { let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(20); - let registry = OutlierStatsRegistry::new(config); + let registry = make_registry_only(config); let mut all = vec![]; for port in 8080..=8084 { @@ -472,7 +486,7 @@ mod tests { #[test] fn remove_channel_decrements_counters() { - let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); + let registry = make_registry_only(fp_config(50, 10, 3)); let mut all = vec![]; for port in 8080..=8083 { let s = registry.add_channel(addr(port)); @@ -494,8 +508,7 @@ mod tests { #[test] fn ejection_dispatches_address_through_mpsc() { - let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); - let mut rx = registry.take_eject_rx().expect("receiver available"); + let (registry, mut rx) = 
make_registry(fp_config(50, 10, 3)); let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); @@ -515,7 +528,7 @@ mod tests { #[test] fn housekeeping_resets_counters_and_qualifying() { - let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); + let registry = make_registry_only(fp_config(50, 10, 3)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); drive(&registry, &s, 100, 0); @@ -532,7 +545,7 @@ mod tests { #[test] fn housekeeping_decrements_multiplier_on_healthy_interval() { - let registry = OutlierStatsRegistry::new(base_config()); + let registry = make_registry_only(base_config()); let s = registry.add_channel(addr(8080)); // Force multiplier to 3 directly (no traffic, no eject). s.set_ejection_multiplier(3); @@ -543,7 +556,7 @@ mod tests { #[test] fn housekeeping_leaves_ejected_multipliers_alone() { - let registry = OutlierStatsRegistry::new(base_config()); + let registry = make_registry_only(base_config()); let s = registry.add_channel(addr(8080)); s.try_eject(Instant::now()); s.set_ejection_multiplier(3); @@ -562,7 +575,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); - let registry = OutlierStatsRegistry::new(config); + let registry = make_registry_only(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -576,7 +589,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(15); - let registry = OutlierStatsRegistry::new(config); + let registry = make_registry_only(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -590,7 +603,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(30); config.max_ejection_time = Duration::from_secs(60); - 
let registry = OutlierStatsRegistry::new(config); + let registry = make_registry_only(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -606,7 +619,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); - let registry = OutlierStatsRegistry::new(config); + let registry = make_registry_only(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -619,14 +632,14 @@ mod tests { #[test] fn remaining_ejection_none_when_not_ejected() { - let registry = OutlierStatsRegistry::new(base_config()); + let registry = make_registry_only(base_config()); let s = registry.add_channel(addr(8080)); assert!(registry.remaining_ejection(&s, Instant::now()).is_none()); } #[test] fn note_uneject_clears_state_and_decrements_counter() { - let registry = OutlierStatsRegistry::new(base_config()); + let registry = make_registry_only(base_config()); let s = registry.add_channel(addr(8080)); s.try_eject(Instant::now()); // bumps multiplier 0 → 1 registry.ejected_count.fetch_add(1, Ordering::Relaxed); @@ -653,7 +666,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(300); - let registry = OutlierStatsRegistry::new(config); + let registry = make_registry_only(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); @@ -687,7 +700,7 @@ mod tests { async fn dropping_abort_stops_actor() { let mut config = base_config(); config.interval = Duration::from_millis(50); - let registry = OutlierStatsRegistry::new(config); + let registry = make_registry_only(config); let s = registry.add_channel(addr(8080)); s.set_ejection_multiplier(5); diff --git a/tonic-xds/src/xds/resource/outlier_detection.rs b/tonic-xds/src/xds/resource/outlier_detection.rs index 970232bea..5cd2ffbd5 100644 --- 
a/tonic-xds/src/xds/resource/outlier_detection.rs +++ b/tonic-xds/src/xds/resource/outlier_detection.rs @@ -93,6 +93,23 @@ impl OutlierDetectionConfig { } } +impl Default for OutlierDetectionConfig { + /// Disabled by default: both ejection algorithms are `None`, so + /// [`Self::is_enabled`] returns `false`. The remaining fields use + /// A50's documented defaults — they're inert until an algorithm + /// is enabled via config update. + fn default() -> Self { + Self { + interval: Duration::from_secs(10), + base_ejection_time: Duration::from_secs(30), + max_ejection_time: Duration::from_secs(300), + max_ejection_percent: Percentage(10), + success_rate: None, + failure_percentage: None, + } + } +} + #[cfg(test)] mod tests { use super::*; From 00652c49752680c639471f4d623dd4e724adbae3 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Wed, 27 May 2026 13:22:50 -0700 Subject: [PATCH 36/39] refactor(tonic-xds): bake outlier wrap into ConnectingChannel's inner future ConnectingChannel used to hold (addr, registry, BoxFuture) and do the registry.add_channel + ReadyChannel::new wrap inside Future::poll. Move the wrap into an async block built at construction time, so inner is BoxFuture> and poll is a plain forwarder. The struct becomes a one-field newtype and the wrap is no longer duplicated between the struct's poll and any other site that might want to construct a ReadyChannel. Costs one extra heap allocation per connect (the connector's BoxFuture becomes a captured field of the async block, which is itself boxed). Connects are not hot-path; the type-level clarity ("ConnectingChannel is just a future that yields a ReadyChannel") wins. 
--- .../src/client/loadbalance/channel_state.rs | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 5f57ca2b0..3b6147a8a 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -272,16 +272,15 @@ impl IdleChannel { /// A channel that is in the process of connecting. /// -/// `impl Future>` — resolves to a fully-formed -/// `ReadyChannel` whose outlier state is looked up from `registry` -/// via `add_channel` at resolve time. Cancellation is handled -/// externally via [`KeyedFutures::cancel`]. +/// `impl Future>` — the connector's +/// service-future is wrapped at construction time into an async +/// block that looks up the per-channel outlier state from `registry` +/// (via `add_channel`) and produces a fully-formed `ReadyChannel`. +/// Cancellation is handled externally via [`KeyedFutures::cancel`]. 
/// /// [`KeyedFutures::cancel`]: crate::client::loadbalance::keyed_futures::KeyedFutures::cancel pub(crate) struct ConnectingChannel { - addr: EndpointAddress, - registry: Arc, - inner: Pin + Send>>, + inner: Pin> + Send>>, } impl ConnectingChannel { @@ -291,9 +290,11 @@ impl ConnectingChannel { registry: Arc, ) -> Self { Self { - addr, - registry, - inner: fut, + inner: Box::pin(async move { + let svc = fut.await; + let outlier = registry.add_channel(addr.clone()); + ReadyChannel::new(addr, svc, outlier) + }), } } } @@ -302,10 +303,7 @@ impl Future for ConnectingChannel { type Output = ReadyChannel; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - let this = self.get_mut(); - let svc = std::task::ready!(this.inner.as_mut().poll(cx)); - let outlier = this.registry.add_channel(this.addr.clone()); - Poll::Ready(ReadyChannel::new(this.addr.clone(), svc, outlier)) + self.get_mut().inner.as_mut().poll(cx) } } From ec228c3694b345cb3a9bcfa112a6d3e006461262 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Wed, 27 May 2026 13:42:14 -0700 Subject: [PATCH 37/39] refactor(tonic-xds): inline purge_endpoint/reset_active_slots into poll_discover The original loadbalancer.rs had Insert/Remove match arms inlined. The helpers were extracted only when outlier detection added one or two lines per arm, but the inline pattern was already established and the additions don't grow the arms past the readable threshold. Inline restores the original style, makes the outlier-specific lines visible in-place, and leaves the A50 "preserve registry entry across re-insert" invariant as a one-line comment on the Insert arm where it applies. 
--- .../src/client/loadbalance/loadbalancer.rs | 32 +++++++------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index bdd013a53..9f65c03de 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -132,25 +132,6 @@ where self.outlier.registry().clone() } - /// Purge all state for `addr`, including the outlier-detection - /// registry entry. Called on `Change::Remove`. - fn purge_endpoint(&mut self, addr: &EndpointAddress) { - let _ = self.connecting.cancel(addr); - self.ready.swap_remove(addr); - let _ = self.ejected.cancel(addr); - self.outlier.registry().remove_channel(addr); - } - - /// Clear stale connecting/ready/ejected slots for `addr` but - /// preserve the outlier-detection registry entry. Called on - /// `Change::Insert` so transient discovery flaps don't lose - /// counters or ejection state, matching grpc-go and Envoy. - fn reset_active_slots(&mut self, addr: &EndpointAddress) { - let _ = self.connecting.cancel(addr); - self.ready.swap_remove(addr); - let _ = self.ejected.cancel(addr); - } - /// Drain pending discovery events. Resolves to an error /// ([`LbError::DiscoverClosed`] or [`LbError::DiscoverError`]) /// or stays pending — there is no success outcome. @@ -164,13 +145,22 @@ where Some(Err(e)) => return Poll::Ready(LbError::DiscoverError(e.into())), Some(Ok(Change::Insert(addr, idle))) => { tracing::trace!("discovery: insert {addr}"); - self.reset_active_slots(&addr); + let _ = self.connecting.cancel(&addr); + self.ready.swap_remove(&addr); + let _ = self.ejected.cancel(&addr); + // Note: the outlier-detection registry entry is + // intentionally preserved across re-insert so a + // transient discovery flap keeps its counters and + // ejection state (matching grpc-go and Envoy). 
let connecting = idle.connect(self.connector.clone(), self.registry()); let _ = self.connecting.add(addr, connecting); } Some(Ok(Change::Remove(addr))) => { tracing::trace!("discovery: remove {addr}"); - self.purge_endpoint(&addr); + let _ = self.connecting.cancel(&addr); + self.ready.swap_remove(&addr); + let _ = self.ejected.cancel(&addr); + self.outlier.registry().remove_channel(&addr); } } } From 1c6d119e7de191edd72fb7c5098d9b4832b00305 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Wed, 27 May 2026 16:12:17 -0700 Subject: [PATCH 38/39] fix(tonic-xds): run failure-percentage at sweep, not per-RPC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A50 §6 specifies the ejection algorithm runs at each timer-driven sweep over a snapshot of one interval's counters. The previous record_outcome evaluated the gates and dispatched eject signals inline on every RPC completion — a host could be ejected mid-interval on a burst (e.g. 50 failures in the first 100 ms of a 10 s interval), and the population gates (minimum_hosts, max_ejection_percent) were evaluated against a continuously-moving target rather than the snapshot population. Strip record_outcome to counter-increment only and move the entire failure-percentage algorithm into run_housekeeping, where it runs once per interval against a captured snapshot. Drop the running qualifying_count machinery (mark_qualifying / clear_qualifying / is_qualifying / the AtomicU64 field) — the sweep computes qualifying count from the snapshot, so the inter-RPC bookkeeping is no longer needed. Tests that previously asserted ejection after drive() now call run_housekeeping() to trigger the sweep, matching the new semantics. 
--- .../src/client/loadbalance/channel_state.rs | 19 +- .../src/client/loadbalance/loadbalancer.rs | 7 + .../client/loadbalance/outlier_detection.rs | 169 +++++++++--------- 3 files changed, 97 insertions(+), 98 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 3b6147a8a..396f6ac54 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -26,7 +26,7 @@ use std::future::Future; use std::pin::Pin; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; @@ -82,10 +82,6 @@ impl EndpointCounters { pub(crate) struct OutlierChannelState { addr: EndpointAddress, counters: EndpointCounters, - /// `true` while this channel is counted in the registry's - /// `qualifying_count` (i.e. has hit `request_volume` in the - /// current interval). - is_qualifying: AtomicBool, /// Bumped on each ejection; decremented (saturating) on each /// healthy interval. ejection_multiplier: AtomicU32, @@ -101,7 +97,6 @@ impl OutlierChannelState { Self { addr, counters: EndpointCounters::default(), - is_qualifying: AtomicBool::new(false), ejection_multiplier: AtomicU32::new(0), ejected_at_nanos: AtomicU64::new(0), epoch: Instant::now(), @@ -134,18 +129,6 @@ impl OutlierChannelState { self.counters.snapshot_and_reset() } - /// Set `is_qualifying` to `true`. Returns `true` if this call - /// performed the false → true transition (so the caller can bump - /// the registry counter exactly once per crossing). - pub(crate) fn mark_qualifying(&self) -> bool { - !self.is_qualifying.swap(true, Ordering::AcqRel) - } - - /// Clear `is_qualifying`. Returns the previous value. 
- pub(crate) fn clear_qualifying(&self) -> bool { - self.is_qualifying.swap(false, Ordering::AcqRel) - } - /// Atomically mark this channel as ejected starting at `now`. /// Returns `true` on the not-ejected → ejected transition and /// bumps the multiplier; `false` if already ejected. diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 9f65c03de..09c238bf9 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -910,6 +910,11 @@ mod tests { let _ = lb.call("hello").await; } + // Per A50 the sweep decides ejection; drive it synchronously. + // (The spawned actor would also fire it on tick; calling it + // here avoids depending on actor timing under #[tokio::test].) + registry.run_housekeeping(); + // poll_ready drains the eject mpsc and transitions 8084 into // `self.ejected` via `ReadyChannel::eject`. let _ = poll_ready_now(&mut lb); @@ -1030,6 +1035,7 @@ mod tests { for _ in 0..100 { let _ = lb.call("hello").await; } + registry.run_housekeeping(); let _ = poll_ready_now(&mut lb); let state_8084 = registry.add_channel(addr(8084)); assert!( @@ -1096,6 +1102,7 @@ mod tests { for _ in 0..100 { let _ = lb.call("hello").await; } + registry.run_housekeeping(); let _ = poll_ready_now(&mut lb); assert!( lb.ejected.contains_key(&addr(8084)), diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 2e0e4c805..03c5f48ab 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -47,11 +47,10 @@ use crate::xds::resource::outlier_detection::OutlierDetectionConfig; /// balancer ([`Self::note_uneject`], [`Self::remaining_ejection`]). pub(crate) struct OutlierStatsRegistry { channels: DashMap>, - /// Channels with `total >= request_volume` in the active - /// interval. Drives the `minimum_hosts` gate. 
- qualifying_count: AtomicU64, /// Channels currently ejected. Drives the - /// `max_ejection_percent` cap. + /// `max_ejection_percent` cap. Bumped by the sweep on each + /// ejection; decremented by [`Self::note_uneject`] and + /// [`Self::remove_channel`]. ejected_count: AtomicU64, /// Shared config, hot-swappable. Readers `.load()` per call; /// future xDS integration `.store()`s new configs on cluster @@ -72,7 +71,6 @@ impl OutlierStatsRegistry { let (eject_tx, eject_rx) = mpsc::unbounded_channel(); let registry = Arc::new(Self { channels: DashMap::new(), - qualifying_count: AtomicU64::new(0), ejected_count: AtomicU64::new(0), config, eject_tx, @@ -89,16 +87,13 @@ impl OutlierStatsRegistry { .clone() } - /// Drop the state for `addr`, decrementing cluster-wide counters - /// (`qualifying_count`, `ejected_count`) if it was contributing. + /// Drop the state for `addr`, decrementing `ejected_count` if + /// the removed channel was contributing to it. pub(crate) fn remove_channel(&self, addr: &EndpointAddress) { - if let Some((_, state)) = self.channels.remove(addr) { - if state.clear_qualifying() { - self.qualifying_count.fetch_sub(1, Ordering::Relaxed); - } - if state.is_ejected() { - self.ejected_count.fetch_sub(1, Ordering::Relaxed); - } + if let Some((_, state)) = self.channels.remove(addr) + && state.is_ejected() + { + self.ejected_count.fetch_sub(1, Ordering::Relaxed); } } @@ -107,59 +102,15 @@ impl OutlierStatsRegistry { self.channels.len() } - /// Per-RPC entry point. Records the outcome and, if all gates - /// pass, transitions the channel to ejected and dispatches the - /// address on the eject mpsc. + /// Per-RPC entry point. Records the outcome on the channel's + /// counter. Ejection decisions are deferred to the next sweep + /// (gRFC A50 §6) — see [`Self::run_housekeeping`]. 
pub(crate) fn record_outcome(&self, state: &OutlierChannelState, success: bool) { if success { state.record_success(); } else { state.record_failure(); } - - let config = self.config.load(); - let Some(fp) = config.failure_percentage.as_ref() else { - return; - }; - - let (s, f) = state.counters(); - let total = s + f; - let request_volume = u64::from(fp.request_volume); - - // Bump `qualifying_count` exactly once per channel per - // interval so the `minimum_hosts` gate is a single atomic load. - if total >= request_volume && state.mark_qualifying() { - self.qualifying_count.fetch_add(1, Ordering::Relaxed); - } - - if state.is_ejected() { - return; - } - if total < request_volume { - return; - } - if self.qualifying_count.load(Ordering::Relaxed) < u64::from(fp.minimum_hosts) { - return; - } - if self.ejected_count.load(Ordering::Relaxed) >= self.max_ejections(&config) { - return; - } - - // failure_pct = 100 * failure / total. A50 uses strict ">". - let failure_pct = 100 * f / total; - if failure_pct <= u64::from(fp.threshold.get()) { - return; - } - if !roll(fp.enforcing_failure_percentage.get()) { - return; - } - - if state.try_eject(Instant::now()) { - self.ejected_count.fetch_add(1, Ordering::Relaxed); - // Send failure (LB receiver dropped during shutdown) is - // ignored; the registry will be torn down momentarily. - let _ = self.eject_tx.send(state.addr().clone()); - } } /// Clear the ejection: flip the state, decrement @@ -197,16 +148,69 @@ impl OutlierStatsRegistry { Some(target.checked_sub(elapsed).unwrap_or_default()) } - /// Interval-boundary housekeeping. Resets counters and - /// decrements multipliers for non-ejected channels. Does not - /// un-eject — that is driven by each `EjectedChannel`'s timer. + /// One interval-boundary sweep (gRFC A50 §6). Order matters: + /// + /// 1. Snapshot every channel's counters for one consistent pass. + /// 2. 
Run the failure-percentage algorithm against the snapshot: + /// apply `minimum_hosts` to the qualifying population, then + /// `max_ejection_percent`, then per-channel threshold and the + /// enforcement roll. Dispatch eject signals through the mpsc. + /// 3. Reset counters and decrement multipliers for non-ejected + /// channels. + /// + /// Un-ejection is *not* driven from here — each `EjectedChannel` + /// owns its own `Sleep` timer. pub(crate) fn run_housekeeping(&self) { - for entry in self.channels.iter() { - let state = entry.value(); - state.snapshot_and_reset(); - if state.clear_qualifying() { - self.qualifying_count.fetch_sub(1, Ordering::Relaxed); + let config = self.config.load(); + let snapshots: Vec<(Arc, u64, u64)> = self + .channels + .iter() + .map(|e| { + let state = e.value().clone(); + let (s, f) = state.counters(); + (state, s, f) + }) + .collect(); + + if let Some(fp) = config.failure_percentage.as_ref() { + let request_volume = u64::from(fp.request_volume); + let qualifying = snapshots + .iter() + .filter(|(_, s, f)| s + f >= request_volume) + .count() as u64; + if qualifying >= u64::from(fp.minimum_hosts) { + let max_ejections = self.max_ejections(&config); + let now = Instant::now(); + let threshold = u64::from(fp.threshold.get()); + let enforcing = fp.enforcing_failure_percentage.get(); + for (state, s, f) in &snapshots { + let total = s + f; + if total < request_volume || state.is_ejected() { + continue; + } + if self.ejected_count.load(Ordering::Relaxed) >= max_ejections { + break; + } + // failure_pct = 100 * failure / total. A50 uses strict ">". + let failure_pct = 100 * f / total; + if failure_pct <= threshold { + continue; + } + if !roll(enforcing) { + continue; + } + if state.try_eject(now) { + self.ejected_count.fetch_add(1, Ordering::Relaxed); + // Send failure (LB receiver dropped during + // shutdown) is ignored. 
+ let _ = self.eject_tx.send(state.addr().clone()); + } + } } + } + + for (state, _, _) in &snapshots { + state.snapshot_and_reset(); if !state.is_ejected() { state.decrement_multiplier(); } @@ -371,10 +375,10 @@ mod tests { } } - // ----- record_outcome: failure-percentage detection ----- + // ----- run_housekeeping: failure-percentage detection ----- #[test] - fn ejects_above_threshold_inline() { + fn ejects_above_threshold_at_sweep() { let registry = make_registry_only(fp_config(50, 10, 3)); let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { @@ -382,6 +386,9 @@ mod tests { drive(&registry, &s, 100, 0); } drive(&registry, &bad, 10, 90); + // Per A50 the algorithm runs at the interval sweep, not per RPC. + assert!(!bad.is_ejected()); + registry.run_housekeeping(); assert!(bad.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); } @@ -396,6 +403,7 @@ mod tests { drive(&registry, &s, 70, 30); all.push(s); } + registry.run_housekeeping(); for s in &all { assert!(!s.is_ejected()); } @@ -411,6 +419,7 @@ mod tests { drive(&registry, &s, 50, 50); all.push(s); } + registry.run_housekeeping(); for s in &all { assert!(!s.is_ejected()); } @@ -426,6 +435,7 @@ mod tests { drive(&registry, &s, 0, 100); all.push(s); } + registry.run_housekeeping(); for s in &all { assert!(!s.is_ejected()); } @@ -440,6 +450,7 @@ mod tests { let s = registry.add_channel(addr(port)); drive(&registry, &s, 200, 0); } + registry.run_housekeeping(); assert!(!bad.is_ejected()); } @@ -458,6 +469,7 @@ mod tests { drive(&registry, &s, 0, 100); all.push(s); } + registry.run_housekeeping(); for s in &all { assert!(!s.is_ejected()); } @@ -474,10 +486,11 @@ mod tests { let s = registry.add_channel(addr(port)); all.push(s); } - // Drive all hosts to bad state in parallel pseudo-order. + // Drive all hosts to bad state. for s in &all { drive(&registry, s, 0, 100); } + registry.run_housekeeping(); let ejected = all.iter().filter(|s| s.is_ejected()).count(); // 5 hosts × 20% = 1 max ejection. 
@@ -485,7 +498,7 @@ mod tests { } #[test] - fn remove_channel_decrements_counters() { + fn remove_channel_decrements_ejected_count() { let registry = make_registry_only(fp_config(50, 10, 3)); let mut all = vec![]; for port in 8080..=8083 { @@ -495,15 +508,12 @@ mod tests { } let bad = registry.add_channel(addr(8084)); drive(&registry, &bad, 0, 100); + registry.run_housekeeping(); assert!(bad.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); - // Each healthy host crossed request_volume; bad too. So - // qualifying_count = 5. - assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 5); registry.remove_channel(&addr(8084)); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 0); - assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); } #[test] @@ -515,6 +525,7 @@ mod tests { drive(&registry, &s, 100, 0); } drive(&registry, &bad, 10, 90); + registry.run_housekeeping(); // Eject dispatched exactly once via the mpsc. assert_eq!(rx.try_recv(), Ok(addr(8084))); @@ -527,16 +538,14 @@ mod tests { // ----- Housekeeping ----- #[test] - fn housekeeping_resets_counters_and_qualifying() { + fn housekeeping_resets_counters() { let registry = make_registry_only(fp_config(50, 10, 3)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); drive(&registry, &s, 100, 0); } - assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); registry.run_housekeeping(); - assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 0); for port in 8080..=8083 { let s = registry.channels.get(&addr(port)).unwrap(); assert_eq!(s.counters(), (0, 0)); From c89158cdbbc13c6dad41690047710b608c040c8e Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Wed, 27 May 2026 16:27:55 -0700 Subject: [PATCH 39/39] fix(tonic-xds): floor max_ejections at 1 for non-empty pools MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A50's max_ejection_percent prose: "Defaults to 10% 
but will eject at least one address regardless of the value." Without that floor the common case of 5 endpoints × default 10% rounds to 0, and the cap check (ejected_count >= 0) bails before the first ejection — silently disabling outlier detection. Floor max_ejections at 1 when the pool is non-empty; empty pools still yield 0. Adds a regression test on the rounding-down boundary. The existing 20% × 5-host test sat exactly at the floor and didn't exercise it. --- .../client/loadbalance/outlier_detection.rs | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 03c5f48ab..c4e6a94da 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -217,9 +217,15 @@ impl OutlierStatsRegistry { } } - /// Resolve `max_ejection_percent` against the current channel count. + /// Resolve `max_ejection_percent` against the current channel + /// count. A50 mandates "at least one address regardless of the + /// value" — without this floor the default 10% × small clusters + /// (e.g. 5 endpoints) rounds to zero and silently disables + /// ejection. An empty pool genuinely has nothing to eject. fn max_ejections(&self, config: &OutlierDetectionConfig) -> u64 { - self.channels.len() as u64 * u64::from(config.max_ejection_percent.get()) / 100 + let len = self.channels.len() as u64; + let cap = len * u64::from(config.max_ejection_percent.get()) / 100; + if len > 0 { cap.max(1) } else { 0 } } } @@ -497,6 +503,29 @@ mod tests { assert_eq!(ejected, 1); } + /// A50 §"max_ejection_percent": at least one address may be + /// ejected regardless of the percentage. 5 hosts × 10% = 0 + /// arithmetically; the floor still allows 1. 
+ #[test] + fn max_ejection_percent_permits_at_least_one_ejection() { + let mut config = fp_config(50, 10, 3); + config.max_ejection_percent = pct(10); + let registry = make_registry_only(config); + + let mut all = vec![]; + for port in 8080..=8084 { + let s = registry.add_channel(addr(port)); + all.push(s); + } + for s in &all { + drive(&registry, s, 0, 100); + } + registry.run_housekeeping(); + + let ejected = all.iter().filter(|s| s.is_ejected()).count(); + assert_eq!(ejected, 1); + } + #[test] fn remove_channel_decrements_ejected_count() { let registry = make_registry_only(fp_config(50, 10, 3));