diff --git a/Cargo.lock b/Cargo.lock index a7acdb0ee09..109fdc9c65f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6382,6 +6382,8 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber 0.3.23", + "url", + "vec1", "versions", ] diff --git a/crates/builder/src/lib.rs b/crates/builder/src/lib.rs index 1c5c4321699..6a56c54fbbb 100755 --- a/crates/builder/src/lib.rs +++ b/crates/builder/src/lib.rs @@ -141,6 +141,10 @@ pub mod testing { start_voting_time: 0, stop_proposing_time: 0, stop_voting_time: 0, + upgrade_propose_offset: None, + upgrade_decide_by_offset: None, + upgrade_begin_offset: None, + upgrade_finish_offset: None, epoch_height: 0, epoch_start_block: 0, stake_table_capacity: hotshot_types::light_client::DEFAULT_STAKE_TABLE_CAPACITY, diff --git a/crates/espresso/node/src/consensus_handle.rs b/crates/espresso/node/src/consensus_handle.rs index 7935e937f1e..df017814271 100644 --- a/crates/espresso/node/src/consensus_handle.rs +++ b/crates/espresso/node/src/consensus_handle.rs @@ -15,6 +15,10 @@ use hotshot_new_protocol::{ client::ClientApi, consensus::ConsensusOutput, coordinator::{Coordinator, CoordinatorOutput, error::Severity}, + harvest::{ + LegacyPreCutoverSeed, forward_legacy_timeout_votes, harvest_legacy_pre_cutover_seed, + try_perform_handover, + }, network::Network, state::UpdateLeaf, storage::NewProtocolStorage, @@ -32,6 +36,24 @@ use tokio::spawn; use tokio_util::task::AbortOnDropHandle; use versions::version; +/// Status of the legacy → 0.8 protocol cutover. +#[derive(Clone, Debug)] +pub enum CutoverStatus { + /// No upgrade certificate has been decided yet — the network is running + /// purely on the legacy protocol. + NotConfigured, + /// The cutover view is in the future. `views_remaining` is how many views + /// before the new protocol takes over. Operators should ensure Cliquenet + /// peer connectivity is established by the time this hits 0. + Approaching { + cur_view: ViewNumber, + cutover_view: ViewNumber, + views_remaining: u64, + }, + /// The new protocol is active. + Active { cutover_view: ViewNumber }, +} + fn consensus_event(output: &ConsensusOutput) -> Option> { match output { ConsensusOutput::LeafDecided { @@ -112,6 +134,15 @@ where let coordinator_task = AbortOnDropHandle::new(spawn(run_coordinator(coordinator, event_tx))); + // Forward `LegacyTimeoutVoteEmitted` events from the legacy task into + // the new-protocol coordinator's timeout collectors. This is how the + // first 0.8 leader gets a `TimeoutCertificate2` for the boundary + // view if 0.4 timed out before its QC formed. + spawn(forward_legacy_timeout_votes( + legacy_event_rx.clone(), + client_api.clone(), + )); + Self { legacy_handle, client_api, @@ -123,6 +154,11 @@ where } } + pub async fn harvest_legacy_pre_cutover_seed(&self) -> Option> { + let legacy = self.legacy_handle.read().await; + harvest_legacy_pre_cutover_seed(&legacy).await + } + pub fn legacy_consensus(&self) -> Arc>> { self.legacy_handle.clone() } @@ -141,12 +177,33 @@ where >= version(0, 8) } + /// Status of the legacy → 0.8 cutover relative to the current view. + /// Use for operator monitoring around the upgrade boundary. 
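+    ///
+    /// A hypothetical polling loop (sketch only — the one-second cadence
+    /// and the `handle` binding are illustrative, not part of this change):
+    ///
+    /// ```ignore
+    /// loop {
+    ///     match handle.cutover_status().await {
+    ///         CutoverStatus::NotConfigured => {}
+    ///         CutoverStatus::Approaching { views_remaining, .. } => {
+    ///             tracing::info!(views_remaining, "cutover approaching");
+    ///         }
+    ///         CutoverStatus::Active { cutover_view } => {
+    ///             tracing::info!(%cutover_view, "new protocol active");
+    ///             break;
+    ///         }
+    ///     }
+    ///     tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+    /// }
+    /// ```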
+ pub async fn cutover_status(&self) -> CutoverStatus { + let legacy = self.legacy_handle.read().await; + let cur_view = legacy.cur_view().await; + let lock = &legacy.hotshot.upgrade_lock; + let Some(cert) = lock.decided_upgrade_cert() else { + return CutoverStatus::NotConfigured; + }; + let cutover_view = cert.data.new_version_first_view; + if cur_view >= cutover_view { + CutoverStatus::Active { cutover_view } + } else { + CutoverStatus::Approaching { + cur_view, + cutover_view, + views_remaining: *cutover_view - *cur_view, + } + } + } + async fn new_protocol(&self) -> bool { if self.new_protocol_active.load(Ordering::Relaxed) { return true; } - let view = self.legacy_handle.read().await.cur_view().await; - let active = self.new_protocol_at(view).await; + let legacy = self.legacy_handle.read().await; + let active = try_perform_handover(&legacy, &self.client_api).await; if active { self.new_protocol_active.store(true, Ordering::Relaxed); } diff --git a/crates/espresso/node/src/lib.rs b/crates/espresso/node/src/lib.rs index ee5ddd90522..bc4eea953b4 100644 --- a/crates/espresso/node/src/lib.rs +++ b/crates/espresso/node/src/lib.rs @@ -1234,6 +1234,10 @@ pub mod testing { start_voting_time: 0, stop_proposing_time: 0, stop_voting_time: 0, + upgrade_propose_offset: None, + upgrade_decide_by_offset: None, + upgrade_begin_offset: None, + upgrade_finish_offset: None, epoch_height: 30, epoch_start_block: 1, stake_table_capacity: hotshot_types::light_client::DEFAULT_STAKE_TABLE_CAPACITY, diff --git a/crates/espresso/types/src/v0/config.rs b/crates/espresso/types/src/v0/config.rs index b309e599f3c..c8e191201ec 100644 --- a/crates/espresso/types/src/v0/config.rs +++ b/crates/espresso/types/src/v0/config.rs @@ -81,6 +81,14 @@ pub struct PublicHotShotConfig { stop_proposing_time: u64, start_voting_time: u64, stop_voting_time: u64, + #[serde(default)] + upgrade_propose_offset: Option, + #[serde(default)] + upgrade_decide_by_offset: Option, + #[serde(default)] + upgrade_begin_offset: Option, + #[serde(default)] + upgrade_finish_offset: Option, epoch_height: u64, epoch_start_block: u64, #[serde(default = "default_stake_table_capacity")] @@ -132,6 +140,10 @@ impl From> for PublicHotShotConfig { stop_proposing_time, start_voting_time, stop_voting_time, + upgrade_propose_offset, + upgrade_decide_by_offset, + upgrade_begin_offset, + upgrade_finish_offset, epoch_height, epoch_start_block, stake_table_capacity, @@ -161,6 +173,10 @@ impl From> for PublicHotShotConfig { stop_proposing_time, start_voting_time, stop_voting_time, + upgrade_propose_offset, + upgrade_decide_by_offset, + upgrade_begin_offset, + upgrade_finish_offset, epoch_height, epoch_start_block, stake_table_capacity, @@ -194,6 +210,10 @@ impl PublicHotShotConfig { stop_proposing_time: self.stop_proposing_time, start_voting_time: self.start_voting_time, stop_voting_time: self.stop_voting_time, + upgrade_propose_offset: self.upgrade_propose_offset, + upgrade_decide_by_offset: self.upgrade_decide_by_offset, + upgrade_begin_offset: self.upgrade_begin_offset, + upgrade_finish_offset: self.upgrade_finish_offset, epoch_height: self.epoch_height, epoch_start_block: self.epoch_start_block, stake_table_capacity: self.stake_table_capacity, diff --git a/crates/hotshot/hotshot/src/tasks/task_state.rs b/crates/hotshot/hotshot/src/tasks/task_state.rs index 516c9c41d62..f25894a3c07 100644 --- a/crates/hotshot/hotshot/src/tasks/task_state.rs +++ b/crates/hotshot/hotshot/src/tasks/task_state.rs @@ -86,6 +86,10 @@ impl> CreateTaskState stop_proposing_time: 
handle.hotshot.config.stop_proposing_time, start_voting_time: handle.hotshot.config.start_voting_time, stop_voting_time: handle.hotshot.config.stop_voting_time, + upgrade_propose_offset: handle.hotshot.config.upgrade_propose_offset, + upgrade_decide_by_offset: handle.hotshot.config.upgrade_decide_by_offset, + upgrade_begin_offset: handle.hotshot.config.upgrade_begin_offset, + upgrade_finish_offset: handle.hotshot.config.upgrade_finish_offset, epoch_start_block: handle.hotshot.config.epoch_start_block, upgrade_lock: handle.hotshot.upgrade_lock.clone(), epoch_height: handle.epoch_height, diff --git a/crates/hotshot/new-protocol/Cargo.toml b/crates/hotshot/new-protocol/Cargo.toml index cd56a34082c..966b33f1514 100644 --- a/crates/hotshot/new-protocol/Cargo.toml +++ b/crates/hotshot/new-protocol/Cargo.toml @@ -35,6 +35,8 @@ time = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } +url = { workspace = true } +vec1 = { workspace = true } versions = { workspace = true } [lints] diff --git a/crates/hotshot/new-protocol/src/client.rs b/crates/hotshot/new-protocol/src/client.rs index 7ca2107e2db..def2f6046d4 100644 --- a/crates/hotshot/new-protocol/src/client.rs +++ b/crates/hotshot/new-protocol/src/client.rs @@ -1,10 +1,12 @@ -use std::{num::NonZeroUsize, sync::Arc}; +use std::{collections::BTreeMap, num::NonZeroUsize, sync::Arc}; use async_trait::async_trait; use committable::Commitment; use hotshot_types::{ data::{EpochNumber, Leaf2, ViewNumber}, message::Proposal as SignedProposal, + simple_certificate::QuorumCertificate2, + simple_vote::TimeoutVote2, traits::{leaf_fetcher_network::LeafFetcherNetwork, node_implementation::NodeType}, utils::StateAndDelta, }; @@ -112,6 +114,66 @@ impl ClientApi { .await? } + /// Forward a `TimeoutVote2` produced by the legacy (pre-0.8) consensus + /// task into the new-protocol coordinator's timeout collectors. Used at + /// the legacy → new-protocol boundary: when a legacy view near the + /// cutover times out, the legacy task signs a `TimeoutVote2` (whose + /// commitment is version-tagged via the shared `UpgradeLock`) and + /// submits it here so the first 0.8 leader can collect a + /// `TimeoutCertificate2` for that pre-cutover view. + /// + /// `TimeoutVote2` is structurally identical between 0.4 and 0.8 + /// (`SimpleVote`) so the same vote feeds both + /// systems' aggregators without re-signing. + pub async fn submit_timeout_vote(&self, vote: TimeoutVote2) -> Result<(), QueryError> { + let (respond, rx) = oneshot::channel(); + self.call(ClientRequest::SubmitTimeoutVote { vote, respond }, rx) + .await + } + + /// Bridge legacy (pre-0.8) state into the running coordinator at the + /// legacy → new-protocol cutover. + /// + /// - `decided_anchor` is the highest leaf 0.4 had decided. + /// - `undecided` is the chain of undecided 0.4 leaves above the anchor + /// (oldest-first). + /// - `high_qc` is the QC of the topmost undecided leaf, if 0.4 voting + /// completed enough for that QC to form. Required for the first 0.8 + /// leader to find `certs[N-1]` when proposing at view N (= the + /// topmost leaf's view + 1). May be `None` if the chain stalled + /// before the topmost leaf got a QC; in that case the first 0.8 + /// leader will need view-change evidence. + /// - `validated_states` is the validated state of every seeded leaf + /// (anchor + undecided), keyed by view number. 
The new protocol + /// pipelines header creation and state validation against the + /// parent's stored state — without seeding these, the first + /// post-cutover leader cannot build a header (no parent state) and + /// peers cannot validate the first post-cutover proposal. + /// + /// Idempotent at the consensus level: `set_pre_cutover_anchor` no-ops if + /// the supplied view is not above the current `last_decided_view`, and + /// `seed_pre_cutover_leaves` reinserts views that are already in the set. + pub async fn seed_pre_cutover( + &self, + decided_anchor: Leaf2, + undecided: Vec>, + high_qc: Option>, + validated_states: BTreeMap>, + ) -> Result<(), QueryError> { + let (respond, rx) = oneshot::channel(); + self.call( + ClientRequest::SeedPreCutover { + decided_anchor, + undecided, + high_qc, + validated_states, + respond, + }, + rx, + ) + .await + } + async fn call( &self, request: ClientRequest, @@ -192,6 +254,19 @@ pub(crate) enum ClientRequest { recipient: T::SignatureKey, respond: oneshot::Sender>, }, + SeedPreCutover { + decided_anchor: Leaf2, + undecided: Vec>, + high_qc: Option>, + /// Validated state for each seeded leaf, keyed by view. Empty if + /// the caller has no states to seed (e.g. legacy-only test paths). + validated_states: BTreeMap>, + respond: oneshot::Sender<()>, + }, + SubmitTimeoutVote { + vote: TimeoutVote2, + respond: oneshot::Sender<()>, + }, } #[derive(Debug, thiserror::Error)] diff --git a/crates/hotshot/new-protocol/src/consensus.rs b/crates/hotshot/new-protocol/src/consensus.rs index 795698008c8..0ae58f58f5a 100644 --- a/crates/hotshot/new-protocol/src/consensus.rs +++ b/crates/hotshot/new-protocol/src/consensus.rs @@ -10,7 +10,7 @@ use hotshot_contract_adapter::light_client::derive_signed_state_digest; use hotshot_types::{ data::{ BlockNumber, EpochNumber, Leaf2, VidCommitment, VidCommitment2, VidDisperse2, - VidDisperseShare2, ViewNumber, + VidDisperseShare2, ViewChangeEvidence2, ViewNumber, }, drb::DrbResult, epoch_membership::EpochMembershipCoordinator, @@ -31,7 +31,7 @@ use hotshot_types::{ LCV2StateSignatureKey, LCV3StateSignatureKey, SignatureKey, StateSignatureKey, }, }, - utils::{is_epoch_root, is_epoch_transition, is_last_block}, + utils::{epoch_from_block_number, is_epoch_root, is_epoch_transition, is_last_block}, vote::{self, Certificate, HasViewNumber}, }; use tracing::{debug, instrument, warn}; @@ -136,6 +136,12 @@ pub struct Consensus { voted_1_views: BTreeSet, voted_2_views: BTreeSet, + /// Views bridged in from the legacy (pre-0.8) protocol via + /// `seed_pre_cutover_leaves`. Skipped by `maybe_vote_2_and_update_lock` + /// (V1 AvidM dispersal, no V2 reconstruction possible). Cert1 is the + /// inherited 0.4 QC; Cert2 forms post-cutover via Vote2. + pre_cutover_views: BTreeSet, + /// Certificates whose epoch membership was not yet available when they /// arrived. They are retried when new epoch data becomes available. pending_certs1: BTreeMap>, @@ -220,6 +226,7 @@ impl Consensus { stake_table_coordinator: membership_coordinator, voted_1_views: BTreeSet::new(), voted_2_views: BTreeSet::new(), + pre_cutover_views: BTreeSet::new(), pending_certs1: BTreeMap::new(), pending_certs2: BTreeMap::new(), private_key, @@ -250,6 +257,127 @@ impl Consensus { .insert(ViewNumber::genesis(), genesis_proposal); } + /// Bridge a chain of legacy (pre-0.8) UNDECIDED leaves into this Consensus + /// instance so the new protocol can decide them via Cert2. + /// + /// `leaves` must be ordered oldest-first. 
`Leaf2` and `QuorumCertificate2` + /// are shared between 0.4 and 0.8, so each leaf's `justify_qc` is exactly + /// a new-protocol Cert1 for the *parent* view. We register that Cert1, + /// then synthesize and store a new-protocol `Proposal` from each leaf so + /// the decide rule has the (Cert1, proposal) pair it needs. + /// + /// The youngest seeded leaf has no Cert1 yet (the QC for it would have + /// been formed by the next leaf, which doesn't exist). Its Cert1 will + /// land automatically when the first post-cutover proposal arrives, + /// carrying it as `justify_qc` — see [`Self::register_proposal_justify_qc`]. + /// + /// Seeded views are added to `pre_cutover_views`, which causes + /// `maybe_vote_2_and_update_lock` to skip them (V1 AvidM dispersal). + pub fn seed_pre_cutover_leaves(&mut self, leaves: Vec>) { + let mut max_seeded_view = self.current_view; + let mut max_seeded_epoch = self.current_epoch.unwrap_or(EpochNumber::genesis()); + for leaf in leaves { + let view = leaf.view_number(); + let justify_qc = leaf.justify_qc().clone(); + // Register Cert1 for the parent view (the QC of the parent leaf, + // embedded as this leaf's justify_qc). + self.register_proposal_justify_qc(&justify_qc); + + // Compute the epoch from the block number; pre-cutover leaves + // don't carry an explicit epoch field but the new-protocol + // Proposal does. + let block_number = leaf.block_header().block_number(); + let epoch = EpochNumber::new(epoch_from_block_number(block_number, *self.epoch_height)); + + // Synthesize the new-protocol Proposal from the leaf. The decide + // rule only consults `block_header`, `view_number`, `epoch`, + // and `justify_qc`; `state_cert` is `None` for non-epoch-root + // pre-cutover leaves (epoch roots in 0.4 storage are decided and + // therefore not in this seed list). + let view_change_evidence = leaf.view_change_evidence.clone().and_then(|e| match e { + ViewChangeEvidence2::Timeout(tc) => Some(tc), + ViewChangeEvidence2::ViewSync(_) => None, + }); + let proposal = Proposal { + block_header: leaf.block_header().clone(), + view_number: view, + epoch, + justify_qc, + next_epoch_justify_qc: None, + upgrade_certificate: leaf.upgrade_certificate().clone(), + view_change_evidence, + next_drb_result: leaf.next_drb_result, + state_cert: None, + }; + + self.leaves.insert(view, leaf); + self.proposals.insert(view, proposal); + self.pre_cutover_views.insert(view); + + // Mark this view as already proposed/voted-on so the network + // doesn't try to re-propose or re-vote at it. The seeded chain + // is authoritative for these views. + self.proposed_views.insert(view); + self.voted_1_views.insert(view); + self.voted_2_views.insert(view); + + if view > max_seeded_view { + max_seeded_view = view; + max_seeded_epoch = epoch; + } + } + + // Advance current_view past the highest seeded view so the local + // node's leader logic targets the first POST-cutover view rather + // than re-running the seeded ones. + if max_seeded_view > self.current_view { + self.current_view = max_seeded_view; + self.current_epoch = Some(max_seeded_epoch); + } + } + + /// Register a proposal's `justify_qc` as Cert1 for the parent view, if + /// not already known. This is how Cert1 for the topmost pre-cutover leaf + /// (and, in general, any view whose Cert1 we don't independently form) + /// enters this Consensus instance after the legacy/new-protocol cutover. + /// + /// Idempotent: existing entries are not overwritten. 
+ /// + /// Also bumps `locked_cert` to this QC if its view is higher than the + /// current locked view. The seeded pre-cutover chain advances + /// `locked_cert` here because `maybe_vote_2_and_update_lock` (which + /// would normally bump it) skips pre-cutover views. + pub fn register_proposal_justify_qc(&mut self, justify_qc: &Certificate1) { + let parent_view = justify_qc.view_number(); + self.certs + .entry(parent_view) + .or_insert_with(|| justify_qc.clone()); + if self + .locked_cert + .as_ref() + .is_none_or(|locked| locked.view_number() < parent_view) + { + self.locked_cert = Some(justify_qc.clone()); + } + } + + /// Advance the decided-anchor (`last_decided_leaf`/`last_decided_view`) + /// to the supplied leaf. Use this at the legacy → new-protocol cutover + /// to position the anchor at the highest leaf 0.4 had decided, so the + /// new protocol's decide-walk starts from the correct boundary instead + /// of from the boot-time genesis anchor. + /// + /// No-op if `leaf.view_number()` is not strictly greater than the + /// current `last_decided_view`. + pub fn set_pre_cutover_anchor(&mut self, leaf: Leaf2) { + let view = leaf.view_number(); + if view <= self.last_decided_view { + return; + } + self.last_decided_view = view; + self.last_decided_leaf = leaf; + } + /// Return the proposal stored at the given view, if any. pub fn proposal_at(&self, view: ViewNumber) -> Option<&Proposal> { self.proposals.get(&view) @@ -526,6 +654,12 @@ impl Consensus { self.leaves.insert(view, proposal.clone().into()); self.vid_shares.insert(view, vid_share); + // Register the proposal's `justify_qc` as Cert1 for the parent + // view. This is how Cert1 enters the system for views whose votes + // we never collected ourselves -- in particular, the topmost + // pre-cutover leaf seeded via `seed_pre_cutover_leaves`. + self.register_proposal_justify_qc(&proposal.justify_qc); + // Request the DRB if we don't have it yet. A mismatching DRB is // a hard failure (invalid leader), but a missing DRB is // recoverable — the proposal is stored and voting will proceed @@ -1161,35 +1295,43 @@ impl Consensus { let parent_view = proposal.justify_qc.view_number(); // We don't need the genesis block or the last block of the epoch to be reconstructed or verified - // or the genesis qc to be verified + // or the genesis qc to be verified. + // + // Pre-cutover parents are also exempt: their data was certified + // available under 0.4's DA mechanism and their VID shares are V1 + // (AvidM) which cannot be reconstructed under V2. We still verify + // the justify_qc/proposal commitment binding below. + let parent_is_pre_cutover = self.pre_cutover_views.contains(&parent_view); if parent_view != ViewNumber::genesis() && !is_last_block( proposal.block_header.block_number().saturating_sub(1), *self.epoch_height, ) { - // Verify we have the block for the QC on this commitment - let Some(block_commitment) = self.blocks_reconstructed.get(&parent_view) else { - debug!(%parent_view, "block commitment not available"); - return; - }; let Some(prev_proposal) = self.proposals.get(&parent_view) else { debug!(%parent_view, "proposal not available"); return; }; - let VidCommitment::V2(prev_block_commitment) = - prev_proposal.block_header.payload_commitment() - else { - warn! { - %view, - %parent_view, - "prev. 
proposal payload commitment is not a V2 VID commitment" + if !parent_is_pre_cutover { + // Verify we have the block for the QC on this commitment + let Some(block_commitment) = self.blocks_reconstructed.get(&parent_view) else { + debug!(%parent_view, "block commitment not available"); + return; + }; + let VidCommitment::V2(prev_block_commitment) = + prev_proposal.block_header.payload_commitment() + else { + warn! { + %view, + %parent_view, + "prev. proposal payload commitment is not a V2 VID commitment" + } + return; + }; + if block_commitment != &prev_block_commitment { + debug!(%parent_view, "parent block commitment does not match prev. block commitment"); + return; } - return; - }; - if block_commitment != &prev_block_commitment { - debug!(%parent_view, "parent block commitment does not match prev. block commitment"); - return; } if proposal.justify_qc.data().leaf_commit != proposal_commitment(prev_proposal) { @@ -1252,13 +1394,20 @@ impl Consensus { view: ViewNumber, outbox: &mut Outbox>, ) { + // Pre-cutover leaves were dispersed under AvidM (V1) and certified + // available by 0.4's DA; the new protocol does not re-vote them. + // `seed_pre_cutover_leaves` marks them as voted, but make the + // exclusion explicit so this path stays single-mode. + if self.pre_cutover_views.contains(&view) { + return; + } if self.voted_2_views.contains(&view) { return; } - let Some(reconstructed_block_commitment) = self.blocks_reconstructed.get(&view) else { + if !self.blocks_reconstructed.contains_key(&view) { debug!("reconstructed block commitment not available"); return; - }; + } let Some(cert1) = self.certs.get(&view) else { debug!("cert1 not available"); return; @@ -1276,7 +1425,8 @@ impl Consensus { warn!(%view, "cert1 commitment does not match proposal commitment"); return; } - // The proposal block commitment must match the reconstructed block commitment + let reconstructed_block_commitment = + self.blocks_reconstructed.get(&view).expect("checked above"); let VidCommitment::V2(proposal_block_commitment) = proposal.block_header.payload_commitment() else { diff --git a/crates/hotshot/new-protocol/src/coordinator.rs b/crates/hotshot/new-protocol/src/coordinator.rs index 91abbe35a2d..c7de60d2eac 100644 --- a/crates/hotshot/new-protocol/src/coordinator.rs +++ b/crates/hotshot/new-protocol/src/coordinator.rs @@ -183,32 +183,40 @@ where .build() } - /// Bootstrap the coordinator so the view-1 leader can propose. + /// Bootstrap the coordinator so the leader of `current_view + 1` can + /// propose. /// - /// Emits an initial `ViewChanged(1)` and, if this node is the view-1 - /// leader, a `RequestBlockAndHeader` for view 1. Call this after - /// `seed_genesis` on the inner `Consensus` instance. + /// Reads `current_view` and `current_epoch` from the inner `Consensus` + /// instance — these default to genesis after `seed_genesis`, but a + /// pre-cutover seed advances them to the highest seeded view. Emits + /// `ViewChanged(next_view)` and, if this node is the next-view leader, + /// a `RequestBlockAndHeader` whose parent is the proposal at + /// `current_view`. 
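+    ///
+    /// The two boot paths, side by side (illustrative sketch; the
+    /// `coordinator` binding and the view numbers are assumptions):
+    ///
+    /// ```ignore
+    /// // Fresh start: `seed_genesis` left current_view at 0,
+    /// // so this emits ViewChanged(1).
+    /// coordinator.start().await;
+    ///
+    /// // Cutover start: a pre-cutover seed advanced current_view to,
+    /// // say, 7, so this emits ViewChanged(8) and the view-8 leader
+    /// // builds on the seeded view-7 proposal.
+    /// coordinator.start().await;
+    /// ```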
pub async fn start(&mut self) { - let view = ViewNumber::new(1); - let epoch = EpochNumber::genesis(); + let cur_view = self.consensus.current_view(); + let next_view = cur_view + 1; + let epoch = self + .consensus + .current_epoch() + .unwrap_or(EpochNumber::genesis()); self.outbox - .push_back(ConsensusOutput::ViewChanged(view, epoch)); + .push_back(ConsensusOutput::ViewChanged(next_view, epoch)); - if let Some(leader) = self.leader(view, epoch).await + if let Some(leader) = self.leader(next_view, epoch).await && leader == self.public_key { - let genesis_proposal = self + let parent_proposal = self .consensus - .proposal_at(ViewNumber::genesis()) - .expect("genesis proposal must be seeded before start()") + .proposal_at(cur_view) + .expect("parent proposal must be seeded before start()") .clone(); self.outbox .push_back(ConsensusOutput::RequestBlockAndHeader( BlockAndHeaderRequest { - view, + view: next_view, epoch, - parent_proposal: genesis_proposal, + parent_proposal, }, )); } @@ -246,7 +254,7 @@ where } } Some(request) = self.client.next_request() => { - if let Err(err) = self.on_client_request(request) { + if let Err(err) = self.on_client_request(request).await { error!(%err, "error while handling client request"); } } @@ -753,7 +761,10 @@ where membership.leader(view).await.ok() } - fn on_client_request(&mut self, request: ClientRequest) -> Result<(), CoordinatorError> { + async fn on_client_request( + &mut self, + request: ClientRequest, + ) -> Result<(), CoordinatorError> { match request { ClientRequest::CurrentView(tx) => { let _ = tx.send(self.consensus.current_view()); @@ -844,6 +855,91 @@ where }); let _ = respond.send(result); }, + ClientRequest::SeedPreCutover { + decided_anchor, + undecided, + high_qc, + validated_states, + respond, + } => { + tracing::info!( + undecided = undecided.len(), + anchor_view = *decided_anchor.view_number(), + high_qc_view = high_qc.as_ref().map(|qc| *qc.view_number()), + states = validated_states.len(), + "coordinator: applying legacy → new-protocol seed", + ); + // Seed StateManager BEFORE handing the leaves to consensus. + // Consensus needs the parent view's state extend. + let anchor_view = decided_anchor.view_number(); + if let Some(state) = validated_states.get(&anchor_view).cloned() { + self.state_manager + .seed_state(anchor_view, state, decided_anchor.clone()); + } + for leaf in &undecided { + let view = leaf.view_number(); + if let Some(state) = validated_states.get(&view).cloned() { + self.state_manager.seed_state(view, state, leaf.clone()); + } + } + self.consensus.set_pre_cutover_anchor(decided_anchor); + self.consensus.seed_pre_cutover_leaves(undecided); + if let Some(qc) = high_qc { + // Register the topmost legacy QC so `maybe_propose` for the + // first 0.8 view can find `certs[N-1]`. + self.consensus.register_proposal_justify_qc(&qc); + } + // Same boot kick as genesis: emit ViewChanged + (if leader) + // RequestBlockAndHeader for max_seeded_view + 1 so the + // post-cutover view proposes. Drain in place — the run loop + // only drains after `next_consensus_input` returns. + // + // Skipped if no proposal was seeded at current_view (e.g. + // the legacy chain timed out and the harvest produced an + // empty undecided chain). In that case the TC2 forwarded + // through the timeout-vote bridge will advance the + // coordinator via `handle_timeout_certificate` instead. 
+                let cur_view = self.consensus.current_view();
+                if self.consensus.proposal_at(cur_view).is_some() {
+                    self.start().await;
+                    let mut outputs = Vec::new();
+                    while let Some(output) = self.outbox.pop_front() {
+                        outputs.push(output);
+                    }
+                    for output in outputs {
+                        if let Err(err) = self.process_consensus_output(output).await {
+                            tracing::warn!(
+                                %err,
+                                "error processing post-seed bootstrap output"
+                            );
+                        }
+                    }
+                }
+                let _ = respond.send(());
+            },
+            ClientRequest::SubmitTimeoutVote { vote, respond } => {
+                // Same dual-feed as the wire-message path
+                // (`ConsensusMessage::TimeoutVote`): aggregate into both the
+                // success-threshold and one-honest collectors.
+                self.timeout_collector.accumulate_vote(vote.clone()).await;
+                self.timeout_one_honest_collector
+                    .accumulate_vote(vote.clone())
+                    .await;
+                // Rebroadcast on cliquenet so peer coordinators can aggregate
+                // it too. The bridge only fires for the local legacy node, so
+                // without this each coordinator sees just one vote and TC2
+                // never forms at the cutover boundary.
+                let message = Message {
+                    sender: self.public_key.clone(),
+                    message_type: MessageType::Consensus(ConsensusMessage::TimeoutVote(
+                        message::TimeoutVoteMessage { vote, lock: None },
+                    )),
+                };
+                if let Err(err) = self.network.broadcast(message.view_number(), &message) {
+                    tracing::warn!(%err, "failed to rebroadcast bridged timeout vote");
+                }
+                let _ = respond.send(());
+            },
         }
         Ok(())
diff --git a/crates/hotshot/new-protocol/src/harvest.rs b/crates/hotshot/new-protocol/src/harvest.rs
new file mode 100644
index 00000000000..74ede8e284b
--- /dev/null
+++ b/crates/hotshot/new-protocol/src/harvest.rs
@@ -0,0 +1,171 @@
+//! Legacy → new-protocol handover: harvest legacy state and dispatch
+//! the seed via `ClientApi`. Shared by `ConsensusHandle::new_protocol`
+//! (production) and `tests::legacy_handover` (integration).
+
+use std::{collections::BTreeMap, sync::Arc};
+
+use async_broadcast::InactiveReceiver;
+use committable::Committable;
+use futures::StreamExt;
+use hotshot::{traits::NodeImplementation, types::SystemContextHandle};
+use hotshot_types::{
+    data::{Leaf2, ViewNumber},
+    event::{Event, EventType},
+    simple_certificate::QuorumCertificate2,
+    traits::node_implementation::NodeType,
+};
+use versions::CLIQUENET_VERSION;
+
+use crate::client::ClientApi;
+
+/// Inputs to the new protocol's `seed_pre_cutover` request.
+pub struct LegacyPreCutoverSeed<T: NodeType> {
+    pub decided_anchor: Leaf2<T>,
+    /// Oldest-first chain above the anchor, walked from `high_qc` via
+    /// `justify_qc` back to the anchor.
+    pub undecided: Vec<Leaf2<T>>,
+    pub high_qc: QuorumCertificate2<T>,
+    /// Per-view validated state for the anchor + each undecided leaf.
+    /// The first post-cutover header request needs the parent view's
+    /// state to build against.
+    pub validated_states: BTreeMap<ViewNumber, Arc<T::ValidatedState>>,
+}
+
+/// Walk the legacy `Consensus` to produce a [`LegacyPreCutoverSeed`].
+/// `None` on a broken walk (fork or missing leaf). Validated states
+/// are best-effort; missing entries are tolerated by the handler.
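+///
+/// Dispatch sketch — what [`try_perform_handover`] below does with the
+/// result (the `legacy` and `client_api` bindings are illustrative):
+///
+/// ```ignore
+/// if let Some(seed) = harvest_legacy_pre_cutover_seed(&legacy).await {
+///     client_api
+///         .seed_pre_cutover(
+///             seed.decided_anchor,
+///             seed.undecided,
+///             Some(seed.high_qc),
+///             seed.validated_states,
+///         )
+///         .await?;
+/// }
+/// ```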
+pub async fn harvest_legacy_pre_cutover_seed<T, I>(
+    handle: &SystemContextHandle<T, I>,
+) -> Option<LegacyPreCutoverSeed<T>>
+where
+    T: NodeType,
+    I: NodeImplementation<T>,
+{
+    let consensus_arc = handle.hotshot.consensus();
+    let consensus = consensus_arc.read().await;
+    let decided_anchor = consensus.decided_leaf();
+    let decided_view = decided_anchor.view_number();
+    let decided_commit = decided_anchor.commit();
+
+    let high_qc = consensus.high_qc().clone();
+    let saved = consensus.saved_leaves();
+
+    let mut chain: Vec<Leaf2<T>> = Vec::new();
+    let mut next_commit = high_qc.data.leaf_commit;
+    loop {
+        if next_commit == decided_commit {
+            break;
+        }
+        let Some(leaf) = saved.get(&next_commit) else {
+            tracing::warn!(
+                %next_commit,
+                "harvest_legacy_pre_cutover_seed: missing leaf in saved_leaves; aborting",
+            );
+            return None;
+        };
+        if leaf.view_number() <= decided_view {
+            tracing::warn!(
+                leaf_view = *leaf.view_number(),
+                %decided_view,
+                "harvest_legacy_pre_cutover_seed: walked below decided view without matching commit; aborting",
+            );
+            return None;
+        }
+        chain.push(leaf.clone());
+        next_commit = leaf.justify_qc().data.leaf_commit;
+    }
+
+    chain.reverse();
+
+    let mut validated_states = BTreeMap::new();
+    if let Some(state) = consensus.state(decided_view) {
+        validated_states.insert(decided_view, state.clone());
+    } else {
+        tracing::warn!(
+            %decided_view,
+            "harvest_legacy_pre_cutover_seed: no validated state for decided anchor",
+        );
+    }
+    for leaf in &chain {
+        let view = leaf.view_number();
+        if let Some(state) = consensus.state(view) {
+            validated_states.insert(view, state.clone());
+        } else {
+            tracing::warn!(
+                %view,
+                "harvest_legacy_pre_cutover_seed: no validated state for undecided leaf",
+            );
+        }
+    }
+
+    Some(LegacyPreCutoverSeed {
+        decided_anchor,
+        undecided: chain,
+        high_qc,
+        validated_states,
+    })
+}
+
+/// Returns `true` once the version at `legacy.cur_view` is >=
+/// `CLIQUENET_VERSION`, dispatching a best-effort `seed_pre_cutover`
+/// through `client_api` on the way. Logs but does not surface
+/// harvest/seed failures — the boundary signal stands regardless so
+/// callers don't flip back to legacy.
+///
+/// Re-seeding is idempotent at the consensus layer; callers should still
+/// gate repeats with a once-flag.
+pub async fn try_perform_handover<T, I>(
+    legacy: &SystemContextHandle<T, I>,
+    client_api: &ClientApi<T>,
+) -> bool
+where
+    T: NodeType,
+    I: NodeImplementation<T>,
+{
+    let cur_view = legacy.cur_view().await;
+    let crossed = legacy.hotshot.upgrade_lock.version_infallible(cur_view) >= CLIQUENET_VERSION;
+    if !crossed {
+        return false;
+    }
+
+    if let Some(seed) = harvest_legacy_pre_cutover_seed(legacy).await {
+        if let Err(err) = client_api
+            .seed_pre_cutover(
+                seed.decided_anchor,
+                seed.undecided,
+                Some(seed.high_qc),
+                seed.validated_states,
+            )
+            .await
+        {
+            tracing::warn!(%err, "seed_pre_cutover client request failed");
+        }
+    } else {
+        tracing::warn!(
+            "harvest_legacy_pre_cutover_seed returned None; coordinator will not be seeded",
+        );
+    }
+
+    true
+}
+
+/// Forward `LegacyTimeoutVoteEmitted` events from the legacy task into the
+/// new-protocol coordinator's timeout collectors. Lets the first 0.8
+/// leader form a `TimeoutCertificate2` for the boundary view if 0.4
+/// timed out before its QC formed.
+///
+/// Run as a long-lived task. Spawned by `ConsensusHandle::new` in
+/// production, and by the integration test for the same production-parity
+/// reason that `try_perform_handover` is shared.
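+///
+/// Spawning sketch (mirrors what `ConsensusHandle::new` does; the
+/// bindings are illustrative):
+///
+/// ```ignore
+/// tokio::spawn(forward_legacy_timeout_votes(
+///     legacy_event_rx.clone(),
+///     client_api.clone(),
+/// ));
+/// ```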
+pub async fn forward_legacy_timeout_votes( + legacy_event_rx: InactiveReceiver>, + client_api: ClientApi, +) { + let mut rx = legacy_event_rx.activate_cloned(); + while let Some(event) = rx.next().await { + if let EventType::LegacyTimeoutVoteEmitted { vote } = event.event + && let Err(err) = client_api.submit_timeout_vote(vote).await + { + tracing::warn!(%err, "failed to forward legacy TimeoutVote2 to new-protocol coordinator"); + } + } +} diff --git a/crates/hotshot/new-protocol/src/lib.rs b/crates/hotshot/new-protocol/src/lib.rs index 21b9796c066..75a11a78935 100644 --- a/crates/hotshot/new-protocol/src/lib.rs +++ b/crates/hotshot/new-protocol/src/lib.rs @@ -4,6 +4,7 @@ pub mod consensus; pub mod coordinator; pub mod epoch; pub mod epoch_root_vote_collector; +pub mod harvest; pub mod helpers; pub mod logging; pub mod message; diff --git a/crates/hotshot/new-protocol/src/tests.rs b/crates/hotshot/new-protocol/src/tests.rs index f8d8f95620c..dc003efd9fd 100644 --- a/crates/hotshot/new-protocol/src/tests.rs +++ b/crates/hotshot/new-protocol/src/tests.rs @@ -3,9 +3,11 @@ pub(crate) mod common; mod block; mod cliquenet; mod consensus; +mod cutover; mod epoch_change; mod failures; mod integration; +mod legacy_handover; mod restarts; mod state; mod vid; diff --git a/crates/hotshot/new-protocol/src/tests/common/coordinator_builder.rs b/crates/hotshot/new-protocol/src/tests/common/coordinator_builder.rs index 3bb6f5d93c0..c1f7a963a2c 100644 --- a/crates/hotshot/new-protocol/src/tests/common/coordinator_builder.rs +++ b/crates/hotshot/new-protocol/src/tests/common/coordinator_builder.rs @@ -39,6 +39,14 @@ use crate::{ /// certificate and proposal so that the view-1 leader can propose without any /// external injection. The initial `ViewChanged` and (for the leader) /// `RequestBlockAndHeader` outputs are already queued in the outbox. +/// +/// If `pre_cutover_seed` is provided, it is applied **synchronously before** +/// `coord.start()` runs. This prevents the startup race where `start()` +/// emits `ViewChanged(1)` and the view-1 leader proposes before an +/// async-dispatched seed can land. With a seed in place, `start()` reads the +/// (now advanced) `current_view` and emits `ViewChanged(max_seeded_view + 1)` +/// instead. +#[allow(clippy::too_many_arguments)] pub async fn build_test_coordinator>( node_index: u64, network: N, @@ -47,6 +55,7 @@ pub async fn build_test_coordinator>( client: CoordinatorClient, epoch_height: u64, view_timeout: Duration, + pre_cutover_seed: Option, ) -> Coordinator> { let (public_key, private_key) = BLSPubKey::generated_from_seed_indexed([0; 32], node_index); let state_key_pair = StateKeyPair::generate_from_seed_indexed([0u8; 32], node_index); @@ -96,11 +105,41 @@ pub async fn build_test_coordinator>( genesis_leaf.clone(), ); + // If a pre-cutover seed is provided, seed the StateManager for each + // pre-cutover leaf. The new protocol's proposal validator pipelines + // state validation against the parent's stored state — without this, + // the leader of `max_seeded_view + 1` cannot validate its own proposal + // (no parent state on file). For tests we use the default + // `TestValidatedState`; in production the espresso bridge would carry + // legacy state forward. 
+ if let Some(seed) = pre_cutover_seed.as_ref() { + let default_state = Arc::new(TestValidatedState::default()); + state_manager.seed_state( + seed.decided_anchor.view_number(), + default_state.clone(), + seed.decided_anchor.clone(), + ); + for leaf in &seed.undecided { + state_manager.seed_state(leaf.view_number(), default_state.clone(), leaf.clone()); + } + } + // Build a genesis cert1 and proposal so consensus can self-start. let genesis_cert1 = build_genesis_cert1(&genesis_leaf); let genesis_proposal = build_genesis_proposal(&genesis_leaf, &genesis_cert1); consensus.seed_genesis(genesis_cert1.clone(), genesis_proposal.clone()); + // Apply the legacy → new-protocol seed (if provided) BEFORE we hand + // consensus to the coordinator builder. After this, `current_view` is + // advanced past the seeded views and `coord.start()` will emit + // `ViewChanged(max_seeded_view + 1)` instead of the genesis-default + // `ViewChanged(1)`. + if let Some(seed) = pre_cutover_seed { + consensus.set_pre_cutover_anchor(seed.decided_anchor); + consensus.seed_pre_cutover_leaves(seed.undecided); + consensus.register_proposal_justify_qc(&seed.high_qc); + } + // Seed the genesis proposal into the backing TestStorage so that // peers can serve the genesis block to late-joiners during // `EpochMembershipCoordinator::catchup` (epoch 0 root block == 0). diff --git a/crates/hotshot/new-protocol/src/tests/common/runner.rs b/crates/hotshot/new-protocol/src/tests/common/runner.rs index dedb4d66311..7f9fa75f466 100644 --- a/crates/hotshot/new-protocol/src/tests/common/runner.rs +++ b/crates/hotshot/new-protocol/src/tests/common/runner.rs @@ -98,10 +98,22 @@ pub struct TestRunner { #[builder(default)] node_changes: Vec<(u64, Vec)>, + /// Optional legacy → new-protocol seed handed to each coordinator + /// before its run loop starts. + pre_cutover_seed: Option, + #[builder(skip = test_upgrade_lock())] upgrade_lock: UpgradeLock, } +/// Seed handed to every coordinator at startup to bridge legacy state. +#[derive(Clone)] +pub struct PreCutoverSeed { + pub decided_anchor: hotshot_types::data::Leaf2, + pub undecided: Vec>, + pub high_qc: crate::message::Certificate1, +} + #[derive(Debug)] pub enum TestError { Timeout, @@ -273,6 +285,7 @@ impl TestRunner { client, self.epoch_height, self.view_timeout, + self.pre_cutover_seed.clone(), ) .await; @@ -283,6 +296,25 @@ impl TestRunner { } else { let (cancel_tx, cancel_rx) = oneshot::channel(); cancels.insert(i, cancel_tx); + // Pre-populate commits with seeded leaves so the verifier + // sees them as decided (they are inherited from the legacy + // protocol; the new protocol won't fire LeafDecided for + // them). Also stamp views 1..anchor with the anchor's + // commit so the verifier accepts them as legacy-decided — + // it only checks node-cross consistency on these slots, + // not the actual chain shape. 
+ let mut initial_commits: BTreeMap = BTreeMap::new(); + if let Some(seed) = &self.pre_cutover_seed { + let anchor_view = seed.decided_anchor.view_number(); + let anchor_commit: [u8; 32] = seed.decided_anchor.commit().into(); + for v in 1..*anchor_view { + initial_commits.insert(ViewNumber::new(v), anchor_commit); + } + initial_commits.insert(anchor_view, anchor_commit); + for leaf in &seed.undecided { + initial_commits.insert(leaf.view_number(), leaf.commit().into()); + } + } Some(tokio::spawn(run_node( coord, tx, @@ -290,6 +322,7 @@ impl TestRunner { generation, external_events_tx, cancel_rx, + initial_commits, ))) }); } @@ -311,6 +344,27 @@ impl TestRunner { let mut node_timeouts: Vec> = vec![BTreeSet::new(); self.num_nodes]; let mut max_decided_view: u64 = 0; + // Pre-populate commits for seeded leaves: those views are + // "previously decided" (in the legacy protocol) and the new-protocol + // nodes inherit them via the seed rather than re-deriving them, so + // they will never appear in `LeafDecided` outputs. The verifier + // expects every view in `1..=target_decisions` to be either decided + // or expected-to-fail; without this pre-population the seeded views + // would falsely fail the `NotEnoughDecided` check. + if let Some(seed) = &self.pre_cutover_seed { + let anchor_view = seed.decided_anchor.view_number(); + let anchor_commit: [u8; 32] = seed.decided_anchor.commit().into(); + for commits in &mut node_commits { + for v in 1..*anchor_view { + commits.insert(ViewNumber::new(v), anchor_commit); + } + commits.insert(anchor_view, anchor_commit); + for leaf in &seed.undecided { + commits.insert(leaf.view_number(), leaf.commit().into()); + } + } + } + let deadline = Instant::now() + self.max_runtime; while node_commits .iter() @@ -369,6 +423,7 @@ impl TestRunner { client, self.epoch_height, self.view_timeout, + self.pre_cutover_seed.clone(), ) .await; // Bump the generation so stale events queued @@ -378,6 +433,9 @@ impl TestRunner { let generation = generations[change.idx]; let (cancel_tx, cancel_rx) = oneshot::channel(); cancels.insert(change.idx, cancel_tx); + // Restarted nodes start with a fresh commits + // map (mirroring the wipe at line ~404 below). 
+ let initial_commits = BTreeMap::new(); node_handles[change.idx] = Some(tokio::spawn(run_node( coord, tx, @@ -385,6 +443,7 @@ impl TestRunner { generation, external_events_tx, cancel_rx, + initial_commits, ))); currently_down.remove(&change.idx); node_commits[change.idx] = BTreeMap::new(); @@ -558,8 +617,9 @@ async fn run_node>( generation: u64, external_events_tx: Sender>, mut cancel: oneshot::Receiver>, + initial_commits: BTreeMap, ) { - let mut commits: BTreeMap = BTreeMap::new(); + let mut commits: BTreeMap = initial_commits; let mut last_view = ViewNumber::genesis(); let send = |event: NodeEvent| { let _ = output_tx.send(TaggedEvent { diff --git a/crates/hotshot/new-protocol/src/tests/common/utils.rs b/crates/hotshot/new-protocol/src/tests/common/utils.rs index 22a6ac5f9b0..12229f5a176 100644 --- a/crates/hotshot/new-protocol/src/tests/common/utils.rs +++ b/crates/hotshot/new-protocol/src/tests/common/utils.rs @@ -30,9 +30,10 @@ use hotshot_types::{ epoch_membership::EpochMembershipCoordinator, light_client::{StakeTableState, StateKeyPair}, message::Proposal as SignedProposal, - simple_certificate::TimeoutCertificate2, + simple_certificate::{TimeoutCertificate2, UpgradeCertificate}, simple_vote::{ - LightClientStateUpdateVote2, QuorumVote2, TimeoutData2, TimeoutVote2, Vote2Data, + LightClientStateUpdateVote2, QuorumVote2, TimeoutData2, TimeoutVote2, UpgradeProposalData, + UpgradeVote, Vote2Data, }, traits::{ EncodeBytes, @@ -259,6 +260,25 @@ impl TestData { num_views: usize, epoch_height: u64, num_nodes: usize, + ) -> Self { + Self::new_with_upgrade(num_views, epoch_height, num_nodes, None).await + } + + /// Build a chain of `num_views` legacy-style views and, if + /// `upgrade_at_view` is set, attach a real, quorum-signed + /// `UpgradeCertificate` to that view's leaf. + /// + /// The returned upgrade certificate is properly signed by every + /// validator in the test stake table — it verifies under the same + /// `EpochMembership` that signs the `cert1` chain, exactly the way the + /// upgrade task forms one in production legacy hotshot. Subsequent + /// views in the chain re-sign their `cert1` because the upgraded + /// leaf's commit changes when the certificate is attached. + pub async fn new_with_upgrade( + num_views: usize, + epoch_height: u64, + num_nodes: usize, + upgrade_at_view: Option<(ViewNumber, UpgradeProposalData)>, ) -> Self { crate::logging::init_test_logging(); let (public_key, _) = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0); @@ -266,11 +286,46 @@ impl TestData { mock_membership_with_num_nodes(num_nodes, epoch_height, public_key).await; let keys = key_map_with_num_nodes(num_nodes as u64); let node_key_map = Arc::new(keys.clone()); - let upgrade = TEST_VERSIONS.vid2; + // Match `test_upgrade_lock()` (used by `ConsensusHarness` and + // `TestRunner`) so signature commitments computed during view + // generation are byte-identical to what those harnesses verify. + // Different versions in `Upgrade::trivial` would produce + // different `VersionedVoteData` commitments, breaking cert + // signature verification across the boundary. + let upgrade = versions::Upgrade::trivial(versions::CLIQUENET_VERSION); let mut generator = TestViewGenerator::generate(membership.clone(), node_key_map.clone(), upgrade); + // Pre-build the upgrade certificate so we can attach it to the + // matching view in the patching loop below. 
+ let upgrade_cert: Option<(ViewNumber, UpgradeCertificate)> = + if let Some((target_view, ref data)) = upgrade_at_view { + let epoch_membership = membership + .membership_for_epoch(Some(EpochNumber::genesis())) + .await + .unwrap(); + let (leader_pk, leader_priv_key) = + BLSPubKey::generated_from_seed_indexed([0u8; 32], 0); + let cert = build_cert::< + TestTypes, + UpgradeProposalData, + UpgradeVote, + UpgradeCertificate, + >( + data.clone(), + &epoch_membership, + target_view, + &leader_pk, + &leader_priv_key, + &test_upgrade_lock::(), + ) + .await; + Some((target_view, cert)) + } else { + None + }; + let gen_views: Vec<_> = (&mut generator).take(num_views).collect::>().await; let mut views = Vec::new(); @@ -343,6 +398,19 @@ impl TestData { proposal.next_epoch_justify_qc = prev_new_cert2.clone(); } + // Attach the upgrade certificate at the requested view. The + // leaf's commitment will absorb the certificate; subsequent + // views must rebuild their justify_qc against the new commit + // (handled below via `prev_new_cert1`). + let upgrade_attached = upgrade_cert.as_ref().is_some_and(|(target_view, cert)| { + if *target_view == view_number { + proposal.upgrade_certificate = Some(cert.clone()); + true + } else { + false + } + }); + // Recompute leaf and commitment (may differ from generator output // when we touched justify_qc or next_drb_result). let leaf = Leaf2::from(proposal.clone()); @@ -404,7 +472,12 @@ impl TestData { // Propagate the rebuilt cert1 so the next view's justify_qc is // consistent with our updated commitment. - if needs_drb || needs_justify_update || needs_new_epoch || epoch_patched { + if needs_drb + || needs_justify_update + || needs_new_epoch + || epoch_patched + || upgrade_attached + { prev_new_cert1 = Some(cert1.clone()); } // Set prev_new_cert2 on the last block of each epoch so the diff --git a/crates/hotshot/new-protocol/src/tests/cutover.rs b/crates/hotshot/new-protocol/src/tests/cutover.rs new file mode 100644 index 00000000000..091f860d5b9 --- /dev/null +++ b/crates/hotshot/new-protocol/src/tests/cutover.rs @@ -0,0 +1,310 @@ +//! Unit tests for the legacy → new-protocol (0.4 → 0.8) cutover bridging +//! API: `Consensus::seed_pre_cutover_leaves`, +//! `Consensus::register_proposal_justify_qc`, +//! `Consensus::set_pre_cutover_anchor`, and the `pre_cutover_views` +//! grandfathering of the V2 VID-availability check in +//! `maybe_vote_2_and_update_lock`. + +use hotshot::types::{BLSPubKey, SignatureKey}; +use hotshot_example_types::node_types::TestTypes; +use hotshot_types::{ + data::{EpochNumber, ViewNumber}, + simple_vote::UpgradeProposalData, + stake_table::StakeTableEntries, + vote::{Certificate, HasViewNumber}, +}; +use versions::{CLIQUENET_VERSION, version}; + +use crate::{ + helpers::test_upgrade_lock, + tests::common::utils::{ConsensusHarness, TestData}, +}; + +/// `seed_pre_cutover_leaves` populates `proposals`, marks the view in +/// `pre_cutover_views` (probed indirectly via the public `proposal_at` +/// accessor), and registers the parent's Cert1. +#[tokio::test] +async fn test_seed_pre_cutover_leaves_populates_state() { + let mut harness = ConsensusHarness::new(0).await; + let test_data = TestData::new(3).await; + + // Seed views 1 and 2 as undecided pre-cutover leaves. + let leaves: Vec<_> = test_data + .views + .iter() + .take(2) + .map(|v| v.leaf.clone()) + .collect(); + harness.consensus.seed_pre_cutover_leaves(leaves); + + // Both seeded views have a synthesized proposal recorded. 
+ assert!( + harness + .consensus + .proposal_at(test_data.views[0].view_number) + .is_some(), + "seeded view 0 should have a proposal", + ); + assert!( + harness + .consensus + .proposal_at(test_data.views[1].view_number) + .is_some(), + "seeded view 1 should have a proposal", + ); + + // Each leaf's `justify_qc` was registered as Cert1 for the parent view. + // The oldest seeded leaf's justify_qc points to the genesis-style parent + // (its `view_number()` minus one, in TestData). + let parent_view_of_first = test_data.views[0].leaf.justify_qc().view_number(); + assert!( + harness.consensus.cert1_at(parent_view_of_first).is_some(), + "Cert1 for parent of oldest seeded leaf should be registered", + ); + + // The second leaf's justify_qc is the QC of the first seeded leaf — + // that registers Cert1 for the first seeded leaf's view. + let parent_view_of_second = test_data.views[1].leaf.justify_qc().view_number(); + assert!( + harness.consensus.cert1_at(parent_view_of_second).is_some(), + "Cert1 for first seeded leaf (= second's parent) should be registered", + ); +} + +/// `register_proposal_justify_qc` is idempotent: calling it twice with the +/// same QC doesn't replace the original entry, and a second call with a +/// different QC for the same view doesn't overwrite either. +#[tokio::test] +async fn test_register_proposal_justify_qc_idempotent() { + let mut harness = ConsensusHarness::new(0).await; + let test_data = TestData::new(2).await; + + let qc1 = test_data.views[0].cert1.clone(); + let qc1_view = qc1.view_number(); + + // First registration installs the QC. + harness.consensus.register_proposal_justify_qc(&qc1); + let after_first = harness + .consensus + .cert1_at(qc1_view) + .cloned() + .expect("Cert1 should be registered"); + + // Second registration is a no-op (or_insert_with semantics). + harness.consensus.register_proposal_justify_qc(&qc1); + let after_second = harness + .consensus + .cert1_at(qc1_view) + .cloned() + .expect("Cert1 should still be registered"); + + assert_eq!( + after_first.signatures, after_second.signatures, + "Cert1 entry should not be replaced by a second register call", + ); +} + +/// `set_pre_cutover_anchor` advances `last_decided_view` only when the +/// supplied leaf's view is strictly greater than the current anchor. +/// Lower-or-equal views are silently ignored (idempotent / safe to retry). +#[tokio::test] +async fn test_set_pre_cutover_anchor_only_advances() { + let mut harness = ConsensusHarness::new(0).await; + let test_data = TestData::new(3).await; + + // Pre-condition: harness starts at genesis (view 0). + let starting_view = harness.consensus.last_decided_view(); + + // Advance to a leaf above genesis. + let advanced_leaf = test_data.views[1].leaf.clone(); + let advanced_view = advanced_leaf.view_number(); + assert!(advanced_view > starting_view); + harness.consensus.set_pre_cutover_anchor(advanced_leaf); + assert_eq!(harness.consensus.last_decided_view(), advanced_view); + + // Calling again with an EARLIER leaf should be a no-op. + let earlier_leaf = test_data.views[0].leaf.clone(); + harness.consensus.set_pre_cutover_anchor(earlier_leaf); + assert_eq!( + harness.consensus.last_decided_view(), + advanced_view, + "anchor should not regress to earlier view", + ); +} + +/// Multi-node E2E test of the legacy → new-protocol cutover seed via +/// `TestRunner` (real Cliquenet network between five nodes). 
+/// +/// Each coordinator is seeded with a chain of pre-cutover leaves +/// **synchronously before** its run loop starts (the seed is threaded into +/// `build_test_coordinator` so it applies before `coord.start()`, avoiding +/// the startup race that would otherwise let the view-1 leader propose +/// before the seed lands). +#[tokio::test(flavor = "multi_thread")] +async fn five_nodes_decide_after_pre_cutover_seed() { + use crate::tests::common::runner::{PreCutoverSeed, TestRunner}; + + // Generate a 2-leaf pre-cutover chain (anchor at view 1, one undecided + // leaf at view 2). This is the minimum useful seed that exercises the + // entire bridging path while keeping the boundary close to genesis + // where the leader-of-view-3 can take over cleanly. + // + // Use epoch_height=100 so the leaves carry `epoch = Some(...)`, + // matching what `Leaf2::from_quorum_proposal` produces from a + // new-protocol Proposal (whose `epoch` is non-Option). Without this + // the synthesized proposals in `seed_pre_cutover_leaves` round-trip + // with `with_epoch = true` while originals have `with_epoch = false`, + // making the commitments differ. + // + // num_nodes=5 to match `TestRunner`'s default — TestData and TestRunner + // must use the same membership size or the leaf certs (signed under + // TestData's stake table) won't verify against TestRunner's stake table. + let test_data = + crate::tests::common::utils::TestData::new_with_epoch_height_and_num_nodes(2, 100, 5).await; + + let anchor = test_data.views[0].leaf.clone(); + let undecided = vec![test_data.views[1].leaf.clone()]; + let high_qc = test_data.views[1].cert1.clone(); + + let seed = PreCutoverSeed { + decided_anchor: anchor, + undecided, + high_qc, + }; + + TestRunner::builder() + .pre_cutover_seed(seed) + .target_decisions(10) + .build() + .run() + .await + .expect("network should decide past the pre-cutover boundary"); +} + +/// End-to-end legacy → new-protocol handover with a *real, quorum-signed* +/// `UpgradeCertificate` formed by aggregating BLS votes from every +/// validator in the test stake table — the same primitive the legacy +/// upgrade task uses in production. +/// +/// What this test exercises: +/// +/// 1. **Upgrade certificate formation**: an `UpgradeProposalData` +/// transitioning from `CLIQUENET_VERSION - 1` (0.7) to +/// `CLIQUENET_VERSION` (0.8) is signed by every validator and +/// aggregated into an `UpgradeCertificate`. The cert verifies under +/// the same `EpochMembership` that signed it (via +/// `Certificate::is_valid_cert`). +/// +/// 2. **Embedding in the legacy chain**: the certificate is attached to +/// the leaf at view 2 — exactly where the legacy upgrade task would +/// embed it once enough votes form. `TestData::new_with_upgrade` +/// re-derives that leaf's commit and re-signs the chain's `cert1` / +/// `cert2` so the chain remains internally consistent. +/// +/// 3. **Cutover hand-over**: the chain (anchor view 1, undecided view 2) +/// plus its high QC seed five new-protocol coordinators on a real +/// Cliquenet network. The leaf at view 2 carries the upgrade +/// certificate forward via +/// `Consensus::seed_pre_cutover_leaves`. +/// +/// 4. **Post-cutover progress**: the new protocol takes over at view 3 +/// (= `new_version_first_view`) and decides through view 10 — proving +/// that with a properly formed upgrade certificate decided in the +/// legacy chain, the handover lets the new protocol extend the chain +/// past the boundary. 
+/// +/// What this test does *not* exercise: +/// +/// - Live legacy HotShot consensus rounds: the chain is generated +/// deterministically rather than driven by a running `SystemContext`. +/// The validator keys, signatures, certificate aggregation, and chain +/// shape are all real, so the upgrade cert is indistinguishable from +/// one formed by a live cluster — the only thing absent is the +/// wall-clock view advance. +#[tokio::test(flavor = "multi_thread")] +async fn upgrade_certificate_handover() { + use crate::tests::common::runner::{PreCutoverSeed, TestRunner}; + + let num_nodes = 5; + let num_views = 2; + // The cert says the legacy version's last view is `num_views` and + // the new version begins at `num_views + 1`. The new-protocol + // coordinators take over at view `num_views + 1` (view 3 here). + let pre_cliquenet = version(CLIQUENET_VERSION.major, CLIQUENET_VERSION.minor - 1); + let upgrade_data = UpgradeProposalData { + old_version: pre_cliquenet, + new_version: CLIQUENET_VERSION, + decide_by: ViewNumber::new(1), + new_version_hash: vec![0u8; 12], + old_version_last_view: ViewNumber::new(num_views as u64), + new_version_first_view: ViewNumber::new(num_views as u64 + 1), + }; + + // Generate a 2-leaf legacy chain. The leaf at view 2 carries a + // properly signed `UpgradeCertificate` — TestData::new_with_upgrade + // calls `build_cert` (the same helper used by the upgrade task in + // production legacy hotshot) to aggregate votes from every + // validator in the membership. + let upgrade_view = ViewNumber::new(num_views as u64); + let test_data = TestData::new_with_upgrade( + num_views, + 100, + num_nodes, + Some((upgrade_view, upgrade_data.clone())), + ) + .await; + + // Sanity: the leaf at view 2 actually carries the cert, and the + // certificate aggregates a quorum-threshold signature that + // verifies against the test stake table. + let upgraded_leaf = &test_data.views[1].leaf; + let cert_opt = upgraded_leaf.upgrade_certificate(); + let cert = cert_opt + .as_ref() + .expect("upgrade certificate should be embedded in legacy chain"); + assert_eq!(cert.data.new_version, CLIQUENET_VERSION); + assert_eq!(cert.data.old_version, pre_cliquenet); + assert_eq!( + cert.data.new_version_first_view, + ViewNumber::new(num_views as u64 + 1) + ); + + // Verify the upgrade cert against the same membership that signed it. + // This proves we have a real, quorum-signed certificate — not just a + // structurally-valid struct. + let public_key = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0).0; + let (membership, ..) = + crate::tests::common::utils::mock_membership_with_client(num_nodes, 100, public_key).await; + let epoch_membership = membership + .membership_for_epoch(Some(EpochNumber::genesis())) + .await + .unwrap(); + let stake_entries = + StakeTableEntries::::from(epoch_membership.stake_table().await).0; + let threshold = epoch_membership.upgrade_threshold().await; + cert.is_valid_cert(&stake_entries, threshold, &test_upgrade_lock::()) + .expect("upgrade certificate should verify against the validator stake table"); + + // Hand the legacy chain to a new-protocol cluster: anchor at view 1 + // (decided in the legacy protocol), undecided leaves = [view 2] + // (the upgrade cert is on view 2, so any node that sees the + // post-cutover seed receives it). 
+ let anchor = test_data.views[0].leaf.clone(); + let undecided = vec![test_data.views[1].leaf.clone()]; + let high_qc = test_data.views[1].cert1.clone(); + + let seed = PreCutoverSeed { + decided_anchor: anchor, + undecided, + high_qc, + }; + + TestRunner::builder() + .num_nodes(num_nodes) + .pre_cutover_seed(seed) + .target_decisions(10) + .build() + .run() + .await + .expect("new protocol should decide past the upgrade boundary"); +} diff --git a/crates/hotshot/new-protocol/src/tests/legacy_handover.rs b/crates/hotshot/new-protocol/src/tests/legacy_handover.rs new file mode 100644 index 00000000000..f261f7fbbd1 --- /dev/null +++ b/crates/hotshot/new-protocol/src/tests/legacy_handover.rs @@ -0,0 +1,756 @@ +//! End-to-end handover test mirroring `SequencerContext::init`: per +//! node, a legacy `SystemContext` (MemoryNetwork) and a new-protocol +//! `Coordinator` (Cliquenet) run concurrently. A per-node watcher polls +//! [`harvest::try_perform_handover`] — the same trigger +//! `ConsensusHandle::new_protocol` uses — so the seed flows through +//! `ClientApi::seed_pre_cutover` and into the coordinator's +//! `SeedPreCutover` handler, the only seeding path production uses. + +use std::{ + collections::{BTreeMap, BTreeSet}, + net::Ipv4Addr, + sync::Arc, + time::{Duration, Instant}, +}; + +use async_lock::RwLock; +use committable::Committable; +use hotshot::{ + HotShotInitializer, SystemContext, + types::{BLSPubKey, SystemContextHandle}, +}; +use hotshot_example_types::{ + membership::TestableMembership, + node_types::{MemoryImpl, TestTypes}, + state_types::TestInstanceState, +}; +use hotshot_testing::{ + block_builder::{SimpleBuilderImplementation, TestBuilderImplementation}, + test_builder::TestDescription, +}; +use hotshot_types::{ + PeerConnectInfo, ValidatorConfig, + addr::NetAddr, + consensus::ConsensusMetricsValue, + data::ViewNumber, + epoch_membership::EpochMembershipCoordinator, + storage_metrics::StorageMetricsValue, + traits::{ + election::Membership, leaf_fetcher_network::ConnectedNetworkLeafFetcher, + node_implementation::NodeType, signature_key::SignatureKey, + }, + x25519::Keypair, +}; +use tokio::{ + sync::mpsc::{self, UnboundedSender}, + task::AbortHandle, + time::sleep, +}; +use url::Url; +use versions::{CLIQUENET_VERSION, Upgrade, version}; + +use crate::{ + client::ClientApi, + consensus::ConsensusOutput, + coordinator::{Coordinator, CoordinatorOutput, error::Severity, timer::Timer}, + harvest::{forward_legacy_timeout_votes, try_perform_handover}, + helpers::test_upgrade_lock, + network::cliquenet::Cliquenet, + outbox::Outbox, + tests::common::utils::mock_membership_with_client, +}; + +const UPGRADE_VIEW: u64 = 5; +const EPOCH_HEIGHT: u64 = 1000; +/// Default new-protocol view timeout. Long enough that the view-0 +/// timer doesn't fire during the legacy phase. Tests that exercise a +/// post-cutover timeout can override via the `view_timeout` argument. 
+const DEFAULT_NEW_PROTO_VIEW_TIMEOUT: Duration = Duration::from_secs(60); + +async fn spawn_legacy_cluster( + num_nodes: usize, + upgrade_view: u64, +) -> Vec> { + let pre_cliquenet = version(CLIQUENET_VERSION.major, CLIQUENET_VERSION.minor - 1); + let mut metadata: TestDescription = + TestDescription::default_multiple_rounds(); + metadata = metadata.set_num_nodes(num_nodes as u64, num_nodes as u64); + metadata.upgrade = Upgrade::new(pre_cliquenet, CLIQUENET_VERSION); + metadata.upgrade_view = Some(upgrade_view); + metadata.test_config.epoch_height = EPOCH_HEIGHT; + metadata.test_config.set_view_upgrade(upgrade_view); + // Tighten cutover offsets; defaults push it ~20 views out. + metadata.test_config.upgrade_propose_offset = Some(1); + metadata.test_config.upgrade_decide_by_offset = Some(10); + metadata.test_config.upgrade_begin_offset = Some(12); + metadata.test_config.upgrade_finish_offset = Some(15); + + // SimpleBuilder HTTP server — without it the upgrade task never + // sees proposals. Leaked so it outlives the test run. + let port = test_utils::reserve_tcp_port().expect("port"); + let builder_url = Url::parse(&format!("http://localhost:{port}")).expect("url"); + let builder_task = + >::start( + num_nodes, + builder_url.clone(), + (), + Default::default(), + ) + .await; + Box::leak(Box::new(builder_task)); + + let launcher = metadata.gen_launcher(); + let url_for_config = builder_url; + let launcher = launcher.map_hotshot_config(move |config| { + config.builder_urls = vec1::vec1![url_for_config.clone()]; + }); + + let mut handles = Vec::with_capacity(num_nodes); + for node_id in 0..num_nodes as u64 { + let network = (launcher.resource_generators.channel_generator)(node_id).await; + let storage = (launcher.resource_generators.storage)(node_id); + let hotshot_config = (launcher.resource_generators.hotshot_config)(node_id); + + let is_da = node_id < hotshot_config.da_staked_committee_size as u64; + let validator_config: ValidatorConfig = + ValidatorConfig::generated_from_seed_indexed( + [0u8; 32], + node_id, + launcher.metadata.node_stakes.get(node_id), + is_da, + ); + let public_key = validator_config.public_key; + let mut membership = ::Membership::new( + hotshot_config.known_nodes_with_stake.clone(), + hotshot_config.known_da_nodes.clone(), + public_key, + launcher.metadata.test_config.epoch_height, + ); + let external_chan = async_broadcast::broadcast(64); + membership.set_leaf_fetcher( + Arc::new(ConnectedNetworkLeafFetcher::::new( + Arc::clone(&network), + )), + storage.clone(), + public_key, + external_chan.1.new_receiver(), + ); + let memberships = Arc::new(RwLock::new(membership)); + let coordinator = + EpochMembershipCoordinator::new(memberships, hotshot_config.epoch_height, &storage); + + let initializer = HotShotInitializer::::from_genesis( + TestInstanceState::default(), + launcher.metadata.test_config.epoch_height, + launcher.metadata.test_config.epoch_start_block, + vec![], + launcher.metadata.upgrade, + ) + .await + .expect("initializer"); + + let hotshot = SystemContext::::new( + public_key, + validator_config.private_key.clone(), + validator_config.state_private_key.clone(), + node_id, + hotshot_config, + launcher.metadata.upgrade, + coordinator, + network, + initializer, + ConsensusMetricsValue::default(), + storage, + StorageMetricsValue::default(), + ) + .await; + + let handle = hotshot.run_tasks().await; + handles.push(handle); + } + handles +} + +/// Per-node `(x25519 keypair, BLS public key, Cliquenet addr)`, +/// deterministic from the same seed the legacy 
cluster uses so both +/// stacks share BLS identities (one validator key, two transports). +fn build_parties(num_nodes: usize) -> Vec<(Keypair, BLSPubKey, NetAddr)> { + (0..num_nodes) + .map(|i| { + let (pk, sk) = BLSPubKey::generated_from_seed_indexed([0u8; 32], i as u64); + let kp = Keypair::derive_from::(&sk).unwrap(); + let port = test_utils::reserve_tcp_port().expect("port"); + let addr = NetAddr::Inet(Ipv4Addr::LOCALHOST.into(), port); + (kp, pk, addr) + }) + .collect() +} + +async fn build_new_protocol_network( + i: usize, + parties: &[(Keypair, BLSPubKey, NetAddr)], + lock: &hotshot_types::message::UpgradeLock, +) -> Cliquenet { + let peer_infos: Vec<(BLSPubKey, PeerConnectInfo)> = parties + .iter() + .map(|(kp, pk, addr)| { + ( + *pk, + PeerConnectInfo { + x25519_key: kp.public_key(), + p2p_addr: addr.clone(), + }, + ) + }) + .collect(); + let config = cliquenet::Config::builder() + .name("legacy-handover") + .keypair(parties[i].0.clone().into()) + .bind(parties[i].2.clone()) + .random_connect_delay(false) + .parties( + peer_infos + .iter() + .map(|(_, info)| (info.x25519_key.into(), info.p2p_addr.clone())), + ) + .build(); + Cliquenet::create_with_config(parties[i].1, lock.clone(), config, peer_infos.clone()) + .await + .expect("cliquenet creation should succeed") +} + +/// Production-shaped `Coordinator` build (mirrors `Coordinator::maker`): +/// no `seed_genesis`, no `coord.start()`, no inline pre-cutover seeding. +/// Boots at view 0 and waits for the seed via `ClientApi`. The pre-built +/// `CoordinatorClient` is shared with the membership's leaf fetcher. +#[allow(clippy::too_many_arguments)] +async fn build_handover_coordinator( + node_index: u64, + network: Cliquenet, + membership: EpochMembershipCoordinator, + storage: hotshot_example_types::storage_types::TestStorage, + client: crate::client::CoordinatorClient, + epoch_height: u64, + view_timeout: Duration, +) -> Coordinator< + TestTypes, + Cliquenet, + hotshot_example_types::storage_types::TestStorage, +> { + use hotshot_example_types::{node_types::TEST_VERSIONS, state_types::TestValidatedState}; + use hotshot_types::{data::Leaf2, light_client::StateKeyPair}; + + use crate::{ + block::{BlockBuilder, BlockBuilderConfig}, + consensus::Consensus, + epoch::EpochManager, + epoch_root_vote_collector::EpochRootVoteCollector, + proposal::ProposalValidator, + state::StateManager, + vid::{VidDisperser, VidReconstructor}, + vote::VoteCollector, + }; + + let (public_key, private_key) = BLSPubKey::generated_from_seed_indexed([0; 32], node_index); + let state_key_pair = StateKeyPair::generate_from_seed_indexed([0u8; 32], node_index); + let state_private_key = state_key_pair.sign_key_ref().clone(); + let instance = Arc::new(TestInstanceState::default()); + let upgrade_lock = test_upgrade_lock(); + + // Throwaway view-0 anchor; the seed advances past it later. 
+ let genesis_state = TestValidatedState::default(); + let genesis_leaf = + Leaf2::::genesis(&genesis_state, &instance, TEST_VERSIONS.test.base).await; + + let consensus = Consensus::new( + membership.clone(), + public_key, + private_key.clone(), + state_private_key, + 10, + upgrade_lock.clone(), + genesis_leaf, + epoch_height, + ); + + let state_manager = StateManager::new(instance.clone(), upgrade_lock.clone()); + + let block_builder = BlockBuilder::new( + instance.clone(), + membership.clone(), + BlockBuilderConfig::default(), + upgrade_lock.clone(), + ); + + let proposal_validator = + ProposalValidator::new(membership.clone(), epoch_height, upgrade_lock.clone()); + + Coordinator::builder() + .consensus(consensus) + .network(network) + .state_manager(state_manager) + .vote1_collector(VoteCollector::new(membership.clone(), upgrade_lock.clone())) + .vote2_collector(VoteCollector::new(membership.clone(), upgrade_lock.clone())) + .timeout_collector(VoteCollector::new(membership.clone(), upgrade_lock.clone())) + .timeout_one_honest_collector(VoteCollector::new(membership.clone(), upgrade_lock.clone())) + .checkpoint_collector(VoteCollector::new(membership.clone(), upgrade_lock.clone())) + .epoch_root_collector(EpochRootVoteCollector::new( + membership.clone(), + upgrade_lock.clone(), + )) + .vid_disperser(VidDisperser::new(membership.clone())) + .vid_reconstructor(VidReconstructor::new()) + .epoch_manager(EpochManager::new(epoch_height, membership.clone())) + .block_builder(block_builder) + .proposal_validator(proposal_validator) + .storage(crate::storage::Storage::new(storage, private_key)) + .client(client) + .membership_coordinator(membership) + .outbox(Outbox::new()) + .timer(Timer::new( + view_timeout, + ViewNumber::genesis(), + hotshot_types::data::EpochNumber::genesis(), + )) + .public_key(public_key) + .build() +} + +#[derive(Clone, Debug)] +struct DecisionEvent { + view: ViewNumber, + commit: [u8; 32], +} + +/// Mirror of `consensus_handle::run_coordinator`: drive the coordinator +/// loop, forward `ExternalMessageReceived` to the leaf-fetcher channel, +/// and report decided views to `decision_tx`. +async fn run_handover_node( + mut coord: Coordinator< + TestTypes, + Cliquenet, + hotshot_example_types::storage_types::TestStorage, + >, + decision_tx: UnboundedSender, + external_events_tx: async_broadcast::Sender>, +) { + use hotshot_types::event::{Event, EventType}; + + loop { + match coord.next_consensus_input().await { + Ok(input) => coord.apply_consensus(input).await, + Err(err) if err.severity == Severity::Critical => { + tracing::error!(%err, "handover coord: critical error"); + return; + }, + Err(err) => tracing::warn!(%err, "handover coord: non-critical error"), + } + + while let Some(output) = coord.outbox_mut().pop_front() { + if let ConsensusOutput::LeafDecided { leaves, .. 
} = &output { + for leaf in leaves { + let _ = decision_tx.send(DecisionEvent { + view: leaf.view_number(), + commit: leaf.commit().into(), + }); + } + } + if let Err(err) = coord.process_consensus_output(output).await + && err.severity == Severity::Critical + { + tracing::error!(%err, "handover coord: critical error processing output"); + return; + } + } + + while let Some(output) = coord.coordinator_outbox_mut().pop_front() { + if let CoordinatorOutput::ExternalMessageReceived { sender, data } = output { + let _ = external_events_tx + .broadcast_direct(Event { + view_number: coord.current_view(), + event: EventType::ExternalMessageReceived { sender, data }, + }) + .await; + } + } + } +} + +/// Polls [`try_perform_handover`] until the cutover crosses. Production +/// triggers the same call lazily from any `ConsensusHandle` method; +/// the test polls because nothing else exercises the gate. +async fn handover_watcher( + legacy: Arc>>, + client_api: ClientApi, +) { + loop { + let crossed = { + let guard = legacy.read().await; + try_perform_handover(&guard, &client_api).await + }; + if crossed { + return; + } + sleep(Duration::from_millis(100)).await; + } +} + +/// Build a new-protocol coordinator + spawn its runner + watcher + +/// timeout-vote forwarder for one node — exactly the bundle +/// `ConsensusHandle::new` spawns in production. +async fn spawn_node( + i: usize, + num_nodes: usize, + view_timeout: Duration, + parties: &[(Keypair, BLSPubKey, NetAddr)], + new_proto_lock: &hotshot_types::message::UpgradeLock, + legacy: Arc>>, + bg_handles: &mut Vec, +) -> NodeState { + let network = build_new_protocol_network(i, parties, new_proto_lock).await; + let (membership, storage, client, external_events_tx) = + mock_membership_with_client(num_nodes, EPOCH_HEIGHT, parties[i].1).await; + + let coord = build_handover_coordinator( + i as u64, + network, + membership, + storage, + client, + EPOCH_HEIGHT, + view_timeout, + ) + .await; + + let client_api = coord.client_api().clone(); + + let legacy_event_rx = legacy.read().await.event_stream_known_impl().deactivate(); + bg_handles.push( + tokio::spawn(forward_legacy_timeout_votes( + legacy_event_rx, + client_api.clone(), + )) + .abort_handle(), + ); + + bg_handles.push(tokio::spawn(handover_watcher(legacy, client_api.clone())).abort_handle()); + + let (decision_tx, decision_rx) = mpsc::unbounded_channel::(); + let runner_abort = + tokio::spawn(run_handover_node(coord, decision_tx, external_events_tx)).abort_handle(); + + NodeState { + decision_rx, + runner_abort, + } +} + +struct NodeState { + decision_rx: mpsc::UnboundedReceiver, + runner_abort: AbortHandle, +} + +/// A node held silent for the test, with its shutdown timed by view. +struct SilentNode { + /// Index of the legacy node to shut down. + idx: usize, + /// Wait until any non-silent legacy node's `cur_view` reaches this + /// view, then shut down the silent node. Setting `idx=3` and + /// `at_view=18` (with num_nodes=4, cutover=20) makes view 19 + /// (leader=node 3) time out cluster-wide. + at_view: ViewNumber, +} + +/// Run a handover scenario: spin up legacy + new-protocol clusters +/// per node, optionally silence nodes at either layer on per-view +/// triggers, then verify every non-silent node decides +/// `target_decisions` post-cutover views and they all agree on the +/// commits. 
+/// +/// `silent_nodes[i]` takes node `silent_nodes[i].idx` fully offline — +/// shutting down its legacy `SystemContext` AND aborting its +/// new-protocol `Coordinator` runner — once any other node's legacy +/// `cur_view` reaches `silent_nodes[i].at_view`. Models a node that +/// has crashed or been disconnected at the trigger view; as in +/// production, a node is either fully online or fully offline. +/// +/// `num_nodes` must satisfy supermajority thresholds with the silent +/// nodes excluded — i.e. +/// `num_nodes - silent_nodes.len() >= (2*num_nodes/3) + 1`. For +/// example, `num_nodes = 4` tolerates one silent node (3 >= 3) and +/// `num_nodes = 7` tolerates two (5 >= 5). +async fn run_handover_test( + num_nodes: usize, + target_decisions: usize, + deadline: Duration, + view_timeout: Duration, + silent_nodes: Vec<SilentNode>, +) { + crate::logging::init_test_logging(); + + let parties = build_parties(num_nodes); + let new_proto_lock = test_upgrade_lock(); + + let legacy_handles = spawn_legacy_cluster(num_nodes, UPGRADE_VIEW).await; + let legacy_arcs: Vec<Arc<RwLock<_>>> = legacy_handles + .into_iter() + .map(|h| Arc::new(RwLock::new(h))) + .collect(); + + // Both stacks alive concurrently from here — same shape as + // `SequencerContext::init`. + let mut bg_handles: Vec<AbortHandle> = Vec::new(); + let mut node_state: Vec<NodeState> = Vec::with_capacity(num_nodes); + for (i, legacy_arc) in legacy_arcs.iter().enumerate() { + node_state.push( + spawn_node( + i, + num_nodes, + view_timeout, + &parties, + &new_proto_lock, + legacy_arc.clone(), + &mut bg_handles, + ) + .await, + ); + } + + for silent in &silent_nodes { + bg_handles.push(spawn_silence_at_view( + &legacy_arcs, + silent, + node_state[silent.idx].runner_abort.clone(), + )); + } + + for legacy in &legacy_arcs { + legacy.read().await.hotshot.start_consensus().await; + } + + let silent_idxs: BTreeSet<usize> = silent_nodes.iter().map(|s| s.idx).collect(); + let live_indices: Vec<usize> = (0..num_nodes) + .filter(|i| !silent_idxs.contains(i)) + .collect(); + let mut decided_per_node: Vec<BTreeMap<ViewNumber, [u8; 32]>> = + vec![BTreeMap::new(); num_nodes]; + let deadline = Instant::now() + deadline; + while !live_indices + .iter() + .all(|i| decided_per_node[*i].len() >= target_decisions) + { + if Instant::now() > deadline { + for (i, m) in decided_per_node.iter().enumerate() { + tracing::error!( + node = i, + decided = m.len(), + views = ?m.keys().map(|v| **v).collect::<Vec<_>>(), + "node decisions at deadline", + ); + } + panic!("live nodes did not reach the post-cutover decision target in time"); + } + for (i, ns) in node_state.iter_mut().enumerate() { + while let Ok(ev) = ns.decision_rx.try_recv() { + if decided_per_node[i].insert(ev.view, ev.commit).is_none() { + tracing::info!(node = i, view = *ev.view, "new-protocol decided leaf"); + } + } + } + sleep(Duration::from_millis(50)).await; + } + + // Cross-check commits for every shared decided view among live nodes + // — catches forks the per-node counter alone misses.
+ let live_decided: Vec<&BTreeMap> = + live_indices.iter().map(|i| &decided_per_node[*i]).collect(); + let common_views: BTreeSet = + live_decided + .iter() + .skip(1) + .fold(live_decided[0].keys().copied().collect(), |acc, m| { + acc.intersection(&m.keys().copied().collect()) + .copied() + .collect() + }); + assert!( + common_views.len() >= target_decisions, + "live nodes do not agree on enough decided views: common={} target={target_decisions}", + common_views.len(), + ); + for view in &common_views { + let commit = live_decided[0][view]; + for (k, m) in live_decided.iter().enumerate().skip(1) { + assert_eq!( + m[view], commit, + "live node {} decided a different leaf than live node 0 at view {}", + live_indices[k], **view + ); + } + } + + for w in bg_handles { + w.abort(); + } + for ns in &node_state { + ns.runner_abort.abort(); + } + for legacy in legacy_arcs { + legacy.write().await.shut_down().await; + } +} + +/// Poll non-silent legacy `cur_view`s until one reaches `target_view`. +/// Returns when crossed; panics if `timeout` elapses first. +async fn await_legacy_view( + watch: &[Arc>>], + target_view: ViewNumber, + timeout: Duration, +) { + let deadline = Instant::now() + timeout; + loop { + if Instant::now() > deadline { + panic!("watcher did not observe cur_view >= {target_view} in time"); + } + for legacy in watch { + if legacy.read().await.cur_view().await >= target_view { + return; + } + } + sleep(Duration::from_millis(20)).await; + } +} + +/// Watcher that takes `silent.idx` fully offline — shuts down its +/// legacy `SystemContext` AND aborts its new-protocol coordinator — +/// once any non-silent node's legacy `cur_view` reaches `silent.at_view`. +fn spawn_silence_at_view( + legacy_arcs: &[Arc>>], + silent: &SilentNode, + runner_abort: AbortHandle, +) -> AbortHandle { + let watch: Vec<_> = legacy_arcs + .iter() + .enumerate() + .filter(|(i, _)| *i != silent.idx) + .map(|(_, l)| l.clone()) + .collect(); + let target = silent.idx; + let target_view = silent.at_view; + let target_legacy = legacy_arcs[silent.idx].clone(); + tokio::spawn(async move { + await_legacy_view(&watch, target_view, Duration::from_secs(120)).await; + runner_abort.abort(); + target_legacy.write().await.shut_down().await; + tracing::info!(node = target, at_view = *target_view, "took node offline"); + }) + .abort_handle() +} + +/// Predicted cutover view = `upgrade_view + upgrade_finish_offset`. Used +/// by timeout scenarios that need to know which node leads which view +/// before the cluster actually decides the upgrade cert. +const PREDICTED_CUTOVER_VIEW: u64 = UPGRADE_VIEW + 15; + +/// End-to-end happy-path handover: legacy + new-protocol clusters run +/// concurrently, the upgrade cert decides naturally, and the new +/// protocol takes over via the seed-bootstrap path. +#[tokio::test(flavor = "multi_thread")] +async fn legacy_runs_upgrade_then_new_protocol_takes_over() { + run_handover_test( + 4, + 6, + Duration::from_secs(180), + DEFAULT_NEW_PROTO_VIEW_TIMEOUT, + Vec::new(), + ) + .await; +} + +/// Timeout-bridge handover: the leader of the last legacy view +/// (`cutover_view - 1`) is shut down one view *before* it would propose, +/// so that view times out cluster-wide. Active legacy nodes emit +/// `TimeoutVote2`s; the bridge forwards them; the new-protocol +/// coordinator rebroadcasts on cliquenet; `TimeoutCertificate2` forms; +/// the first 0.8 leader uses the TC as view-change evidence; and the +/// network decides past the cutover with one validator down. 
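+///
+/// The forwarding loop under test is `harvest::forward_legacy_timeout_votes`
+/// (spawned per node in `spawn_node`). A minimal sketch of its shape, with
+/// `submit_timeout_vote` as a hypothetical stand-in for whatever `ClientApi`
+/// entry point the real implementation calls:
+///
+/// ```text
+/// while let Ok(event) = legacy_events.recv().await {
+///     if let EventType::LegacyTimeoutVoteEmitted { vote } = event.event {
+///         // `TimeoutVote2` and its version-tagged signature commitment are
+///         // shared by both protocols, so the vote is forwarded as-is.
+///         client_api.submit_timeout_vote(vote).await;
+///     }
+/// }
+/// ```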
+#[tokio::test(flavor = "multi_thread")] +async fn legacy_last_view_times_out_then_new_protocol_takes_over() { + const NUM_NODES: usize = 4; + let silent_idx = ((PREDICTED_CUTOVER_VIEW - 1) as usize) % NUM_NODES; + run_handover_test( + NUM_NODES, + 6, + Duration::from_secs(180), + DEFAULT_NEW_PROTO_VIEW_TIMEOUT, + vec![SilentNode { + idx: silent_idx, + at_view: ViewNumber::new(PREDICTED_CUTOVER_VIEW - 2), + }], + ) + .await; +} + +/// View-sync handover: the leaders of the **two** views right before +/// the cutover (`cutover_view - 2` and `cutover_view - 1`) are both +/// silent, so two consecutive legacy views time out at the boundary. +/// Bumped to 7 nodes — the BFT supermajority threshold for n=7 is 5, +/// so silencing 2 leaves exactly quorum on the live nodes. The bridge +/// forwards two batches of `TimeoutVote2`s; the new-protocol forms TC2s +/// for both views; `handle_timeout_certificate` advances through the +/// sequence; and the first 0.8 leader proposes against `locked_cert` +/// and the latest TC. +#[tokio::test(flavor = "multi_thread")] +async fn legacy_two_views_view_sync_then_new_protocol_takes_over() { + const NUM_NODES: usize = 7; + let silent_n_minus_2 = ((PREDICTED_CUTOVER_VIEW - 2) as usize) % NUM_NODES; + let silent_n_minus_1 = ((PREDICTED_CUTOVER_VIEW - 1) as usize) % NUM_NODES; + let trigger = ViewNumber::new(PREDICTED_CUTOVER_VIEW - 3); + run_handover_test( + NUM_NODES, + 6, + Duration::from_secs(240), + DEFAULT_NEW_PROTO_VIEW_TIMEOUT, + vec![ + SilentNode { + idx: silent_n_minus_2, + at_view: trigger, + }, + SilentNode { + idx: silent_n_minus_1, + at_view: trigger, + }, + ], + ) + .await; +} + +/// New-protocol first-leader timeout: the leader of `cutover_view` (= +/// the first post-cutover view) goes offline right at `cutover_view`. +/// Trigger is set to `cutover_view` so that view (cutover_view - 1) +/// has already QC'd in the legacy (its votes go to leader-of-cutover_view +/// = the silent node, who must be alive long enough to aggregate them). +/// After silence: legacy view `cutover_view` times out (TC routed to +/// leader of cutover_view+1, alive); legacy advances; alive watchers +/// seed the new-protocol cluster; new-protocol view `cutover_view` +/// also times out (silent leader); new-proto-native TC2 forms on +/// cliquenet (no bridge involved); leader of `cutover_view + 1` +/// proposes; the network decides. +/// +/// Bumped to 7 nodes so the silent leader (= leader of view 20) only +/// rotates back every 7 views, keeping the number of expensive 60s +/// new-protocol timeouts bounded within the deadline. 
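+///
+/// The arithmetic behind "rotates back every 7 views", assuming the test
+/// membership's round-robin leader election (`leader(v) = v % num_nodes`,
+/// the same rule the scenarios above use to pick `silent_idx`):
+///
+/// ```text
+/// PREDICTED_CUTOVER_VIEW = UPGRADE_VIEW + finish_offset = 5 + 15 = 20
+/// silent_idx = 20 % 7 = 6          // leader of the first 0.8 view
+/// // Node 6 next leads views 27, 34, ..., so at most one view in every
+/// // seven can burn a full 60s new-protocol timeout.
+/// ```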
+#[tokio::test(flavor = "multi_thread")] +async fn new_protocol_first_leader_offline_then_recovers() { + const NUM_NODES: usize = 7; + let silent_idx = (PREDICTED_CUTOVER_VIEW as usize) % NUM_NODES; + run_handover_test( + NUM_NODES, + 6, + Duration::from_secs(240), + DEFAULT_NEW_PROTO_VIEW_TIMEOUT, + vec![SilentNode { + idx: silent_idx, + at_view: ViewNumber::new(PREDICTED_CUTOVER_VIEW), + }], + ) + .await; +} diff --git a/crates/hotshot/task-impls/src/consensus/handlers.rs b/crates/hotshot/task-impls/src/consensus/handlers.rs index 0d6783fcb09..00e4c46aa17 100644 --- a/crates/hotshot/task-impls/src/consensus/handlers.rs +++ b/crates/hotshot/task-impls/src/consensus/handlers.rs @@ -493,7 +493,28 @@ pub(crate) async fn handle_timeout .wrap() .context(error!("Failed to sign TimeoutData"))?; - broadcast_event(Arc::new(HotShotEvent::TimeoutVoteSend(vote)), sender).await; + broadcast_event( + Arc::new(HotShotEvent::TimeoutVoteSend(vote.clone())), + sender, + ) + .await; + + // Forward the same vote to any external listener so the espresso bridge + // can submit it to the new-protocol coordinator at the legacy → 0.8 + // upgrade boundary. Only emit when a target upgrade is decided (i.e. + // we know a cutover is coming) to avoid spurious events in normal + // operation. The check is cheap; the event payload is small. + if task_state.upgrade_lock.decided_upgrade_cert().is_some() { + broadcast_event( + Event { + view_number, + event: EventType::LegacyTimeoutVoteEmitted { vote: vote.clone() }, + }, + &task_state.output_event_stream, + ) + .await; + } + broadcast_event( Event { view_number, diff --git a/crates/hotshot/task-impls/src/consensus/mod.rs b/crates/hotshot/task-impls/src/consensus/mod.rs index 80e33bae385..f7807e0a1b6 100644 --- a/crates/hotshot/task-impls/src/consensus/mod.rs +++ b/crates/hotshot/task-impls/src/consensus/mod.rs @@ -279,6 +279,9 @@ impl> TaskState for ConsensusTaskS sender: &Sender>, _receiver: &Receiver>, ) -> Result<()> { + if self.upgrade_lock.new_protocol_active(self.cur_view) { + return Ok(()); + } self.handle(event, sender.clone()).await } diff --git a/crates/hotshot/task-impls/src/da.rs b/crates/hotshot/task-impls/src/da.rs index 9e07b63b82b..0c2021e4855 100644 --- a/crates/hotshot/task-impls/src/da.rs +++ b/crates/hotshot/task-impls/src/da.rs @@ -533,6 +533,9 @@ impl> TaskState for DaTaskState>, _receiver: &Receiver>, ) -> Result<()> { + if self.upgrade_lock.new_protocol_active(self.cur_view) { + return Ok(()); + } self.handle(event, sender.clone()).await } diff --git a/crates/hotshot/task-impls/src/quorum_proposal/mod.rs b/crates/hotshot/task-impls/src/quorum_proposal/mod.rs index 4268d839b45..554513036c1 100644 --- a/crates/hotshot/task-impls/src/quorum_proposal/mod.rs +++ b/crates/hotshot/task-impls/src/quorum_proposal/mod.rs @@ -763,6 +763,12 @@ impl> TaskState sender: &Sender>, receiver: &Receiver>, ) -> Result<()> { + if self + .upgrade_lock + .new_protocol_active(self.latest_proposed_view) + { + return Ok(()); + } self.handle(event, receiver.clone(), sender.clone()).await } diff --git a/crates/hotshot/task-impls/src/quorum_proposal_recv/mod.rs b/crates/hotshot/task-impls/src/quorum_proposal_recv/mod.rs index 8fceff60c5e..8e1330c0d1d 100644 --- a/crates/hotshot/task-impls/src/quorum_proposal_recv/mod.rs +++ b/crates/hotshot/task-impls/src/quorum_proposal_recv/mod.rs @@ -231,6 +231,9 @@ impl> TaskState sender: &Sender>, receiver: &Receiver>, ) -> Result<()> { + if self.upgrade_lock.new_protocol_active(self.cur_view) { + return Ok(()); + } self.handle(event, 
sender.clone(), receiver.clone()).await; Ok(()) diff --git a/crates/hotshot/task-impls/src/quorum_vote/mod.rs b/crates/hotshot/task-impls/src/quorum_vote/mod.rs index a0db5aaf150..24651ce25ec 100644 --- a/crates/hotshot/task-impls/src/quorum_vote/mod.rs +++ b/crates/hotshot/task-impls/src/quorum_vote/mod.rs @@ -852,6 +852,12 @@ impl> TaskState for QuorumVoteTask sender: &Sender>, receiver: &Receiver>, ) -> Result<()> { + if self + .upgrade_lock + .new_protocol_active(self.latest_voted_view) + { + return Ok(()); + } self.handle(event, receiver.clone(), sender.clone()).await } diff --git a/crates/hotshot/task-impls/src/transactions.rs b/crates/hotshot/task-impls/src/transactions.rs index 146ae7d97b0..9e15e8c2b53 100644 --- a/crates/hotshot/task-impls/src/transactions.rs +++ b/crates/hotshot/task-impls/src/transactions.rs @@ -729,6 +729,9 @@ impl TaskState for TransactionTaskState { sender: &Sender>, _receiver: &Receiver>, ) -> Result<()> { + if self.upgrade_lock.new_protocol_active(self.cur_view) { + return Ok(()); + } self.handle(event, sender.clone()).await } diff --git a/crates/hotshot/task-impls/src/upgrade.rs b/crates/hotshot/task-impls/src/upgrade.rs index 83701d0b356..6071c9ab241 100644 --- a/crates/hotshot/task-impls/src/upgrade.rs +++ b/crates/hotshot/task-impls/src/upgrade.rs @@ -87,6 +87,18 @@ pub struct UpgradeTaskState { /// Unix time in seconds at which we stop voting on an upgrade pub stop_voting_time: u64, + /// Override for `UpgradeConstants::propose_offset` (falls back to trait const when `None`). + pub upgrade_propose_offset: Option, + + /// Override for `UpgradeConstants::decide_by_offset` (falls back to trait const when `None`). + pub upgrade_decide_by_offset: Option, + + /// Override for `UpgradeConstants::begin_offset` (falls back to trait const when `None`). + pub upgrade_begin_offset: Option, + + /// Override for `UpgradeConstants::finish_offset` (falls back to trait const when `None`). + pub upgrade_finish_offset: Option, + /// Lock for a decided upgrade pub upgrade_lock: UpgradeLock, @@ -320,18 +332,29 @@ impl UpgradeTaskState { ))? .as_secs(); + let propose_offset = self + .upgrade_propose_offset + .unwrap_or(TYPES::UPGRADE_CONSTANTS.propose_offset); + let begin_offset = self + .upgrade_begin_offset + .unwrap_or(TYPES::UPGRADE_CONSTANTS.begin_offset); + let finish_offset = self + .upgrade_finish_offset + .unwrap_or(TYPES::UPGRADE_CONSTANTS.finish_offset); + let decide_by_offset = self + .upgrade_decide_by_offset + .unwrap_or(TYPES::UPGRADE_CONSTANTS.decide_by_offset); + let leader = self .membership_coordinator .membership_for_epoch(self.cur_epoch) .await? 
- .leader(ViewNumber::new( - view + TYPES::UPGRADE_CONSTANTS.propose_offset, - )) + .leader(ViewNumber::new(view + propose_offset)) .await?; - let old_version_last_view = view + TYPES::UPGRADE_CONSTANTS.begin_offset; - let new_version_first_view = view + TYPES::UPGRADE_CONSTANTS.finish_offset; - let decide_by = view + TYPES::UPGRADE_CONSTANTS.decide_by_offset; + let old_version_last_view = view + begin_offset; + let new_version_first_view = view + finish_offset; + let decide_by = view + decide_by_offset; let epoch_upgrade_checks = if upgrade.target >= EPOCH_VERSION && upgrade.base < EPOCH_VERSION @@ -391,9 +414,7 @@ impl UpgradeTaskState { let upgrade_proposal = UpgradeProposal { upgrade_proposal: upgrade_proposal_data.clone(), - view_number: ViewNumber::new( - view + TYPES::UPGRADE_CONSTANTS.propose_offset, - ), + view_number: ViewNumber::new(view + propose_offset), }; let signature = TYPES::SignatureKey::sign( diff --git a/crates/hotshot/task-impls/src/vid.rs b/crates/hotshot/task-impls/src/vid.rs index 564a7442ab9..f5fd690d357 100644 --- a/crates/hotshot/task-impls/src/vid.rs +++ b/crates/hotshot/task-impls/src/vid.rs @@ -278,6 +278,9 @@ impl> TaskState for VidTaskState>, _receiver: &Receiver>, ) -> Result<()> { + if self.upgrade_lock.new_protocol_active(self.cur_view) { + return Ok(()); + } self.handle(event, sender.clone()).await; Ok(()) } diff --git a/crates/hotshot/task-impls/src/view_sync.rs b/crates/hotshot/task-impls/src/view_sync.rs index bc5b1cab2a9..42d15223fc1 100644 --- a/crates/hotshot/task-impls/src/view_sync.rs +++ b/crates/hotshot/task-impls/src/view_sync.rs @@ -129,6 +129,9 @@ impl TaskState for ViewSyncTaskState { sender: &Sender>, _receiver: &Receiver>, ) -> Result<()> { + if self.upgrade_lock.new_protocol_active(self.cur_view) { + return Ok(()); + } self.handle(event, sender.clone()).await } @@ -190,6 +193,9 @@ impl TaskState for ViewSyncReplicaTaskState { sender: &Sender>, _receiver: &Receiver>, ) -> Result<()> { + if self.upgrade_lock.new_protocol_active(self.cur_view) { + return Ok(()); + } self.handle(event, sender.clone()).await; Ok(()) diff --git a/crates/hotshot/testing/src/test_builder.rs b/crates/hotshot/testing/src/test_builder.rs index 4373b515f46..3307f895ba8 100644 --- a/crates/hotshot/testing/src/test_builder.rs +++ b/crates/hotshot/testing/src/test_builder.rs @@ -90,6 +90,10 @@ pub fn default_hotshot_config( stop_proposing_time: 0, start_voting_time: u64::MAX, stop_voting_time: 0, + upgrade_propose_offset: None, + upgrade_decide_by_offset: None, + upgrade_begin_offset: None, + upgrade_finish_offset: None, epoch_height, epoch_start_block, stake_table_capacity: hotshot_types::light_client::DEFAULT_STAKE_TABLE_CAPACITY, diff --git a/crates/hotshot/types/src/event.rs b/crates/hotshot/types/src/event.rs index e64e7c60656..63905e3f9bc 100644 --- a/crates/hotshot/types/src/event.rs +++ b/crates/hotshot/types/src/event.rs @@ -19,6 +19,7 @@ use crate::{ error::HotShotError, message::{Proposal, convert_proposal}, simple_certificate::{CertificatePair, LightClientStateUpdateCertificateV2, QuorumCertificate}, + simple_vote::TimeoutVote2, traits::{ValidatedState, node_implementation::NodeType}, vote::HasViewNumber, }; @@ -262,6 +263,17 @@ pub enum EventType { /// Serialized data of the message data: Vec, }, + + /// Emitted by the legacy consensus task whenever it signs and broadcasts a + /// `TimeoutVote2`. 
Used at the legacy → new-protocol upgrade boundary so + /// the espresso bridge can forward the same vote into the new-protocol + /// coordinator's vote collectors. The wire-level protocols differ but the + /// underlying `TimeoutVote2` type and its version-tagged signature + /// commitment are shared, so the same vote is valid in both systems. + LegacyTimeoutVoteEmitted { + /// The vote that was signed and broadcast on the legacy wire. + vote: TimeoutVote2<TYPES>, + }, } impl std::fmt::Display for EventType { @@ -309,6 +321,9 @@ impl std::fmt::Display for EventType { Self::ExternalMessageReceived { .. } => { write!(f, "ExternalMessageReceived") }, + Self::LegacyTimeoutVoteEmitted { vote } => { + write!(f, "LegacyTimeoutVoteEmitted: view={}", vote.view_number()) + }, } } } @@ -357,6 +372,15 @@ impl EventType { EventType::ExternalMessageReceived { sender, data } => { LegacyEventType::ExternalMessageReceived { sender, data } }, + // Upgrade-bridging event: it doesn't exist in the pre-epoch event + // surface and legacy consumers wouldn't know what to do with it, + // so conversion fails with an error rather than silently dropping + // the vote. + EventType::LegacyTimeoutVoteEmitted { .. } => { + anyhow::bail!( + "LegacyTimeoutVoteEmitted is upgrade-bridging only and has no legacy \ equivalent" + ) + }, }) } } diff --git a/crates/hotshot/types/src/hotshot_config_file.rs b/crates/hotshot/types/src/hotshot_config_file.rs index 9f0c8913031..5b673b39215 100644 --- a/crates/hotshot/types/src/hotshot_config_file.rs +++ b/crates/hotshot/types/src/hotshot_config_file.rs @@ -112,6 +112,10 @@ impl From> for HotShotConfig { stop_proposing_time: val.upgrade.stop_proposing_time, start_voting_time: val.upgrade.start_voting_time, stop_voting_time: val.upgrade.stop_voting_time, + upgrade_propose_offset: val.upgrade.propose_offset, + upgrade_decide_by_offset: val.upgrade.decide_by_offset, + upgrade_begin_offset: val.upgrade.begin_offset, + upgrade_finish_offset: val.upgrade.finish_offset, epoch_height: val.epoch_height, epoch_start_block: val.epoch_start_block, stake_table_capacity: val.stake_table_capacity, diff --git a/crates/hotshot/types/src/lib.rs b/crates/hotshot/types/src/lib.rs index 7d7fa6a980f..bc2decc3f9e 100644 --- a/crates/hotshot/types/src/lib.rs +++ b/crates/hotshot/types/src/lib.rs @@ -270,6 +270,19 @@ pub struct HotShotConfig { pub start_voting_time: u64, /// Unix time in seconds at which we stop voting on an upgrade. To prevent voting on an upgrade, set stop_voting_time <= start_voting_time. pub stop_voting_time: u64, + /// Override for `UpgradeConstants::propose_offset`. Falls back to `TYPES::UPGRADE_CONSTANTS.propose_offset` when `None`. + #[serde(default)] + pub upgrade_propose_offset: Option<u64>, + /// Override for `UpgradeConstants::decide_by_offset`. Falls back to `TYPES::UPGRADE_CONSTANTS.decide_by_offset` when `None`. + #[serde(default)] + pub upgrade_decide_by_offset: Option<u64>, + /// Override for `UpgradeConstants::begin_offset`. Falls back to `TYPES::UPGRADE_CONSTANTS.begin_offset` when `None`. + #[serde(default)] + pub upgrade_begin_offset: Option<u64>, + /// Override for `UpgradeConstants::finish_offset`. Falls back to `TYPES::UPGRADE_CONSTANTS.finish_offset` when `None`. + /// For a zero-blackout upgrade, set `upgrade_finish_offset == upgrade_begin_offset + 1`.
+ #[serde(default)] + pub upgrade_finish_offset: Option<u64>, /// Number of blocks in an epoch, zero means there are no epochs pub epoch_height: u64, /// Epoch start block diff --git a/crates/hotshot/types/src/message.rs b/crates/hotshot/types/src/message.rs index 76b3afeeaf1..f04c9855db1 100644 --- a/crates/hotshot/types/src/message.rs +++ b/crates/hotshot/types/src/message.rs @@ -20,7 +20,10 @@ use hotshot_utils::anytrace::*; use parking_lot::RwLock; use serde::{Deserialize, Serialize, de::DeserializeOwned}; use vbs::version::Version; -use versions::{DRB_AND_HEADER_UPGRADE_VERSION, EPOCH_VERSION, Upgrade, VID2_UPGRADE_VERSION}; +use versions::{ + DRB_AND_HEADER_UPGRADE_VERSION, EPOCH_VERSION, NEW_PROTOCOL_VERSION, Upgrade, + VID2_UPGRADE_VERSION, +}; /// The version we should expect for external messages pub const EXTERNAL_MESSAGE_VERSION: Version = Version { major: 0, minor: 0 }; @@ -763,6 +766,16 @@ impl UpgradeLock { self.version_infallible(view) >= VID2_UPGRADE_VERSION } + /// Return whether the new protocol (HotShot 0.8) is active for the given view. + /// + /// Once true for any view, all consensus messages tagged with versions strictly + /// less than `NEW_PROTOCOL_VERSION` should be ignored at the legacy task layer + /// for that view onward. The wire-level deserialize already rejects + /// version-mismatched messages via [`Self::version`]. + pub fn new_protocol_active(&self, view: ViewNumber) -> bool { + self.version_infallible(view) >= NEW_PROTOCOL_VERSION + } + /// Serialize a message with a version number, using `message.view_number()` /// and an optional decided upgrade certificate to determine the message's /// version. diff --git a/crates/hotshot/types/src/upgrade_config.rs b/crates/hotshot/types/src/upgrade_config.rs index 4edd3c820d8..8ba817d8845 100644 --- a/crates/hotshot/types/src/upgrade_config.rs +++ b/crates/hotshot/types/src/upgrade_config.rs @@ -39,6 +39,19 @@ pub struct UpgradeConfig { pub start_voting_time: u64, /// Unix time in seconds at which we stop voting on an upgrade. To prevent voting on an upgrade, set stop_voting_time <= start_voting_time. pub stop_voting_time: u64, + /// Override for `UpgradeConstants::propose_offset`. Falls back to `TYPES::UPGRADE_CONSTANTS.propose_offset` when `None`. + #[serde(default)] + pub propose_offset: Option<u64>, + /// Override for `UpgradeConstants::decide_by_offset`. Falls back to `TYPES::UPGRADE_CONSTANTS.decide_by_offset` when `None`. + #[serde(default)] + pub decide_by_offset: Option<u64>, + /// Override for `UpgradeConstants::begin_offset`. Falls back to `TYPES::UPGRADE_CONSTANTS.begin_offset` when `None`. + #[serde(default)] + pub begin_offset: Option<u64>, + /// Override for `UpgradeConstants::finish_offset`. Falls back to `TYPES::UPGRADE_CONSTANTS.finish_offset` when `None`. + /// For a zero-blackout upgrade (e.g. 0.4 → 0.8), set `finish_offset == begin_offset + 1`. + #[serde(default)] + pub finish_offset: Option<u64>, } // Explicitly implementing `Default` for clarity. @@ -54,6 +67,10 @@ impl Default for UpgradeConfig { stop_proposing_time: 0, start_voting_time: u64::MAX, stop_voting_time: 0, + propose_offset: None, + decide_by_offset: None, + begin_offset: None, + finish_offset: None, } } } diff --git a/doc/upgrades.md b/doc/upgrades.md index d6c728a60a9..5836f8557aa 100644 --- a/doc/upgrades.md +++ b/doc/upgrades.md @@ -74,11 +74,40 @@ Time based: - **start_proposing_time:** the earliest UNIX timestamp in which the node can propose an upgrade. - **stop_proposing_time:** UNIX timestamp after which the node stops proposing an upgrade.
+### Optional offset overrides + +The upgrade task uses four offsets (defined in `UpgradeConstants` at `crates/hotshot/types/src/constants.rs`) to derive +the precise views in the upgrade certificate: + +- `propose_offset`: the upgrade proposal targets view `current_view + propose_offset`. +- `decide_by_offset`: `decide_by = current_view + decide_by_offset`, the view by which the certificate must be decided. +- `begin_offset`: `old_version_last_view = current_view + begin_offset`. +- `finish_offset`: `new_version_first_view = current_view + finish_offset`. + +By default these come from `TYPES::UPGRADE_CONSTANTS`. Each can be overridden per-upgrade via the `[[upgrade]]` TOML +stanza: + +```toml +[[upgrade]] +version = "0.8" +start_proposing_view = 5 +stop_proposing_view = 15 +# Optional: zero-blackout cutover (no null-block window between versions) +begin_offset = 110 +finish_offset = 111 +``` + +Setting `finish_offset = begin_offset + 1` produces a "zero-blackout" upgrade where the first new-version view +immediately follows the last old-version view (no view satisfies +`view > old_version_last_view && view < new_version_first_view`, so `upgrading_in()` returns false everywhere and no +null-block window is produced). This is required for major protocol jumps (e.g. 0.4 → 0.8) where the new protocol +decides previously-undecided old-protocol leaves rather than waiting for null blocks during a transition window. +For example, if the upgrade task initiates the proposal at view 100 with the offsets above, the certificate carries +`old_version_last_view = 210` and `new_version_first_view = 211`: view 211 is the first view run under the new +version. + The window between `start_proposing_view/time` and `stop_proposing_view/time` should provide sufficient time for nodes to continue proposing the upgrade until successful. -Ensure that the `ESPRESSO_NODE_GENESIS_FILE` environment variable is defined to point to the path of the genesis -TOML file. For an example with upgrades enabled, refer to [`data/genesis/demo.toml`](../data/genesis/demo.toml). +Ensure that the `ESPRESSO_NODE_GENESIS_FILE` environment variable is defined to point to the path of the genesis TOML +file. For an example with upgrades enabled, refer to [`data/genesis/demo.toml`](../data/genesis/demo.toml). ### Example TOML Configuration diff --git a/hotshot-query-service/examples/simple-server.rs b/hotshot-query-service/examples/simple-server.rs index f45de879934..f71774b1105 100644 --- a/hotshot-query-service/examples/simple-server.rs +++ b/hotshot-query-service/examples/simple-server.rs @@ -214,6 +214,10 @@ async fn init_consensus( stop_proposing_time: 0, start_voting_time: 0, stop_voting_time: 0, + upgrade_propose_offset: None, + upgrade_decide_by_offset: None, + upgrade_begin_offset: None, + upgrade_finish_offset: None, epoch_height: 0, epoch_start_block: 0, stake_table_capacity: hotshot_types::light_client::DEFAULT_STAKE_TABLE_CAPACITY, diff --git a/hotshot-query-service/src/testing/consensus.rs b/hotshot-query-service/src/testing/consensus.rs index 1dd66c0660c..e7b1de99534 100644 --- a/hotshot-query-service/src/testing/consensus.rs +++ b/hotshot-query-service/src/testing/consensus.rs @@ -150,6 +150,10 @@ impl MockNetwork { stop_proposing_time: 0, start_voting_time: 0, stop_voting_time: 0, + upgrade_propose_offset: None, + upgrade_decide_by_offset: None, + upgrade_begin_offset: None, + upgrade_finish_offset: None, epoch_height: EPOCH_HEIGHT, epoch_start_block: 0, stake_table_capacity: hotshot_types::light_client::DEFAULT_STAKE_TABLE_CAPACITY,