diff --git a/rs/consensus/src/consensus.rs b/rs/consensus/src/consensus.rs index f1fe77621325..7563f0cee5b5 100644 --- a/rs/consensus/src/consensus.rs +++ b/rs/consensus/src/consensus.rs @@ -74,7 +74,7 @@ use strum_macros::AsRefStr; /// We will not notarize or validate artifacts with a height greater than the given /// value above the latest certification. During validation, the only exception to /// this are CUPs, which have no upper bound on the height to be validated. -pub(crate) const ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP: u64 = 70; +pub const ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP: u64 = 70; /// In order to have a bound on the advertised consensus pool, we place a limit on /// the gap between notarized height and the height of the next pending CUP. diff --git a/rs/consensus/src/consensus/catchup_package_maker.rs b/rs/consensus/src/consensus/catchup_package_maker.rs index cfdbb4b3f1ad..03b24104a61d 100644 --- a/rs/consensus/src/consensus/catchup_package_maker.rs +++ b/rs/consensus/src/consensus/catchup_package_maker.rs @@ -13,6 +13,7 @@ //! At the moment, we will start to make a CatchUpPackage once a DKG summary //! block is considered finalized. +use crate::consensus::status; use ic_consensus_utils::{ active_high_threshold_nidkg_id, crypto::ConsensusCrypto, get_oldest_state_registry_version, membership::Membership, pool_reader::PoolReader, @@ -172,10 +173,35 @@ impl CatchUpPackageMaker { // Skip if random beacon does not exist for the height let random_beacon = pool.get_random_beacon(height)?; + let halting = || { + status::should_halt( + height, + self.membership.registry_client.as_ref(), + self.membership.subnet_id, + pool, + &self.log, + ) == Some(true) + }; // Skip if the state referenced by finalization tip has not caught up to // this height. This is to increase the chance that states are available to // validate payloads at the chain tip. - if pool.get_finalized_tip().context.certified_height < height { + // We make an exception if we are halting at this height, which was introduced after the + // incident on subnet `3hhby` on 2026-05-22. + // Checkpointing was slow at an upgrade boundary, and consensus continued creating blocks + // until reaching `ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP`, each with a validation + // context's certified height equal to the upgrade height minus 1. When checkpointing + // finally finished, a new certified height was available, but since the block maker is + // always one height ahead of the notary, we had already created a block, also with a + // certified height equal to the upgrade height minus 1. The notary would notarize it but + // reach the bound again. Since the CUP maker (here) waits for the finalized tip's + // validation context's certified height to reach the upgrade height, no CUP was ever + // created, and the subnet stalled. + // By allowing the CUP maker to make a CUP share even when the finalized tip's validation + // context has not caught up to the CUP height, we can ensure that a CUP will be created. + // It is not a problem for the finalized tip's validation context to be behind the CUP + // height, because when we are halting, all blocks have empty payloads anyways, and thus do + // not even need to access states at the validation context's certified height. + if pool.get_finalized_tip().context.certified_height < height && !halting() { return None; } @@ -267,6 +293,8 @@ mod tests { dependencies_with_subnet_records_with_raw_state_manager, }; use ic_logger::replica_logger::no_op_logger; + use ic_protobuf::registry::subnet::v1::SubnetRecord; + use ic_registry_client_helpers::subnet::SubnetRegistry; use ic_replicated_state::metadata_state::subnet_call_context_manager::{ SetupInitialDkgContext, SignWithThresholdContext, }; @@ -283,88 +311,232 @@ mod tests { use ic_test_utilities_types::ids::{node_test_id, subnet_test_id}; use ic_types::{ CryptoHashOfState, Height, RegistryVersion, - consensus::{BlockPayload, Payload, SummaryPayload, idkg::PreSigId}, + consensus::{BlockPayload, BlockProposal, Payload, SummaryPayload, idkg::PreSigId}, crypto::CryptoHash, }; use rstest::rstest; use std::sync::{Arc, RwLock}; - #[test] - fn test_catch_up_package_maker() { + fn assert_cup_share_matches_block_and_state( + share: &CatchUpPackageShare, + proposal: &BlockProposal, + state_manager: &dyn StateManager, + oldest_registry_version_in_use_by_replicated_state: Option, + ) { + assert_eq!(&share.content.block, proposal.content.get_hash()); + assert_eq!( + share.content.state_hash, + state_manager.get_state_hash_at(proposal.height()).unwrap() + ); + assert_eq!( + share + .content + .oldest_registry_version_in_use_by_replicated_state, + oldest_registry_version_in_use_by_replicated_state + ); + } + + fn with_cup_maker_setup(run: impl FnOnce(CatchUpPackageMaker, u64, Dependencies) -> T) -> T { ic_test_utilities::artifact_pool_config::with_test_pool_config(|pool_config| { - let interval_length = 5; + let dkg_interval_length = 5; let committee: Vec<_> = (0..4).map(node_test_id).collect(); - let Dependencies { - mut pool, - membership, - replica_config, - crypto, - state_manager, - .. - } = dependencies_with_subnet_params( + let mut deps = dependencies_with_subnet_params( pool_config, subnet_test_id(0), vec![( 1, SubnetRecordBuilder::from(&committee) - .with_dkg_interval_length(interval_length) + .with_dkg_interval_length(dkg_interval_length) .build(), )], ); - state_manager + // Ignore state sync and state divergence + deps.state_manager + .get_mut() + .expect_fetch_state() + .return_const(()); + deps.state_manager + .get_mut() + .expect_report_diverged_checkpoint() + .return_const(()); + + deps.state_manager .get_mut() .expect_get_state_hash_at() .return_const(Ok(CryptoHashOfState::from(CryptoHash(vec![1, 2, 3])))); - let message_routing = FakeMessageRouting::new(); - *message_routing.next_batch_height.write().unwrap() = Height::from(2); - let message_routing = Arc::new(message_routing); + let message_routing = Arc::new(FakeMessageRouting::new()); let cup_maker = CatchUpPackageMaker::new( - replica_config, - membership, - crypto, - state_manager.clone(), + deps.replica_config.clone(), + deps.membership.clone(), + deps.crypto.clone(), + deps.state_manager.clone(), message_routing, no_op_logger(), ); - // 1. Genesis CUP already exists, we won't make a new one - assert!(cup_maker.on_state_change(&PoolReader::new(&pool)).is_none()); - + // Genesis CUP already exists, we won't make a new one + assert!( + cup_maker + .on_state_change(&PoolReader::new(&deps.pool)) + .is_none() + ); // Skip the first DKG interval - pool.advance_round_normal_operation_n(interval_length); + deps.pool + .advance_round_normal_operation_n(dkg_interval_length); - let mut proposal = pool.make_next_block(); - let block = proposal.content.as_mut(); - block.context.certified_height = block.height(); - proposal.content = HashedBlock::new(ic_types::crypto::crypto_hash, block.clone()); - pool.insert_validated(proposal.clone()); - pool.notarize(&proposal); - pool.finalize(&proposal); + run(cup_maker, dkg_interval_length, deps) + }) + } - // 4. Beacon does not exist, we can't make a new CUP share - assert!(cup_maker.on_state_change(&PoolReader::new(&pool)).is_none()); + #[test] + fn test_catch_up_package_maker_waits_for_beacon() { + with_cup_maker_setup( + |cup_maker, + _, + Dependencies { + mut pool, + state_manager, + .. + }| { + let mut proposal = pool.make_next_block(); + let block = proposal.content.as_mut(); + block.context.certified_height = block.height(); + proposal.content = HashedBlock::new(ic_types::crypto::crypto_hash, block.clone()); + pool.insert_validated(proposal.clone()); + pool.notarize(&proposal); + pool.finalize(&proposal); + + // Beacon does not exist, we can't make a new CUP share + assert!(cup_maker.on_state_change(&PoolReader::new(&pool)).is_none()); + + // Beacon now exists, we can make a new CUP share + pool.insert_validated(pool.make_next_beacon()); + let share = cup_maker + .on_state_change(&PoolReader::new(&pool)) + .expect("Expecting CatchUpPackageShare"); + + assert_cup_share_matches_block_and_state( + &share, + &proposal, + state_manager.as_ref(), + None, + ); + }, + ) + } - // 5. Beacon now exists, we can make a new CUP share - pool.insert_validated(pool.make_next_beacon()); - let share = cup_maker - .on_state_change(&PoolReader::new(&pool)) - .expect("Expecting CatchUpPackageShare"); + #[test] + fn test_catch_up_package_maker_waits_for_finalized_tip_certified_height_to_reach_cup_height() { + with_cup_maker_setup( + |cup_maker, + _, + Dependencies { + mut pool, + state_manager, + .. + }| { + let mut summary_proposal = pool.make_next_block(); + let summary_block = summary_proposal.content.as_mut(); + let summary_height = summary_block.height(); + summary_block.context.certified_height = summary_height - 1.into(); + summary_proposal.content = + HashedBlock::new(ic_types::crypto::crypto_hash, summary_block.clone()); + pool.advance_round_with_block(&summary_proposal); + + // Finalized tip's certified height is behind the CUP height, we can't make a new + // CUP share + assert!(cup_maker.on_state_change(&PoolReader::new(&pool)).is_none()); + + let mut proposal = pool.make_next_block(); + let block = proposal.content.as_mut(); + block.context.certified_height = summary_height; + proposal.content = HashedBlock::new(ic_types::crypto::crypto_hash, block.clone()); + pool.advance_round_with_block(&proposal); + + // Finalized tip's certified height has caught up to the CUP height, we can make a + // new CUP share + let share = cup_maker + .on_state_change(&PoolReader::new(&pool)) + .expect("Expecting CatchUpPackageShare"); + + assert_cup_share_matches_block_and_state( + &share, + &summary_proposal, + state_manager.as_ref(), + None, + ); + }, + ) + } - assert_eq!(&share.content.block, proposal.content.get_hash()); - assert_eq!( - share.content.state_hash, - state_manager.get_state_hash_at(Height::from(0)).unwrap() - ); - assert_eq!( - share - .content - .oldest_registry_version_in_use_by_replicated_state, - None - ); - }) + #[test] + fn test_catch_up_package_maker_does_not_wait_for_finalized_tip_when_halting() { + with_cup_maker_setup( + |cup_maker, + dkg_interval_length, + Dependencies { + mut pool, + state_manager, + registry, + registry_data_provider, + .. + }| { + let existing_subnet_record = registry + .get_subnet_record(subnet_test_id(0), registry_data_provider.latest_version()) + .unwrap() + .unwrap(); + let upgrade_registry_version = RegistryVersion::from(10); + registry_data_provider + .add( + &ic_registry_keys::make_subnet_record_key(subnet_test_id(0)), + upgrade_registry_version, + Some(SubnetRecord { + replica_version_id: "upgrade_version".to_string(), + ..existing_subnet_record + }), + ) + .unwrap(); + registry.update_to_latest_version(); + + let mut upgrade_proposal = pool.make_next_block(); + let upgrade_block = upgrade_proposal.content.as_mut(); + let mut upgrade_summary = upgrade_block.payload.as_ref().as_summary().clone(); + // Manually modify the summary's registry version to trigger the update + upgrade_summary.dkg.registry_version = upgrade_registry_version; + upgrade_block.payload = Payload::new( + ic_types::crypto::crypto_hash, + BlockPayload::Summary(upgrade_summary), + ); + upgrade_proposal.content = + HashedBlock::new(ic_types::crypto::crypto_hash, upgrade_block.clone()); + pool.advance_round_with_block(&upgrade_proposal); + pool.insert_validated(pool.make_catch_up_package(upgrade_proposal.height())); + + pool.advance_round_normal_operation_n(dkg_interval_length); + + let mut proposal = pool.make_next_block(); + let block = proposal.content.as_mut(); + block.context.certified_height = block.height() - 1.into(); + proposal.content = HashedBlock::new(ic_types::crypto::crypto_hash, block.clone()); + pool.advance_round_with_block(&proposal); + + // Even if finalized tip's certified height is behind the CUP height, we are halting + // and thus can make a new CUP share + let share = cup_maker + .on_state_change(&PoolReader::new(&pool)) + .expect("Expecting CatchUpPackageShare"); + + assert_cup_share_matches_block_and_state( + &share, + &proposal, + state_manager.as_ref(), + None, + ); + }, + ) } /// Build a vector of signature contexts where the oldest matched diff --git a/rs/consensus/tests/framework/mod.rs b/rs/consensus/tests/framework/mod.rs index 1f7c6ac9dd86..70d9c3ddad21 100644 --- a/rs/consensus/tests/framework/mod.rs +++ b/rs/consensus/tests/framework/mod.rs @@ -48,14 +48,17 @@ use std::{ /// with required records, including subnet record, node record, node public keys, /// catch-up package (with proper NiDKG transcripts). /// -/// Return the registry client, catch-up package, and a list of crypto components, one -/// for each node. +/// Return the registry data provider, registry client, catch-up package, and a list of crypto +/// components, one for each node. +#[allow(clippy::type_complexity)] pub fn setup_subnet( subnet_id: SubnetId, node_ids: &[NodeId], + dkg_interval_length: u64, rng: &mut R, ) -> ( - Arc, + Arc, + Arc, CatchUpPackage, Vec>>, ) { @@ -65,7 +68,7 @@ pub fn setup_subnet( let registry_client = Arc::new(FakeRegistryClient::new(Arc::clone(&data_provider) as Arc<_>)); let subnet_record = SubnetRecordBuilder::from(node_ids) - .with_dkg_interval_length(19) + .with_dkg_interval_length(dkg_interval_length) .with_chain_key_config(ChainKeyConfig { key_configs: test_master_public_key_ids() .iter() @@ -200,7 +203,6 @@ pub fn setup_subnet( .expect("Could not add chain-key enabled subnet list"); } registry_client.reload(); - registry_client.update_to_latest_version(); let cup_contents = registry_client .get_cup_contents(subnet_id, registry_client.get_latest_version()) @@ -215,7 +217,7 @@ pub fn setup_subnet( .with_current_transcripts(ni_transcripts); let cup = make_genesis(summary); - (registry_client, cup, cryptos) + (data_provider, registry_client, cup, cryptos) } pub(crate) fn test_master_public_key_ids() -> Vec { diff --git a/rs/consensus/tests/framework/runner.rs b/rs/consensus/tests/framework/runner.rs index b6b38687aa44..759ed2c3f9a8 100644 --- a/rs/consensus/tests/framework/runner.rs +++ b/rs/consensus/tests/framework/runner.rs @@ -251,7 +251,7 @@ impl<'a> ConsensusRunner<'a> { /// Run a single step of all instances to finish processing their messages. /// Return the updated NetworkStatus. - fn process(&self) -> NetworkStatus { + fn process(&mut self) -> NetworkStatus { let delivered = self.config.delivery.deliver_next(self); let mut idle_since = self.idle_since.borrow_mut(); @@ -274,7 +274,6 @@ impl<'a> ConsensusRunner<'a> { // only stop when all instances satisfy StopPredicate if !(self.stop_predicate)(instance) { stopped = false; - break; } } if stopped { @@ -304,6 +303,7 @@ impl Default for ConsensusRunnerConfig { stall_clocks: false, execution: GlobalMessage::new(false), delivery: Sequential::new(), + dkg_interval_length: 19, } } } diff --git a/rs/consensus/tests/framework/types.rs b/rs/consensus/tests/framework/types.rs index 0afc6df833dd..fbf17ddebe68 100644 --- a/rs/consensus/tests/framework/types.rs +++ b/rs/consensus/tests/framework/types.rs @@ -274,7 +274,7 @@ impl fmt::Display for ConsensusInstance<'_> { /// This is the type of predicates used by the ConsensusRunner to determine /// whether or not it should terminate. It is evaluated for all consensus /// instances at every time step. -pub type StopPredicate = Box) -> bool>; +pub type StopPredicate = Box) -> bool>; pub(crate) struct BouncerState { bouncer: Bouncer, @@ -412,6 +412,7 @@ pub struct ConsensusRunnerConfig { pub stall_clocks: bool, pub execution: Box, pub delivery: Box, + pub dkg_interval_length: u64, } impl fmt::Display for ConsensusRunnerConfig { @@ -419,7 +420,8 @@ impl fmt::Display for ConsensusRunnerConfig { write!( f, "ConsensusRunnerConfig {{ max_delta: {}, random_seed: {}, \ - num_nodes: {}, num_rounds: {}, degree: {}, use_priority_fn: {}, execution: {}, delivery: {} }}", + num_nodes: {}, num_rounds: {}, degree: {}, use_priority_fn: {}, \ + execution: {}, delivery: {}, dkg_interval_length: {} }}", self.max_delta, self.random_seed, self.num_nodes, @@ -427,7 +429,8 @@ impl fmt::Display for ConsensusRunnerConfig { self.degree, self.use_priority_fn, get_name(&self.execution), - get_name(&self.delivery) + get_name(&self.delivery), + self.dkg_interval_length, ) } } diff --git a/rs/consensus/tests/integration.rs b/rs/consensus/tests/integration.rs index 1b340249df80..730a3585ed61 100644 --- a/rs/consensus/tests/integration.rs +++ b/rs/consensus/tests/integration.rs @@ -6,18 +6,23 @@ use crate::framework::{ ConsensusRunnerConfig, StopPredicate, malicious, setup_subnet, }; use framework::test_master_public_key_ids; +use ic_consensus::consensus::ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP; use ic_consensus_utils::pool_reader::PoolReader; use ic_interfaces::{consensus_pool::ConsensusPool, messaging::MessageRouting}; use ic_interfaces_registry::RegistryClient; +use ic_protobuf::registry::subnet::v1::SubnetRecord; +use ic_registry_client_fake::FakeRegistryClient; +use ic_registry_client_helpers::subnet::SubnetRegistry; +use ic_registry_proto_data_provider::ProtoRegistryDataProvider; use ic_test_utilities_time::FastForwardTimeSource; use ic_test_utilities_types::ids::{node_test_id, subnet_test_id}; use ic_types::{ - Height, batch::BatchContent, crypto::CryptoHash, malicious_flags::MaliciousFlags, - replica_config::ReplicaConfig, + Height, RegistryVersion, batch::BatchContent, crypto::CryptoHash, + malicious_flags::MaliciousFlags, replica_config::ReplicaConfig, }; use rand::Rng; use rand_chacha::{ChaChaRng, rand_core::SeedableRng}; -use std::{cell::RefCell, rc::Rc, sync::Arc}; +use std::{cell::RefCell, cmp::Ordering, rc::Rc, sync::Arc}; #[test] fn multiple_nodes_are_live() -> Result<(), String> { @@ -25,7 +30,7 @@ fn multiple_nodes_are_live() -> Result<(), String> { ConsensusRunnerConfig::new_from_env(4, 0) .and_then(|config| config.parse_extra_config()) .map(|config| { - run_n_rounds_and_collect_hashes(config, Vec::new(), true); + TestRunner::new(config, true).run_n_rounds_and_collect_hashes(); }) } @@ -36,7 +41,7 @@ fn single_node_is_live() { num_rounds: 126, ..Default::default() }; - run_n_rounds_and_collect_hashes(config, Vec::new(), true); + TestRunner::new(config, true).run_n_rounds_and_collect_hashes(); } #[test] @@ -48,7 +53,7 @@ fn master_pubkeys_are_produced() -> Result<(), String> { if config.num_rounds < 60 { config.num_rounds = 60; } - assert!(run_n_rounds_and_check_pubkeys(config, Vec::new(), true)); + assert!(TestRunner::new(config, true).run_n_rounds_and_check_pubkeys()); }) } @@ -61,7 +66,7 @@ fn multiple_nodes_are_deterministic() { num_rounds: 10, ..Default::default() }; - run_n_rounds_and_collect_hashes(config, Vec::new(), true) + TestRunner::new(config, true).run_n_rounds_and_collect_hashes() }; assert_eq!(run(), run()); } @@ -78,7 +83,9 @@ fn minority_invalid_notary_share_signature_would_pass() -> Result<(), String> { for _ in 0..rng.gen_range(1..=f) { malicious.push(malicious::invalid_notary_share_signature()) } - run_n_rounds_and_collect_hashes(config, malicious, true); + TestRunner::new(config, true) + .with_modifiers(malicious) + .run_n_rounds_and_collect_hashes(); }) } @@ -91,7 +98,9 @@ fn majority_invalid_notary_share_signature_would_stuck() -> Result<(), String> { for _ in 0..(config.num_nodes / 3 + 1) { malicious.push(malicious::invalid_notary_share_signature()) } - run_n_rounds_and_collect_hashes(config, malicious, false); + TestRunner::new(config, false) + .with_modifiers(malicious) + .run_n_rounds_and_collect_hashes(); }) } @@ -107,7 +116,9 @@ fn minority_absent_notary_share_would_pass() -> Result<(), String> { for _ in 0..rng.gen_range(1..=f) { malicious.push(malicious::absent_notary_share()); } - run_n_rounds_and_collect_hashes(config, malicious, true); + TestRunner::new(config, true) + .with_modifiers(malicious) + .run_n_rounds_and_collect_hashes(); }) } @@ -120,7 +131,9 @@ fn majority_absent_notary_share_signature_would_stuck() -> Result<(), String> { for _ in 0..(config.num_nodes / 3 + 1) { malicious.push(malicious::absent_notary_share()); } - run_n_rounds_and_collect_hashes(config, malicious, false); + TestRunner::new(config, false) + .with_modifiers(malicious) + .run_n_rounds_and_collect_hashes(); }) } @@ -140,7 +153,9 @@ fn minority_maliciouly_notarize_all_would_pass() -> Result<(), String> { }; malicious.push(malicious::with_malicious_flags(malicious_flags)); } - run_n_rounds_and_collect_hashes(config, malicious, true); + TestRunner::new(config, true) + .with_modifiers(malicious) + .run_n_rounds_and_collect_hashes(); }) } @@ -160,7 +175,9 @@ fn minority_maliciouly_finalize_all_would_pass() -> Result<(), String> { }; malicious.push(malicious::with_malicious_flags(malicious_flags)); } - run_n_rounds_and_collect_hashes(config, malicious, true); + TestRunner::new(config, true) + .with_modifiers(malicious) + .run_n_rounds_and_collect_hashes(); }) } @@ -187,7 +204,9 @@ fn majority_maliciouly_finalize_all_would_diverge() -> Result<(), String> { }; malicious.push(malicious::with_malicious_flags(malicious_flags)); } - run_n_rounds_and_collect_hashes(config, malicious, false); + TestRunner::new(config, false) + .with_modifiers(malicious) + .run_n_rounds_and_collect_hashes(); }) } @@ -211,7 +230,11 @@ fn minority_maliciouly_idkg_dealers_would_pass() -> Result<(), String> { }; malicious.push(malicious::with_malicious_flags(malicious_flags)); } - assert!(run_n_rounds_and_check_pubkeys(config, malicious, true)) + assert!( + TestRunner::new(config, true) + .with_modifiers(malicious) + .run_n_rounds_and_check_pubkeys() + ) }) } @@ -227,124 +250,166 @@ fn stalled_clocks_with_f_malicious_would_pass() -> Result<(), String> { for _ in 0..f { malicious.push(malicious::absent_notary_share()) } - run_n_rounds_and_collect_hashes(config, malicious, true); + TestRunner::new(config, true) + .with_modifiers(malicious) + .run_n_rounds_and_collect_hashes(); }) } -fn run_test( +// Helper type for additional mutations to the registry that a test may want to perform after the +// initial setup of the subnet. +type RegistryMutations = Box; +struct TestRunner { config: ConsensusRunnerConfig, - mut modifiers: Vec, - stop_predicate: StopPredicate, finish: bool, -) { - let rng = &mut ChaChaRng::seed_from_u64(config.random_seed); - let nodes = config.num_nodes; - ic_test_utilities::artifact_pool_config::with_test_pool_configs(nodes, move |pool_configs| { - let time_source = FastForwardTimeSource::new(); - let subnet_id = subnet_test_id(0); - let replica_configs: Vec<_> = vec![(); nodes] - .iter() - .enumerate() - .map(|(index, _)| ReplicaConfig { - node_id: node_test_id(index as u64), - subnet_id, - }) - .collect(); - let node_ids: Vec<_> = replica_configs - .iter() - .map(|config| config.node_id) - .collect(); - let (registry_client, cup, cryptos) = setup_subnet(subnet_id, &node_ids, rng); - let inst_deps: Vec<_> = replica_configs - .iter() - .zip(pool_configs.iter()) - .map(|(replica_config, pool_config)| { - ConsensusDependencies::new( - replica_config.clone(), - pool_config.clone(), - Arc::clone(®istry_client) as Arc, - cup.clone(), - time_source.clone(), - ) - }) - .collect(); - - let mut runner = ConsensusRunner::new_with_config(config, time_source); - - for ((pool_config, deps), crypto) in pool_configs - .iter() - .zip(inst_deps.iter()) - .zip(cryptos.iter()) - { - let modifier = modifiers.pop(); - runner.add_instance( - deps.consensus_pool.read().unwrap().get_cache(), - crypto.clone(), - crypto.clone(), - modifier, - deps, - pool_config.clone(), - &PoolReader::new(&*deps.consensus_pool.read().unwrap()), - ); - } - assert_eq!(runner.run_until(stop_predicate), finish); - }) + modifiers: Vec, + stop_predicate: Option, + additional_registry_mutations: Option, } -fn run_n_rounds_and_collect_hashes( - config: ConsensusRunnerConfig, - modifiers: Vec, - finish: bool, -) -> Vec { - let rounds = config.num_rounds; - let hashes = Rc::new(RefCell::new(Vec::new())); - let hashes_clone = hashes.clone(); - let reach_n_rounds = move |inst: &ConsensusInstance<'_>| { - let pool = inst.driver.consensus_pool.write().unwrap(); - for nota in pool.validated().notarization().get_highest_iter() { - let hash = ic_types::crypto::crypto_hash(¬a); - let hash = hash.get_ref(); - if !hashes_clone.borrow().contains(hash) { - hashes_clone.borrow_mut().push(hash.clone()); - } +impl TestRunner { + fn new(config: ConsensusRunnerConfig, finish: bool) -> Self { + Self { + config, + finish, + modifiers: vec![], + stop_predicate: None, + additional_registry_mutations: None, } - inst.deps.message_routing.expected_batch_height() >= Height::from(rounds) - }; - run_test(config, modifiers, Box::new(reach_n_rounds), finish); - hashes.as_ref().take() -} + } -fn run_n_rounds_and_check_pubkeys( - config: ConsensusRunnerConfig, - modifiers: Vec, - finish: bool, -) -> bool { - let rounds = config.num_rounds; - let pubkey_exists = Rc::new(RefCell::new(false)); - let pubkey_exists_clone = pubkey_exists.clone(); - let got_pubkey = move |inst: &ConsensusInstance<'_>| { - let batches = inst.deps.message_routing.as_ref().batches.read().unwrap(); - let Some(batch) = batches.last() else { - return false; + fn with_modifiers(mut self, modifiers: Vec) -> Self { + self.modifiers = modifiers; + self + } + + fn with_stop_predicate(mut self, stop_predicate: StopPredicate) -> Self { + self.stop_predicate = Some(stop_predicate); + self + } + + fn with_additional_registry_mutations( + mut self, + additional_registry_mutations: RegistryMutations, + ) -> Self { + self.additional_registry_mutations = Some(additional_registry_mutations); + self + } + + fn run_test(mut self) { + let stop_predicate = self + .stop_predicate + .expect("Stop predicate must be set before running the test"); + + let rng = &mut ChaChaRng::seed_from_u64(self.config.random_seed); + let nodes = self.config.num_nodes; + ic_test_utilities::artifact_pool_config::with_test_pool_configs( + nodes, + move |pool_configs| { + let time_source = FastForwardTimeSource::new(); + let subnet_id = subnet_test_id(0); + let replica_configs: Vec<_> = vec![(); nodes] + .iter() + .enumerate() + .map(|(index, _)| ReplicaConfig { + node_id: node_test_id(index as u64), + subnet_id, + }) + .collect(); + let node_ids: Vec<_> = replica_configs + .iter() + .map(|config| config.node_id) + .collect(); + let (data_provider, registry_client, cup, cryptos) = + setup_subnet(subnet_id, &node_ids, self.config.dkg_interval_length, rng); + if let Some(additional_registry_mutations) = self.additional_registry_mutations { + additional_registry_mutations(&data_provider, ®istry_client); + } + let inst_deps: Vec<_> = replica_configs + .iter() + .zip(pool_configs.iter()) + .map(|(replica_config, pool_config)| { + ConsensusDependencies::new( + replica_config.clone(), + pool_config.clone(), + Arc::clone(®istry_client) as Arc, + cup.clone(), + time_source.clone(), + ) + }) + .collect(); + + let mut runner = ConsensusRunner::new_with_config(self.config, time_source); + + for ((pool_config, deps), crypto) in pool_configs + .iter() + .zip(inst_deps.iter()) + .zip(cryptos.iter()) + { + let modifier = self.modifiers.pop(); + runner.add_instance( + deps.consensus_pool.read().unwrap().get_cache(), + crypto.clone(), + crypto.clone(), + modifier, + deps, + pool_config.clone(), + &PoolReader::new(&*deps.consensus_pool.read().unwrap()), + ); + } + assert_eq!(runner.run_until(stop_predicate), self.finish); + }, + ) + } + + fn run_n_rounds_and_collect_hashes(self) -> Vec { + let rounds = self.config.num_rounds; + let hashes = Rc::new(RefCell::new(Vec::new())); + let hashes_clone = hashes.clone(); + let reach_n_rounds = move |inst: &ConsensusInstance<'_>| { + let pool = inst.driver.consensus_pool.write().unwrap(); + for nota in pool.validated().notarization().get_highest_iter() { + let hash = ic_types::crypto::crypto_hash(¬a); + let hash = hash.get_ref(); + if !hashes_clone.borrow().contains(hash) { + hashes_clone.borrow_mut().push(hash.clone()); + } + } + inst.deps.message_routing.expected_batch_height() >= Height::from(rounds) }; + self.with_stop_predicate(Box::new(reach_n_rounds)) + .run_test(); + hashes.as_ref().take() + } + + fn run_n_rounds_and_check_pubkeys(self) -> bool { + let rounds = self.config.num_rounds; + let pubkey_exists = Rc::new(RefCell::new(false)); + let pubkey_exists_clone = pubkey_exists.clone(); + let got_pubkey = move |inst: &ConsensusInstance<'_>| { + let batches = inst.deps.message_routing.as_ref().batches.read().unwrap(); + let Some(batch) = batches.last() else { + return false; + }; - let mut found_keys = 0; - for key_id in test_master_public_key_ids() { - if let BatchContent::Data { chain_key_data, .. } = &batch.content - && chain_key_data.master_public_keys.contains_key(&key_id) - { - found_keys += 1 + let mut found_keys = 0; + for key_id in test_master_public_key_ids() { + if let BatchContent::Data { chain_key_data, .. } = &batch.content + && chain_key_data.master_public_keys.contains_key(&key_id) + { + found_keys += 1 + } } - } - if found_keys == test_master_public_key_ids().len() { - *pubkey_exists_clone.borrow_mut() = true; - } - *pubkey_exists_clone.borrow() - || inst.deps.message_routing.expected_batch_height() >= Height::from(rounds) - }; - run_test(config, modifiers, Box::new(got_pubkey), finish); + if found_keys == test_master_public_key_ids().len() { + *pubkey_exists_clone.borrow_mut() = true; + } + *pubkey_exists_clone.borrow() + || inst.deps.message_routing.expected_batch_height() >= Height::from(rounds) + }; + self.with_stop_predicate(Box::new(got_pubkey)).run_test(); - *pubkey_exists.borrow() + *pubkey_exists.borrow() + } } /// Run a test subnets with `num_nodes` many nodes, out of which there are `num_nodes_equivocating` many equivocating blockmaker @@ -364,7 +429,9 @@ fn equivocating_block_maker_test( for _ in 0..num_nodes_equivocating { malicious.push(malicious::with_malicious_flags(malicious_flags.clone())); } - run_n_rounds_and_collect_hashes(config, malicious, finish); + TestRunner::new(config, finish) + .with_modifiers(malicious) + .run_n_rounds_and_collect_hashes(); }) } @@ -380,3 +447,145 @@ fn one_node_equivocating_passes() -> Result<(), String> { fn all_nodes_equivocating_fail() -> Result<(), String> { equivocating_block_maker_test(4, 4, false) } + +/// Regression test for ICSUP-XXX stalling subnet `3hhby` on 2026-05-22. +/// Tests that if checkpointing is slow at an upgrade boundary, i.e. consensus reaches hard bound +/// `ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP` before the upgrade height is certified, then +/// consensus still creates a CUP. +/// This used not to be the case because CUP shares were only created when the finalized tip's +/// certified height reached the upgrade height, which would never happen because consensus had +/// reached the hard bound. +/// This was fixed by ignoring this condition when the subnet is halting. +/// +/// Steps of the test: +/// 1. Certified height is frozen at the upgrade height minus 1 (simulating a slow checkpoint). +/// 2. Consensus advances with empty blocks until the bound +/// `ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP` is reached, then stops creating more blocks. +/// 3. The certified-height override is released; consensus resumes and a CUP should be created at +/// the upgrade height, even though there exists no finalized block whose certified height +/// reached the upgrade height. +#[test] +fn slow_checkpointing_at_upgrade_boundary() { + const DKG_INTERVAL_LENGTH: u64 = 74; // On purpose larger than `ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP` + // We need to execute one first interval to trigger the upgrade at the end of the second. + let upgrade_height = Height::from(2 * (DKG_INTERVAL_LENGTH + 1)); + + let config = ConsensusRunnerConfig { + num_nodes: 4, + dkg_interval_length: DKG_INTERVAL_LENGTH, + ..Default::default() + }; + + // Make the subnet upgrade at `upgrade_height` + let additional_registry_mutations = + |data_provider: &ProtoRegistryDataProvider, registry_client: &FakeRegistryClient| { + let subnet_record = registry_client + .get_subnet_record(subnet_test_id(0), RegistryVersion::from(1)) + .unwrap() + .unwrap(); + data_provider + .add( + &ic_registry_keys::make_subnet_record_key(subnet_test_id(0)), + RegistryVersion::from(2), + Some(SubnetRecord { + replica_version_id: "upgrade_version".to_string(), + ..subnet_record + }), + ) + .unwrap(); + registry_client.reload(); + }; + + let frozen_state_height = upgrade_height - 1.into(); + let mut is_checkpointing = true; + let stop = move |inst: &ConsensusInstance<'_>| { + let pool = inst.driver.consensus_pool.read().unwrap(); + let reader = PoolReader::new(&*pool); + let finalized_height = reader.get_finalized_height(); + + // As long as we are checkpointing, we should not have a CUP at the upgrade height yet. + if is_checkpointing { + let cup_height = reader.get_catch_up_height(); + assert_ne!( + cup_height, upgrade_height, + "Should not have created a CUP at the upgrade height {} before finishing checkpointing", + upgrade_height, + ); + } + + let stall_height = frozen_state_height.get() + ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP; + match Ord::cmp(&finalized_height.get(), &stall_height) { + Ordering::Less => { + // Freeze the certified height at `frozen_state_height` on all nodes to simulate a + // slow checkpoint at the upgrade boundary, so that consensus reaches the hard + // bound `ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP` before the upgrade height is + // certified. + *inst + .deps + .state_manager + .override_max_state_height + .write() + .unwrap() = Some(frozen_state_height); + } + Ordering::Equal => { + // At the stall height, every finalized block should still carry + // `certified_height == frozen_state_height` (the cap is still active). + let finalized_certified_height = reader + .get_finalized_block(finalized_height) + .unwrap() + .context + .certified_height; + assert_eq!( + finalized_certified_height, frozen_state_height, + "finalized block at height {} should have certified_height == {}, but got {}", + finalized_height, frozen_state_height, finalized_certified_height + ); + + // Now, simulate that checkpointing has finished by releasing the override + *inst + .deps + .state_manager + .override_max_state_height + .write() + .unwrap() = None; + is_checkpointing = false; + } + Ordering::Greater => { + // This should happen only after we have released the override. In this case, we + // should only have created a single block past the upgrade height, and its + // certified height should still be equal to the frozen height. + // Note: It is possible not to enter that branch at all if the CUP was created + // before making a new block. + assert!( + !is_checkpointing, + "finalized height should not have exceeded the stall point before finishing checkpointing" + ); + assert_eq!( + finalized_height.get(), + frozen_state_height.get() + ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP + 1, + "finalized height should only exceed the bound by 1, but got {}", + finalized_height + ); + let finalized_certified_height = reader + .get_finalized_block(finalized_height) + .unwrap() + .context + .certified_height; + assert_eq!( + finalized_certified_height, frozen_state_height, + "finalized block at height {} should still have certified_height == {}, but got {}", + finalized_height, frozen_state_height, finalized_certified_height + ); + } + } + + let cup_height = reader.get_catch_up_height(); + // Success condition is to have been able to create a CUP at the upgrade height. + cup_height == upgrade_height + }; + + TestRunner::new(config, true) + .with_stop_predicate(Box::new(stop)) + .with_additional_registry_mutations(Box::new(additional_registry_mutations)) + .run_test(); +} diff --git a/rs/test_utilities/src/state_manager.rs b/rs/test_utilities/src/state_manager.rs index 7c0977193891..d7cf8392f57b 100644 --- a/rs/test_utilities/src/state_manager.rs +++ b/rs/test_utilities/src/state_manager.rs @@ -62,6 +62,9 @@ pub struct FakeStateManager { /// Size 1 by default (no op). pub encode_certified_stream_slice_barrier: Arc>, fd_factory: Arc, + /// When `Some(cap)`, the state manager acts as if its latest state height was capped at `cap`. + /// Used in tests to simulate a slow checkpoint holding certification back. + pub override_max_state_height: Arc>>, } impl Default for FakeStateManager { @@ -92,6 +95,7 @@ impl FakeStateManager { tempdir: Arc::new(tmpdir), encode_certified_stream_slice_barrier: Arc::new(RwLock::new(Barrier::new(1))), fd_factory: Arc::new(TestPageAllocatorFileDescriptorImpl::new()), + override_max_state_height: Arc::new(RwLock::new(None)), } } @@ -287,31 +291,36 @@ impl StateReader for FakeStateManager { type State = ReplicatedState; fn latest_state_height(&self) -> Height { - self.states + let real = self + .states .read() .unwrap() .last() - .map_or(INITIAL_STATE_HEIGHT, |snap| snap.height) + .map_or(INITIAL_STATE_HEIGHT, |snap| snap.height); + match *self.override_max_state_height.read().unwrap() { + Some(cap) => real.min(cap), + None => real, + } } // No certification support in FakeStateManager fn latest_certified_height(&self) -> Height { - self.states + let real = self + .states .read() .unwrap() .iter() - .filter(|s| s.height > Height::from(0) && s.certification.is_some()) - .map(|s| s.height) - .next_back() - .unwrap_or_else(|| Height::from(0)) + .rfind(|s| s.height > Height::from(0) && s.certification.is_some()) + .map_or(INITIAL_STATE_HEIGHT, |snap| snap.height); + match *self.override_max_state_height.read().unwrap() { + Some(cap) => real.min(cap), + None => real, + } } fn get_latest_state(&self) -> Labeled> { - self.states - .read() - .unwrap() - .last() - .map_or_else(initial_state, |snap| snap.make_labeled_state()) + self.get_state_at(self.latest_state_height()) + .expect("latest state is always available in FakeStateManager") } // No certification support in FakeStateManager