Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rs/consensus/src/consensus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ use strum_macros::AsRefStr;
/// We will not notarize or validate artifacts with a height greater than the given
/// value above the latest certification. During validation, the only exception to
/// this are CUPs, which have no upper bound on the height to be validated.
pub(crate) const ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP: u64 = 70;
pub const ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP: u64 = 70;

/// In order to have a bound on the advertised consensus pool, we place a limit on
/// the gap between notarized height and the height of the next pending CUP.
Expand Down
276 changes: 224 additions & 52 deletions rs/consensus/src/consensus/catchup_package_maker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
//! At the moment, we will start to make a CatchUpPackage once a DKG summary
//! block is considered finalized.

use crate::consensus::status;
use ic_consensus_utils::{
active_high_threshold_nidkg_id, crypto::ConsensusCrypto, get_oldest_state_registry_version,
membership::Membership, pool_reader::PoolReader,
Expand Down Expand Up @@ -172,10 +173,35 @@ impl CatchUpPackageMaker {
// Skip if random beacon does not exist for the height
let random_beacon = pool.get_random_beacon(height)?;

let halting = || {
status::should_halt(
height,
self.membership.registry_client.as_ref(),
self.membership.subnet_id,
pool,
&self.log,
) == Some(true)
};
// Skip if the state referenced by finalization tip has not caught up to
// this height. This is to increase the chance that states are available to
// validate payloads at the chain tip.
if pool.get_finalized_tip().context.certified_height < height {
// We make an exception if we are halting at this height, which was introduced after the
// incident on subnet `3hhby` on 2026-05-22.
// Checkpointing was slow at an upgrade boundary, and consensus continued creating blocks
// until reaching `ACCEPTABLE_NOTARIZATION_CERTIFICATION_GAP`, each with a validation
// context's certified height equal to the upgrade height minus 1. When checkpointing
// finally finished, a new certified height was available, but since the block maker is
// always one height ahead of the notary, we had already created a block, also with a
// certified height equal to the upgrade height minus 1. The notary would notarize it but
// reach the bound again. Since the CUP maker (here) waits for the finalized tip's
// validation context's certified height to reach the upgrade height, no CUP was ever
// created, and the subnet stalled.
// By allowing the CUP maker to make a CUP share even when the finalized tip's validation
// context has not caught up to the CUP height, we can ensure that a CUP will be created.
// It is not a problem for the finalized tip's validation context to be behind the CUP
// height, because when we are halting, all blocks have empty payloads anyways, and thus do
// not even need to access states at the validation context's certified height.
if pool.get_finalized_tip().context.certified_height < height && !halting() {
return None;
}

Expand Down Expand Up @@ -267,6 +293,8 @@ mod tests {
dependencies_with_subnet_records_with_raw_state_manager,
};
use ic_logger::replica_logger::no_op_logger;
use ic_protobuf::registry::subnet::v1::SubnetRecord;
use ic_registry_client_helpers::subnet::SubnetRegistry;
use ic_replicated_state::metadata_state::subnet_call_context_manager::{
SetupInitialDkgContext, SignWithThresholdContext,
};
Expand All @@ -283,88 +311,232 @@ mod tests {
use ic_test_utilities_types::ids::{node_test_id, subnet_test_id};
use ic_types::{
CryptoHashOfState, Height, RegistryVersion,
consensus::{BlockPayload, Payload, SummaryPayload, idkg::PreSigId},
consensus::{BlockPayload, BlockProposal, Payload, SummaryPayload, idkg::PreSigId},
crypto::CryptoHash,
};
use rstest::rstest;
use std::sync::{Arc, RwLock};

#[test]
fn test_catch_up_package_maker() {
fn assert_cup_share_matches_block_and_state(
share: &CatchUpPackageShare,
proposal: &BlockProposal,
state_manager: &dyn StateManager<State = ReplicatedState>,
oldest_registry_version_in_use_by_replicated_state: Option<RegistryVersion>,
) {
assert_eq!(&share.content.block, proposal.content.get_hash());
assert_eq!(
share.content.state_hash,
state_manager.get_state_hash_at(proposal.height()).unwrap()
);
assert_eq!(
share
.content
.oldest_registry_version_in_use_by_replicated_state,
oldest_registry_version_in_use_by_replicated_state
);
}

fn with_cup_maker_setup<T>(run: impl FnOnce(CatchUpPackageMaker, u64, Dependencies) -> T) -> T {
ic_test_utilities::artifact_pool_config::with_test_pool_config(|pool_config| {
let interval_length = 5;
let dkg_interval_length = 5;
let committee: Vec<_> = (0..4).map(node_test_id).collect();
let Dependencies {
mut pool,
membership,
replica_config,
crypto,
state_manager,
..
} = dependencies_with_subnet_params(
let mut deps = dependencies_with_subnet_params(
pool_config,
subnet_test_id(0),
vec![(
1,
SubnetRecordBuilder::from(&committee)
.with_dkg_interval_length(interval_length)
.with_dkg_interval_length(dkg_interval_length)
.build(),
)],
);

state_manager
// Ignore state sync and state divergence
deps.state_manager
.get_mut()
.expect_fetch_state()
.return_const(());
deps.state_manager
.get_mut()
.expect_report_diverged_checkpoint()
.return_const(());

deps.state_manager
.get_mut()
.expect_get_state_hash_at()
.return_const(Ok(CryptoHashOfState::from(CryptoHash(vec![1, 2, 3]))));

let message_routing = FakeMessageRouting::new();
*message_routing.next_batch_height.write().unwrap() = Height::from(2);
let message_routing = Arc::new(message_routing);
let message_routing = Arc::new(FakeMessageRouting::new());

let cup_maker = CatchUpPackageMaker::new(
replica_config,
membership,
crypto,
state_manager.clone(),
deps.replica_config.clone(),
deps.membership.clone(),
deps.crypto.clone(),
deps.state_manager.clone(),
message_routing,
no_op_logger(),
);

// 1. Genesis CUP already exists, we won't make a new one
assert!(cup_maker.on_state_change(&PoolReader::new(&pool)).is_none());

// Genesis CUP already exists, we won't make a new one
assert!(
cup_maker
.on_state_change(&PoolReader::new(&deps.pool))
.is_none()
);
// Skip the first DKG interval
pool.advance_round_normal_operation_n(interval_length);
deps.pool
.advance_round_normal_operation_n(dkg_interval_length);

let mut proposal = pool.make_next_block();
let block = proposal.content.as_mut();
block.context.certified_height = block.height();
proposal.content = HashedBlock::new(ic_types::crypto::crypto_hash, block.clone());
pool.insert_validated(proposal.clone());
pool.notarize(&proposal);
pool.finalize(&proposal);
run(cup_maker, dkg_interval_length, deps)
})
}

// 4. Beacon does not exist, we can't make a new CUP share
assert!(cup_maker.on_state_change(&PoolReader::new(&pool)).is_none());
#[test]
fn test_catch_up_package_maker_waits_for_beacon() {
with_cup_maker_setup(
|cup_maker,
_,
Dependencies {
mut pool,
state_manager,
..
}| {
let mut proposal = pool.make_next_block();
let block = proposal.content.as_mut();
block.context.certified_height = block.height();
proposal.content = HashedBlock::new(ic_types::crypto::crypto_hash, block.clone());
pool.insert_validated(proposal.clone());
pool.notarize(&proposal);
pool.finalize(&proposal);

// Beacon does not exist, we can't make a new CUP share
assert!(cup_maker.on_state_change(&PoolReader::new(&pool)).is_none());

// Beacon now exists, we can make a new CUP share
pool.insert_validated(pool.make_next_beacon());
let share = cup_maker
.on_state_change(&PoolReader::new(&pool))
.expect("Expecting CatchUpPackageShare");

assert_cup_share_matches_block_and_state(
&share,
&proposal,
state_manager.as_ref(),
None,
);
},
)
}

// 5. Beacon now exists, we can make a new CUP share
pool.insert_validated(pool.make_next_beacon());
let share = cup_maker
.on_state_change(&PoolReader::new(&pool))
.expect("Expecting CatchUpPackageShare");
#[test]
fn test_catch_up_package_maker_waits_for_finalized_tip_certified_height_to_reach_cup_height() {
with_cup_maker_setup(
|cup_maker,
_,
Dependencies {
mut pool,
state_manager,
..
}| {
let mut summary_proposal = pool.make_next_block();
let summary_block = summary_proposal.content.as_mut();
let summary_height = summary_block.height();
summary_block.context.certified_height = summary_height - 1.into();
summary_proposal.content =
HashedBlock::new(ic_types::crypto::crypto_hash, summary_block.clone());
pool.advance_round_with_block(&summary_proposal);

// Finalized tip's certified height is behind the CUP height, we can't make a new
// CUP share
assert!(cup_maker.on_state_change(&PoolReader::new(&pool)).is_none());

let mut proposal = pool.make_next_block();
let block = proposal.content.as_mut();
block.context.certified_height = summary_height;
proposal.content = HashedBlock::new(ic_types::crypto::crypto_hash, block.clone());
pool.advance_round_with_block(&proposal);

// Finalized tip's certified height has caught up to the CUP height, we can make a
// new CUP share
let share = cup_maker
.on_state_change(&PoolReader::new(&pool))
.expect("Expecting CatchUpPackageShare");

assert_cup_share_matches_block_and_state(
&share,
&summary_proposal,
state_manager.as_ref(),
None,
);
},
)
}

assert_eq!(&share.content.block, proposal.content.get_hash());
assert_eq!(
share.content.state_hash,
state_manager.get_state_hash_at(Height::from(0)).unwrap()
);
assert_eq!(
share
.content
.oldest_registry_version_in_use_by_replicated_state,
None
);
})
#[test]
fn test_catch_up_package_maker_does_not_wait_for_finalized_tip_when_halting() {
with_cup_maker_setup(
|cup_maker,
dkg_interval_length,
Dependencies {
mut pool,
state_manager,
registry,
registry_data_provider,
..
}| {
let existing_subnet_record = registry
.get_subnet_record(subnet_test_id(0), registry_data_provider.latest_version())
.unwrap()
.unwrap();
let upgrade_registry_version = RegistryVersion::from(10);
registry_data_provider
.add(
&ic_registry_keys::make_subnet_record_key(subnet_test_id(0)),
upgrade_registry_version,
Some(SubnetRecord {
replica_version_id: "upgrade_version".to_string(),
..existing_subnet_record
}),
)
.unwrap();
registry.update_to_latest_version();

let mut upgrade_proposal = pool.make_next_block();
let upgrade_block = upgrade_proposal.content.as_mut();
let mut upgrade_summary = upgrade_block.payload.as_ref().as_summary().clone();
// Manually modify the summary's registry version to trigger the update
upgrade_summary.dkg.registry_version = upgrade_registry_version;
upgrade_block.payload = Payload::new(
ic_types::crypto::crypto_hash,
BlockPayload::Summary(upgrade_summary),
);
upgrade_proposal.content =
HashedBlock::new(ic_types::crypto::crypto_hash, upgrade_block.clone());
pool.advance_round_with_block(&upgrade_proposal);
pool.insert_validated(pool.make_catch_up_package(upgrade_proposal.height()));

Comment thread
pierugo-dfinity marked this conversation as resolved.
pool.advance_round_normal_operation_n(dkg_interval_length);

let mut proposal = pool.make_next_block();
let block = proposal.content.as_mut();
block.context.certified_height = block.height() - 1.into();
proposal.content = HashedBlock::new(ic_types::crypto::crypto_hash, block.clone());
pool.advance_round_with_block(&proposal);

// Even if finalized tip's certified height is behind the CUP height, we are halting
// and thus can make a new CUP share
let share = cup_maker
.on_state_change(&PoolReader::new(&pool))
.expect("Expecting CatchUpPackageShare");

assert_cup_share_matches_block_and_state(
&share,
&proposal,
state_manager.as_ref(),
None,
);
},
)
}

/// Build a vector of signature contexts where the oldest matched
Expand Down
14 changes: 8 additions & 6 deletions rs/consensus/tests/framework/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,17 @@ use std::{
/// with required records, including subnet record, node record, node public keys,
/// catch-up package (with proper NiDKG transcripts).
///
/// Return the registry client, catch-up package, and a list of crypto components, one
/// for each node.
/// Return the registry data provider, registry client, catch-up package, and a list of crypto
/// components, one for each node.
#[allow(clippy::type_complexity)]
pub fn setup_subnet<R: Rng + CryptoRng>(
subnet_id: SubnetId,
node_ids: &[NodeId],
dkg_interval_length: u64,
rng: &mut R,
) -> (
Arc<dyn RegistryClient>,
Arc<ProtoRegistryDataProvider>,
Arc<FakeRegistryClient>,
CatchUpPackage,
Vec<Arc<TempCryptoComponentGeneric<ChaCha20Rng>>>,
) {
Expand All @@ -65,7 +68,7 @@ pub fn setup_subnet<R: Rng + CryptoRng>(
let registry_client = Arc::new(FakeRegistryClient::new(Arc::clone(&data_provider) as Arc<_>));

let subnet_record = SubnetRecordBuilder::from(node_ids)
.with_dkg_interval_length(19)
.with_dkg_interval_length(dkg_interval_length)
.with_chain_key_config(ChainKeyConfig {
key_configs: test_master_public_key_ids()
.iter()
Expand Down Expand Up @@ -200,7 +203,6 @@ pub fn setup_subnet<R: Rng + CryptoRng>(
.expect("Could not add chain-key enabled subnet list");
}
registry_client.reload();
registry_client.update_to_latest_version();

let cup_contents = registry_client
.get_cup_contents(subnet_id, registry_client.get_latest_version())
Expand All @@ -215,7 +217,7 @@ pub fn setup_subnet<R: Rng + CryptoRng>(
.with_current_transcripts(ni_transcripts);

let cup = make_genesis(summary);
(registry_client, cup, cryptos)
(data_provider, registry_client, cup, cryptos)
}

pub(crate) fn test_master_public_key_ids() -> Vec<MasterPublicKeyId> {
Expand Down
Loading
Loading