Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,5 @@ Forgetten = "Forgetten"
typ = "typ"
# Token Generation Event
tge = "tge"
# Rust crate name
bimap = "bimap"
51 changes: 45 additions & 6 deletions crates/hotshot/libp2p-networking/src/network/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ pub const ESTABLISHED_LIMIT: NonZeroU32 = NonZeroU32::new(ESTABLISHED_LIMIT_UNWR
/// Number of connections to a single peer before logging an error
pub const ESTABLISHED_LIMIT_UNWR: u32 = 10;

/// AutoNAT confidence at which we treat a Private status as definitive enough
/// to escalate from a transient warning to a loud operator-facing error.
/// Matches libp2p-autonat's default `confidence_max`.
///
/// The escalation check compares the AutoNAT behaviour's reported confidence
/// against this value with `>=` while the status is Private, and the error is
/// emitted at most once per Private episode.
const AUTONAT_CONFIDENCE_MAX: usize = 3;

/// Network definition
#[derive(derive_more::Debug)]
pub struct NetworkNode<T: NodeType, D: DhtPersistentStorage> {
Expand All @@ -104,6 +109,9 @@ pub struct NetworkNode<T: NodeType, D: DhtPersistentStorage> {
dht_handler: DHTBehaviour<T::SignatureKey, D>,
/// Channel to resend requests, set to Some when we call `spawn_listeners`
resend_tx: Option<UnboundedSender<ClientRequest>>,
/// Whether we've already emitted the loud "not publicly reachable" error for the
/// current Private episode. Reset whenever AutoNAT leaves Private status.
autonat_private_logged: bool,
}

impl<T: NodeType, D: DhtPersistentStorage> NetworkNode<T, D> {
Expand Down Expand Up @@ -267,7 +275,7 @@ impl<T: NodeType, D: DhtPersistentStorage> NetworkNode<T, D> {
.set_publication_interval(Some(kademlia_record_republication_interval))
.set_record_ttl(Some(kademlia_ttl));

// allowing panic here because something is very wrong if this fales
// allowing panic here because something is very wrong if this fails
#[allow(clippy::panic)]
if let Some(factor) = config.replication_factor {
kconfig.set_replication_factor(factor);
Expand Down Expand Up @@ -355,6 +363,7 @@ impl<T: NodeType, D: DhtPersistentStorage> NetworkNode<T, D> {
.unwrap_or(NonZeroUsize::new(4).unwrap()),
),
resend_tx: None,
autonat_private_logged: false,
})
}

Expand Down Expand Up @@ -687,16 +696,46 @@ impl<T: NodeType, D: DhtPersistentStorage> NetworkNode<T, D> {
peer,
error,
} => {
debug!("AutoNAT outbound probe to {peer:?} failed: {error:?}");
},
},
autonat::Event::StatusChanged { old, new } => match &new {
autonat::NatStatus::Public(addr) => {
info!(
"AutoNAT: this node is publicly reachable at {addr} (was \
{old:?})"
);
self.autonat_private_logged = false;
},
autonat::NatStatus::Private => {
warn!(
"AutoNAT Probe failed to peer {peer:?} with error: \
{error:?}"
"AutoNAT: probe reports this node may not be publicly \
reachable (was {old:?}). Treating as transient until \
confirmed by repeated probes."
);
},
},
autonat::Event::StatusChanged { old, new } => {
debug!("AutoNAT Status changed. Old: {old:?}, New: {new:?}");
autonat::NatStatus::Unknown => {
debug!("AutoNAT status: {old:?} -> Unknown");
self.autonat_private_logged = false;
},
},
};
let autonat = &self.swarm.behaviour().autonat;
if matches!(autonat.nat_status(), autonat::NatStatus::Private)
&& autonat.confidence() >= AUTONAT_CONFIDENCE_MAX
&& !self.autonat_private_logged
{
error!(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this error is too harsh; we can still perform as leader (and should) without libp2p. If the CDN is also down or we aren't connected, then we will miss slots.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But it's just a log message. If this autonat stuff works correctly then we are in a degraded state and operators should fix it ASAP. We are still continuing.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bfish713 updated message in b9010fd

"AutoNAT: this node is NOT publicly reachable (confirmed by \
repeated probes). Peers cannot direct-message us, so leader \
views may fail and we may accumulate missed slots. Verify \
--libp2p-advertise-address (env \
ESPRESSO_NODE_LIBP2P_ADVERTISE_ADDRESS) is set a publicly \
reachable host:port, and ensure inbound UDP at that port is open \
from the public internet (firewall/NAT/security group)."
);
self.autonat_private_logged = true;
}
None
},
};
Expand Down
Loading