Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions subnet/gossip/gossip_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,11 @@ func (m *mockSigAccumulator) getCalls() []sigAccCall {

// mockDiffFetcher returns pre-configured diffs.
type mockDiffFetcher struct {
diffs []types.Diff
diffs []types.DiffRecord
err error
}

func (m *mockDiffFetcher) GetDiffs(_ context.Context, _, _ uint64) ([]types.Diff, error) {
func (m *mockDiffFetcher) GetDiffs(_ context.Context, _, _ uint64) ([]types.DiffRecord, error) {
return m.diffs, m.err
}

Expand All @@ -110,7 +110,7 @@ type mockStateUpdater struct {
err error
}

func (m *mockStateUpdater) ApplyRecoveredDiffs(_ context.Context, _ []types.Diff) ([]GossipSig, error) {
func (m *mockStateUpdater) ApplyRecoveredDiffs(_ context.Context, _ []types.DiffRecord) ([]GossipSig, error) {
return m.sigs, m.err
}

Expand Down Expand Up @@ -285,9 +285,9 @@ func TestHighestSeen(t *testing.T) {
}

func TestRecovery_TriggersWhenBehind(t *testing.T) {
fetchedDiffs := []types.Diff{
{Nonce: 4, UserSig: []byte("sig4")},
{Nonce: 5, UserSig: []byte("sig5")},
fetchedDiffs := []types.DiffRecord{
{Diff: types.Diff{Nonce: 4, UserSig: []byte("sig4")}},
{Diff: types.Diff{Nonce: 5, UserSig: []byte("sig5")}},
}
recoveredSigs := []GossipSig{
{Nonce: 4, StateHash: []byte("h4"), Sig: []byte("s4"), SlotID: 0},
Expand Down Expand Up @@ -437,9 +437,9 @@ func TestRecovery_UpdatesWatermark(t *testing.T) {
{Nonce: 5, StateHash: []byte("h5"), Sig: []byte("s5"), SlotID: 0},
}

fetcher := &mockDiffFetcher{diffs: []types.Diff{
{Nonce: 4, UserSig: []byte("sig4")},
{Nonce: 5, UserSig: []byte("sig5")},
fetcher := &mockDiffFetcher{diffs: []types.DiffRecord{
{Diff: types.Diff{Nonce: 4, UserSig: []byte("sig4")}},
{Diff: types.Diff{Nonce: 5, UserSig: []byte("sig5")}},
}}
updater := &mockStateUpdater{sigs: recoveredSigs}

Expand Down
4 changes: 2 additions & 2 deletions subnet/gossip/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ type GossipSig struct {

// DiffFetcher retrieves diffs from a peer (backed by HTTPClient).
type DiffFetcher interface {
GetDiffs(ctx context.Context, fromNonce, toNonce uint64) ([]types.Diff, error)
GetDiffs(ctx context.Context, fromNonce, toNonce uint64) ([]types.DiffRecord, error)
}

// StateUpdater applies recovered diffs and signs them (backed by Host).
type StateUpdater interface {
ApplyRecoveredDiffs(ctx context.Context, diffs []types.Diff) ([]GossipSig, error)
ApplyRecoveredDiffs(ctx context.Context, diffs []types.DiffRecord) ([]GossipSig, error)
Comment thread
heitor-lassarote marked this conversation as resolved.
}

// SigAccumulator receives gossip signatures for nonces that are already applied locally.
Expand Down
57 changes: 57 additions & 0 deletions subnet/host/finality.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package host

import (
"subnet/storage"
"subnet/types"
)

// computeFinalizedNonce returns the highest nonce F such that >=2/3 of the
// session group (by slot count) has signed at nonce >= F.
//
// A slot that signed at nonce n is considered to have implicitly confirmed all
// nonces <= n by building on top of them.
func computeFinalizedNonce(store storage.Storage, escrowID string, latestNonce uint64, group []types.SlotAssignment) uint64 {
// confirmedBy[n] = bitmap of slots that have signed at nonce >= n.
// Bitmap128 is a value type (16 bytes); max group size is 128.
confirmedBy := make(map[uint64]types.Bitmap128)

for n := uint64(1); n <= latestNonce; n++ {
sigs, err := store.GetSignatures(escrowID, n)
if err != nil {
continue
}
Copy link
Copy Markdown
Collaborator

@akup akup Apr 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add log at error level here if err != nil

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for slotID := range sigs {
// This slot signed at n, confirming all nonces 1..n.
for prev := uint64(1); prev <= n; prev++ {
bm := confirmedBy[prev] // zero value if absent
bm.Set(slotID)
confirmedBy[prev] = bm
Comment thread
heitor-lassarote marked this conversation as resolved.
Outdated
}
}
}
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is O(N^2) cycle when O(N) is intended to be.
We just don't need inner cycle

for prev := uint64(1); prev <= n; prev++

It could be:

confirmedBy := make(map[uint64]types.Bitmap128)
// running = slots that have signed at any nonce >= current n.
var running types.Bitmap128
for n := latestNonce; n > 0; n-- {
	sigs, err := store.GetSignatures(escrowID, n)
	if err != nil {
		// keep previous running set; just skip adding new signers for this nonce
		confirmedBy[n] = running
		continue
	}
	// A slot that signed at n confirms all nonces <= n.
	// In reverse traversal, that means it belongs to running for n and all lower nonces.
	for slotID := range sigs {
		running.Set(slotID)
	}
	confirmedBy[n] = running
}

O(N) cycle

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moreover we don't need next cycle, we can here add

...
confirmedBy[n] = running
if bitmapSlotWeight(confirmedBy[f], group) >= threshold {
  return n
}

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


threshold := twoThirdsWeight(group)
for f := latestNonce; f > 0; f-- {
if bitmapSlotWeight(confirmedBy[f], group) >= threshold {
return f
Copy link

Copilot AI Mar 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

computeFinalizedNonce uses twoThirdsWeight(group) as the threshold, but twoThirdsWeight currently computes ceil(2/3 * n). This is weaker than the finalization threshold used elsewhere (e.g. Host.checkFinalization uses 2*len(group)/3 + 1, i.e. >2/3). As a result, finalized may be overestimated (e.g. group size 3 => threshold 2 here vs 3 in checkFinalization), causing ApplyRecoveredDiffs to trust WarmKeyDelta from the wire before the nonce is actually finalized. Align the threshold calculation with the existing finalization rule (>2/3).

Copilot uses AI. Check for mistakes.
}
}
Comment thread
heitor-lassarote marked this conversation as resolved.
Outdated
return 0 // warm-up period: not yet finalized
}

// bitmapSlotWeight counts the number of group slots whose bit is set in bm.
func bitmapSlotWeight(bm types.Bitmap128, group []types.SlotAssignment) uint32 {
var total uint32
for _, sa := range group {
if bm.IsSet(sa.SlotID) {
total++
}
}
return total
}

// twoThirdsWeight returns ceil(2/3 * totalSlots).
func twoThirdsWeight(group []types.SlotAssignment) uint32 {
total := uint32(len(group))
return (total*2 + 2) / 3
}
107 changes: 107 additions & 0 deletions subnet/host/finality_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package host

import (
"testing"

"github.com/stretchr/testify/require"

"subnet/storage"
"subnet/types"
)

func makeSlotGroup(n int) []types.SlotAssignment {
g := make([]types.SlotAssignment, n)
for i := 0; i < n; i++ {
g[i] = types.SlotAssignment{SlotID: uint32(i), ValidatorAddress: "v"}
}
return g
}

// newMemorySessionWithNonces creates an escrow session and appends empty diffs for nonces 1..latestNonce.
func newMemorySessionWithNonces(t *testing.T, escrowID string, group []types.SlotAssignment, latestNonce uint64) *storage.Memory {
t.Helper()
store := storage.NewMemory()
err := store.CreateSession(storage.CreateSessionParams{
EscrowID: escrowID,
Group: group,
})
require.NoError(t, err)
for n := uint64(1); n <= latestNonce; n++ {
err := store.AppendDiff(escrowID, types.DiffRecord{Diff: types.Diff{Nonce: n}})
require.NoError(t, err)
}
return store
}

func addSignatures(t *testing.T, store *storage.Memory, escrowID string, byNonce map[uint64][]uint32) {
t.Helper()
for nonce, slots := range byNonce {
for _, slotID := range slots {
err := store.AddSignature(escrowID, nonce, slotID, []byte{1})
require.NoError(t, err)
}
}
}

// F=0 when no nonce reaches the 2/3 slot threshold (4 slots → need 3; only 1 signs nonce 1).
func TestComputeFinalizedNonce_F0_insufficientSigners(t *testing.T) {
group := makeSlotGroup(4)
store := newMemorySessionWithNonces(t, "e1", group, 1)
addSignatures(t, store, "e1", map[uint64][]uint32{1: {0}})

f := computeFinalizedNonce(store, "e1", 1, group)
require.Equal(t, uint64(0), f)
}

// F=N when ≥2/3 slots have signed at some nonce ≥ N; a later partial nonce does not raise F past the
// last nonce that still had a supermajority at that height.
func TestComputeFinalizedNonce_F3_partialNonce4DoesNotRaise(t *testing.T) {
group := makeSlotGroup(4)
store := newMemorySessionWithNonces(t, "e1", group, 4)
addSignatures(t, store, "e1", map[uint64][]uint32{
3: {0, 1, 2},
4: {0, 1},
})

f := computeFinalizedNonce(store, "e1", 4, group)
require.Equal(t, uint64(3), f)
}

// Signing only at a high nonce implies confirmation of all lower nonces (transitivity).
func TestComputeFinalizedNonce_transitivity_onlyHighNonceSupermajority(t *testing.T) {
group := makeSlotGroup(4)
store := newMemorySessionWithNonces(t, "e1", group, 5)
// No signatures on nonces 1–4; slots 0,1,2 sign only at nonce 5 → confirms 1..5.
addSignatures(t, store, "e1", map[uint64][]uint32{
5: {0, 1, 2},
})

f := computeFinalizedNonce(store, "e1", 5, group)
require.Equal(t, uint64(5), f)
}

func TestComputeFinalizedNonce_threshold_groupOf3(t *testing.T) {
group := makeSlotGroup(3) // ceil(2/3 * 3) = 2
store := newMemorySessionWithNonces(t, "e1", group, 1)
addSignatures(t, store, "e1", map[uint64][]uint32{1: {0, 1}})

f := computeFinalizedNonce(store, "e1", 1, group)
require.Equal(t, uint64(1), f)
}

func TestComputeFinalizedNonce_threshold_groupOf6(t *testing.T) {
group := makeSlotGroup(6) // ceil(2/3 * 6) = 4
store := newMemorySessionWithNonces(t, "e1", group, 1)
addSignatures(t, store, "e1", map[uint64][]uint32{1: {0, 1, 2, 3}})

f := computeFinalizedNonce(store, "e1", 1, group)
require.Equal(t, uint64(1), f)
}

func TestComputeFinalizedNonce_latestNonceZero(t *testing.T) {
group := makeSlotGroup(4)
store := newMemorySessionWithNonces(t, "e1", group, 1)

f := computeFinalizedNonce(store, "e1", 0, group)
require.Equal(t, uint64(0), f)
}
28 changes: 24 additions & 4 deletions subnet/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -821,16 +821,36 @@ func (h *Host) AccumulateGossipSig(nonce uint64, stateHash, sig []byte, senderSl
}

// ApplyRecoveredDiffs applies diffs fetched during gossip recovery.
// For each nonce, if F is the BFT-finalized nonce from local signatures
// (see computeFinalizedNonce), nonces at or below F may trust WarmKeyDelta from
// the wire without mainnet resolution; nonces above F fall back to ResolveWarmKey.
// Returns GossipSig for each successfully applied nonce.
func (h *Host) ApplyRecoveredDiffs(ctx context.Context, diffs []types.Diff) ([]gossip.GossipSig, error) {
func (h *Host) ApplyRecoveredDiffs(ctx context.Context, diffs []types.DiffRecord) ([]gossip.GossipSig, error) {
_ = ctx // interface hook; recovery is synchronous under h.mu today
Comment thread
heitor-lassarote marked this conversation as resolved.

h.mu.Lock()
defer h.mu.Unlock()

var latestNonce uint64
for _, rec := range diffs {
if rec.Nonce > latestNonce {
latestNonce = rec.Nonce
}
}

var finalized uint64
if h.store != nil {
finalized = computeFinalizedNonce(h.store, h.escrowID, latestNonce, h.group)
Copy link
Copy Markdown
Collaborator

@akup akup Apr 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are calling here computeFinalizedNonce that is using stored values to calculate finalized

sigs, err := store.GetSignatures(escrowID, n)

but it seams store is empty as peer signatures written after catch-up gossip exchange

Am I right or missed something? where do we fill the store?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

computeFinalizedNonce runs before we applyAndPersist the recovered diffs. Peer sigs only go into storage via AccumulateGossipSig, which requires a stored diff for that nonce first, so for nonces in the recovery batch, peer sigs usually aren’t there yet at that line. Recovery also gossips our sigs after apply, so others’ sigs for that gap often arrive later. So finalized is a pre-recovery snapshot; that’s conservative for trusting WarmKeyDelta during catch-up.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean that at the next lines we inject warm keys only for finalized nonce:

if rec.Nonce <= finalized && len(rec.WarmKeyDelta) > 0 {
	h.sm.InjectWarmKeys(rec.WarmKeyDelta)
}

So we have following situation:

  1. We are trying to catch up
  2. We got the diff and empty storage
  3. We are trying to computeFinalizedNonce, but because of empty storage it is 0
  4. We do not InjectWarmKeys as condition rec.Nonce <= finalized is never met
  5. We fail on applyAndPersist if there was warm key changes

I think we need somehow get signatures at computeFinalizedNonce, maybe not from store ( store.GetSignatures(escrowID, n) ) but from diffs

}

Comment on lines +841 to +845
Copy link

Copilot AI Mar 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New finality/warm-key trust behavior is introduced via computeFinalizedNonce and the rec.Nonce <= finalized gate, but there are no unit tests covering (a) the finalized threshold semantics across group sizes and (b) that WarmKeyDelta is only trusted up to the finalized nonce. Adding focused tests for computeFinalizedNonce and ApplyRecoveredDiffs would help prevent regressions in recovery correctness.

Copilot uses AI. Check for mistakes.
var sigs []gossip.GossipSig

for _, diff := range diffs {
if err := h.applyAndPersist(diff); err != nil {
return sigs, fmt.Errorf("apply recovered diff nonce %d: %w", diff.Nonce, err)
for _, rec := range diffs {
if rec.Nonce <= finalized && len(rec.WarmKeyDelta) > 0 {
h.sm.InjectWarmKeys(rec.WarmKeyDelta)
}
if err := h.applyAndPersist(rec.Diff); err != nil {
return sigs, fmt.Errorf("apply recovered diff nonce %d: %w", rec.Nonce, err)
}
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is the possible attack from peer:
peer provides wrong warm key diffs and they are stored at h.sm.InjectWarmKeys(rec.WarmKeyDelta)

Then h.applyAndPersist fails

If attacker provides change in warm key where there was no change, we will not be able to catch up and recover at all even on retry as warm key is already broken.

We should make a transaction here and rollback if h.applyAndPersist fails

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


// Sign state with acceptance check (same path as HandleRequest).
Expand Down
Loading
Loading