diff --git a/ansible/roles/ethlambda/tasks/main.yml b/ansible/roles/ethlambda/tasks/main.yml index 28e222d..ab17d87 100644 --- a/ansible/roles/ethlambda/tasks/main.yml +++ b/ansible/roles/ethlambda/tasks/main.yml @@ -50,32 +50,24 @@ ethlambda_is_aggregator: "{{ 'true' if (ethlambda_node_config.results[3].stdout | default('') | trim) == 'true' else 'false' }}" when: ethlambda_node_config is defined -# Compute the full set of subnet ids in the network so aggregators can subscribe -# to attestations from every subnet (not just the one their validators live in). -# Required in multi-subnet deployments for cross-subnet attestation aggregation. -- name: Compute aggregate subnet ids from attestation_committee_count +# Compute this host's attestation committee subnet id (for diagnostics / +# optional --aggregate-subnet-ids when the CSV lists multiple subnets). +# blockblaz/zeam#863 follow-up — see ansible/roles/zeam/tasks/main.yml. +# Per-host (NOT run_once). +- name: Compute aggregate subnet id for {{ node_name }} shell: | set -e - ac=$(yq eval '.config.attestation_committee_count // 1' "{{ local_validator_config_path }}") - ac=$(echo "$ac" | tr -d '\r\n' | head -1) - case "$ac" in ''|*[!0-9]*) ac=1;; esac - if [ "$ac" -lt 1 ]; then ac=1; fi - out="0" - i=1 - while [ "$i" -lt "$ac" ]; do - out="$out,$i" - i=$((i + 1)) - done - echo "$out" + project_root="$(cd '{{ playbook_dir }}/../..' 
&& pwd)" + "$project_root/compute-aggregate-subnet-ids.sh" "{{ local_validator_config_path }}" "{{ node_name }}" register: ethlambda_all_subnets_raw changed_when: false delegate_to: localhost - run_once: true + when: node_name is defined - name: Set aggregate subnet ids csv set_fact: ethlambda_aggregate_subnet_ids: "{{ ethlambda_all_subnets_raw.stdout | trim }}" - run_once: true + when: ethlambda_all_subnets_raw is defined and ethlambda_all_subnets_raw.stdout is defined - name: Ensure node key file exists stat: diff --git a/ansible/roles/zeam/tasks/main.yml b/ansible/roles/zeam/tasks/main.yml index d855d77..19f3893 100644 --- a/ansible/roles/zeam/tasks/main.yml +++ b/ansible/roles/zeam/tasks/main.yml @@ -56,32 +56,32 @@ zeam_is_aggregator: "{{ 'true' if (node_config.results[3].stdout | default('') | trim) == 'true' else 'false' }}" when: node_config is defined -# Compute the full set of subnet ids in the network so aggregators can subscribe -# to attestations from every subnet (not just the one their validators live in). -# Required in multi-subnet deployments for cross-subnet attestation aggregation. -- name: Compute aggregate subnet ids from attestation_committee_count +# Compute this host's attestation committee subnet id (for diagnostics / +# optional --aggregate-subnet-ids when the CSV lists multiple subnets). +# +# blockblaz/zeam#863 follow-up: pre-fix every aggregator subscribed to +# every subnet, which under multi-subnet load fans every gossip attestation +# N-ways into zeam's libxev thread. The helper now emits a single id (own +# subnet only); lean-quickstart selects one aggregator per subnet so +# validator-derived gossip subscriptions suffice without a neighbor subnet. +# +# Per-host (NOT run_once): each value depends on which validator that host +# is running, so this task runs in the per-host fact-set context after +# node_config has resolved {{ node_name }}. 
+- name: Compute aggregate subnet id for {{ node_name }} shell: | set -e - ac=$(yq eval '.config.attestation_committee_count // 1' "{{ local_validator_config_path }}") - ac=$(echo "$ac" | tr -d '\r\n' | head -1) - case "$ac" in ''|*[!0-9]*) ac=1;; esac - if [ "$ac" -lt 1 ]; then ac=1; fi - out="0" - i=1 - while [ "$i" -lt "$ac" ]; do - out="$out,$i" - i=$((i + 1)) - done - echo "$out" + project_root="$(cd '{{ playbook_dir }}/../..' && pwd)" + "$project_root/compute-aggregate-subnet-ids.sh" "{{ local_validator_config_path }}" "{{ node_name }}" register: zeam_all_subnets_raw changed_when: false delegate_to: localhost - run_once: true + when: node_name is defined - name: Set aggregate subnet ids csv set_fact: zeam_aggregate_subnet_ids: "{{ zeam_all_subnets_raw.stdout | trim }}" - run_once: true + when: zeam_all_subnets_raw is defined and zeam_all_subnets_raw.stdout is defined - name: Ensure node key file exists stat: diff --git a/client-cmds/ethlambda-cmd.sh b/client-cmds/ethlambda-cmd.sh index 1ea79ee..74c4b4b 100644 --- a/client-cmds/ethlambda-cmd.sh +++ b/client-cmds/ethlambda-cmd.sh @@ -10,10 +10,11 @@ if [ "$isAggregator" == "true" ]; then aggregator_flag="--is-aggregator" fi -# In multi-subnet deployments, an aggregator must subscribe to every subnet's -# attestation topics so it can aggregate votes from all committees. The caller -# (spin-node.sh / ansible roles) exports aggregateSubnetIds as a CSV of the -# full subnet id set for the network. +# Multi-subnet: lean-quickstart picks one aggregator per subnet (spin-node.sh). +# compute-aggregate-subnet-ids.sh reports the node's own committee subnet id; +# Zeam and other clients derive attestation gossip from local validator placement, +# so --aggregate-subnet-ids is only passed when the CSV explicitly lists multiple +# ids (comma). Background: blockblaz/zeam#863. 
aggregate_subnet_ids_flag="" if [ "$isAggregator" == "true" ] && [ -n "${aggregateSubnetIds:-}" ] && [[ "$aggregateSubnetIds" == *,* ]]; then aggregate_subnet_ids_flag="--aggregate-subnet-ids $aggregateSubnetIds" diff --git a/client-cmds/gean-cmd.sh b/client-cmds/gean-cmd.sh index 53af74f..f30a08e 100644 --- a/client-cmds/gean-cmd.sh +++ b/client-cmds/gean-cmd.sh @@ -9,10 +9,11 @@ if [ "$isAggregator" == "true" ]; then aggregator_flag="--is-aggregator" fi -# In multi-subnet deployments, an aggregator must subscribe to every subnet's -# attestation topics so it can aggregate votes from all committees. The caller -# (spin-node.sh / ansible roles) exports aggregateSubnetIds as a CSV of the -# full subnet id set for the network. +# Multi-subnet: lean-quickstart picks one aggregator per subnet (spin-node.sh). +# compute-aggregate-subnet-ids.sh reports the node's own committee subnet id; +# Zeam and other clients derive attestation gossip from local validator placement, +# so --aggregate-subnet-ids is only passed when the CSV explicitly lists multiple +# ids (comma). Background: blockblaz/zeam#863. aggregate_subnet_ids_flag="" if [ "$isAggregator" == "true" ] && [ -n "${aggregateSubnetIds:-}" ] && [[ "$aggregateSubnetIds" == *,* ]]; then aggregate_subnet_ids_flag="--aggregate-subnet-ids $aggregateSubnetIds" diff --git a/client-cmds/grandine-cmd.sh b/client-cmds/grandine-cmd.sh index 8828ed6..8ad9d03 100644 --- a/client-cmds/grandine-cmd.sh +++ b/client-cmds/grandine-cmd.sh @@ -6,10 +6,11 @@ if [ "$isAggregator" == "true" ]; then aggregator_flag="--is-aggregator" fi -# In multi-subnet deployments, an aggregator must subscribe to every subnet's -# attestation topics so it can aggregate votes from all committees. The caller -# (spin-node.sh / ansible roles) exports aggregateSubnetIds as a CSV of the -# full subnet id set for the network. +# Multi-subnet: lean-quickstart picks one aggregator per subnet (spin-node.sh). 
+# compute-aggregate-subnet-ids.sh reports the node's own committee subnet id; +# Zeam and other clients derive attestation gossip from local validator placement, +# so --aggregate-subnet-ids is only passed when the CSV explicitly lists multiple +# ids (comma). Background: blockblaz/zeam#863. aggregate_subnet_ids_flag="" if [ "$isAggregator" == "true" ] && [ -n "${aggregateSubnetIds:-}" ] && [[ "$aggregateSubnetIds" == *,* ]]; then aggregate_subnet_ids_flag="--aggregate-subnet-ids $aggregateSubnetIds" diff --git a/client-cmds/lantern-cmd.sh b/client-cmds/lantern-cmd.sh index 19f1b86..21a617c 100755 --- a/client-cmds/lantern-cmd.sh +++ b/client-cmds/lantern-cmd.sh @@ -14,10 +14,11 @@ if [ "$isAggregator" == "true" ]; then aggregator_flag="--is-aggregator" fi -# In multi-subnet deployments, an aggregator must subscribe to every subnet's -# attestation topics so it can aggregate votes from all committees. The caller -# (spin-node.sh / ansible roles) exports aggregateSubnetIds as a CSV of the -# full subnet id set for the network. +# Multi-subnet: lean-quickstart picks one aggregator per subnet (spin-node.sh). +# compute-aggregate-subnet-ids.sh reports the node's own committee subnet id; +# Zeam and other clients derive attestation gossip from local validator placement, +# so --aggregate-subnet-ids is only passed when the CSV explicitly lists multiple +# ids (comma). Background: blockblaz/zeam#863. 
aggregate_subnet_ids_flag="" if [ "$isAggregator" == "true" ] && [ -n "${aggregateSubnetIds:-}" ] && [[ "$aggregateSubnetIds" == *,* ]]; then aggregate_subnet_ids_flag="--aggregate-subnet-ids $aggregateSubnetIds" diff --git a/client-cmds/lighthouse-cmd.sh b/client-cmds/lighthouse-cmd.sh index 4d25b83..fd1e691 100644 --- a/client-cmds/lighthouse-cmd.sh +++ b/client-cmds/lighthouse-cmd.sh @@ -9,10 +9,11 @@ if [ "$isAggregator" == "true" ]; then aggregator_flag="--is-aggregator" fi -# In multi-subnet deployments, an aggregator must subscribe to every subnet's -# attestation topics so it can aggregate votes from all committees. The caller -# (spin-node.sh / ansible roles) exports aggregateSubnetIds as a CSV of the -# full subnet id set for the network. +# Multi-subnet: lean-quickstart picks one aggregator per subnet (spin-node.sh). +# compute-aggregate-subnet-ids.sh reports the node's own committee subnet id; +# Zeam and other clients derive attestation gossip from local validator placement, +# so --aggregate-subnet-ids is only passed when the CSV explicitly lists multiple +# ids (comma). Background: blockblaz/zeam#863. aggregate_subnet_ids_flag="" if [ "$isAggregator" == "true" ] && [ -n "${aggregateSubnetIds:-}" ] && [[ "$aggregateSubnetIds" == *,* ]]; then aggregate_subnet_ids_flag="--aggregate-subnet-ids $aggregateSubnetIds" diff --git a/client-cmds/nlean-cmd.sh b/client-cmds/nlean-cmd.sh index 9d28a5c..e6b461e 100755 --- a/client-cmds/nlean-cmd.sh +++ b/client-cmds/nlean-cmd.sh @@ -35,10 +35,11 @@ if [[ "${isAggregator:-false}" == "true" ]]; then aggregator_flag="--is-aggregator" fi -# In multi-subnet deployments, an aggregator must subscribe to every subnet's -# attestation topics so it can aggregate votes from all committees. The caller -# (spin-node.sh / ansible roles) exports aggregateSubnetIds as a CSV of the -# full subnet id set for the network. +# Multi-subnet: lean-quickstart picks one aggregator per subnet (spin-node.sh). 
+# compute-aggregate-subnet-ids.sh reports the node's own committee subnet id; +# Zeam and other clients derive attestation gossip from local validator placement, +# so --aggregate-subnet-ids is only passed when the CSV explicitly lists multiple +# ids (comma). Background: blockblaz/zeam#863. aggregate_subnet_ids_flag="" if [[ "${isAggregator:-false}" == "true" ]] && [[ -n "${aggregateSubnetIds:-}" ]] && [[ "$aggregateSubnetIds" == *,* ]]; then aggregate_subnet_ids_flag="--aggregate-subnet-ids $aggregateSubnetIds" diff --git a/client-cmds/peam-cmd.sh b/client-cmds/peam-cmd.sh index d1863b1..bf681bf 100644 --- a/client-cmds/peam-cmd.sh +++ b/client-cmds/peam-cmd.sh @@ -53,13 +53,14 @@ if [ "$isAggregator" == "true" ]; then aggregator_flag="--is-aggregator" fi -# In multi-subnet deployments, an aggregator must subscribe to every subnet's -# attestation topics so it can aggregate votes from all committees. The caller -# (spin-node.sh / ansible roles) exports aggregateSubnetIds as a CSV of the -# full subnet id set for the network. Note: peam already subscribes to all -# subnets in [0, committee_count) via allowed_topics above; this flag exists -# for contract parity with other clients and is a no-op unless the binary -# recognises it. +# Multi-subnet: lean-quickstart picks one aggregator per subnet (spin-node.sh). +# compute-aggregate-subnet-ids.sh reports the node's own committee subnet id; +# clients derive attestation gossip from local validator placement, so +# --aggregate-subnet-ids is only passed when the CSV lists multiple ids (comma). +# Background: blockblaz/zeam#863. +# Note: peam already subscribes to all subnets in [0, committee_count) +# via allowed_topics above; this flag exists for contract parity with +# other clients and is a no-op unless the binary recognises it. 
aggregate_subnet_ids_flag="" if [ "$isAggregator" == "true" ] && [ -n "${aggregateSubnetIds:-}" ] && [[ "$aggregateSubnetIds" == *,* ]]; then aggregate_subnet_ids_flag="--aggregate-subnet-ids $aggregateSubnetIds" diff --git a/client-cmds/qlean-cmd.sh b/client-cmds/qlean-cmd.sh index b228e6b..6079e6d 100644 --- a/client-cmds/qlean-cmd.sh +++ b/client-cmds/qlean-cmd.sh @@ -21,10 +21,11 @@ if [ "$isAggregator" == "true" ]; then aggregator_flag="--is-aggregator" fi -# In multi-subnet deployments, an aggregator must subscribe to every subnet's -# attestation topics so it can aggregate votes from all committees. The caller -# (spin-node.sh / ansible roles) exports aggregateSubnetIds as a CSV of the -# full subnet id set for the network. +# Multi-subnet: lean-quickstart picks one aggregator per subnet (spin-node.sh). +# compute-aggregate-subnet-ids.sh reports the node's own committee subnet id; +# Zeam and other clients derive attestation gossip from local validator placement, +# so --aggregate-subnet-ids is only passed when the CSV explicitly lists multiple +# ids (comma). Background: blockblaz/zeam#863. aggregate_subnet_ids_flag="" if [ "$isAggregator" == "true" ] && [ -n "${aggregateSubnetIds:-}" ] && [[ "$aggregateSubnetIds" == *,* ]]; then aggregate_subnet_ids_flag="--aggregate-subnet-ids $aggregateSubnetIds" diff --git a/client-cmds/ream-cmd.sh b/client-cmds/ream-cmd.sh index 41315c4..2e5bc83 100755 --- a/client-cmds/ream-cmd.sh +++ b/client-cmds/ream-cmd.sh @@ -10,10 +10,11 @@ if [ "$isAggregator" == "true" ]; then aggregator_flag="--is-aggregator" fi -# In multi-subnet deployments, an aggregator must subscribe to every subnet's -# attestation topics so it can aggregate votes from all committees. The caller -# (spin-node.sh / ansible roles) exports aggregateSubnetIds as a CSV of the -# full subnet id set for the network. +# Multi-subnet: lean-quickstart picks one aggregator per subnet (spin-node.sh). 
+# compute-aggregate-subnet-ids.sh reports the node's own committee subnet id; +# Zeam and other clients derive attestation gossip from local validator placement, +# so --aggregate-subnet-ids is only passed when the CSV explicitly lists multiple +# ids (comma). Background: blockblaz/zeam#863. aggregate_subnet_ids_flag="" if [ "$isAggregator" == "true" ] && [ -n "${aggregateSubnetIds:-}" ] && [[ "$aggregateSubnetIds" == *,* ]]; then aggregate_subnet_ids_flag="--aggregate-subnet-ids $aggregateSubnetIds" diff --git a/client-cmds/zeam-cmd.sh b/client-cmds/zeam-cmd.sh index dbe7207..0332847 100644 --- a/client-cmds/zeam-cmd.sh +++ b/client-cmds/zeam-cmd.sh @@ -17,10 +17,11 @@ if [ "$isAggregator" == "true" ]; then aggregator_flag="--is-aggregator" fi -# In multi-subnet deployments, an aggregator must subscribe to every subnet's -# attestation topics so it can aggregate votes from all committees. The caller -# (spin-node.sh / ansible roles) exports aggregateSubnetIds as a CSV of the -# full subnet id set for the network. +# Multi-subnet: lean-quickstart picks one aggregator per subnet (spin-node.sh). +# compute-aggregate-subnet-ids.sh reports the node's own committee subnet id; +# Zeam and other clients derive attestation gossip from local validator placement, +# so --aggregate-subnet-ids is only passed when the CSV explicitly lists multiple +# ids (comma). Background: blockblaz/zeam#863. aggregate_subnet_ids_flag="" if [ "$isAggregator" == "true" ] && [ -n "${aggregateSubnetIds:-}" ] && [[ "$aggregateSubnetIds" == *,* ]]; then aggregate_subnet_ids_flag="--aggregate-subnet-ids $aggregateSubnetIds" diff --git a/compute-aggregate-subnet-ids.sh b/compute-aggregate-subnet-ids.sh new file mode 100755 index 0000000..f4e5862 --- /dev/null +++ b/compute-aggregate-subnet-ids.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# +# compute-aggregate-subnet-ids.sh +# +# Print the attestation subnet id of the given node (always a single id, +# so the "CSV" that callers consume never contains a comma). 
+# +# Why this exists (blockblaz/zeam#863 follow-up): +# The pre-fix shape exported a CSV of EVERY attestation subnet for +# every aggregator. With attestation_committee_count=N the libxev +# thread on each aggregator received N copies of every gossip +# attestation (one per subscribed topic), and on a 4-subnet devnet-4 +# aggregator that 4× fan-in was a primary contributor to slot-driver +# starvation (~74% of received attestations referenced unimported +# heads, each spawned a BlocksByRoot, the storm fed itself). +# +# We now output only this node's OWN attestation subnet id. Zeam +# already subscribes aggregators to attestation topics implied by +# local validator ids; lean-quickstart uses one aggregator per subnet +# (spin-node.sh), so an extra neighbor subnet on every aggregator is +# redundant and doubles gossip load for N=2. +# +# The output is always a single id (no comma). Callers such as +# client-cmds/zeam-cmd.sh only pass `--aggregate-subnet-ids` when the +# CSV contains a comma, so in normal layouts the flag is omitted and +# subscription comes from validator placement alone. +# +# Usage: +# compute-aggregate-subnet-ids.sh <validator-config.yaml> <node-name> +# +# Output: +# `<subnet-id>` (single subnet id, no comma) +# +# Subnet selection priority for `<subnet-id>`: +# 1. Per-row `subnet:` field if present (matches the source of truth +# that `_node_subnet` in spin-node.sh consults — generated by +# generate-subnet-config.py / hand-edited devnet layouts). +# 2. Fallback: `validator_index % attestation_committee_count` +# where validator_index is the cumulative sum of the prior rows' +# `count` fields (matches how clients themselves choose subnets). +# +# Exits 0 with the CSV on stdout. Errors go to stderr and exit non-zero. + +set -euo pipefail + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <validator-config.yaml> <node-name>" >&2 + exit 2 +fi + +cfg="$1" +node="$2" + +if [ ! -f "$cfg" ]; then + echo "Error: validator config not found: $cfg" >&2 + exit 1 +fi + +if ! 
command -v yq >/dev/null 2>&1; then + echo "Error: yq is required (brew install yq / apt install yq)" >&2 + exit 1 +fi + +# attestation_committee_count: first output line only, sanitised to a positive integer (default 1). +ac=$(yq eval '.config.attestation_committee_count // 1' "$cfg" | sed -n '1p' | tr -d '\r\n') +case "$ac" in ''|*[!0-9]*) ac=1;; esac +if [ "$ac" -lt 1 ] 2>/dev/null; then ac=1; fi + +# Own subnet: prefer the explicit `subnet:` field on the first validator row matching the node. +own=$(yq eval ".validators[] | select(.name == \"$node\") | .subnet // \"\"" "$cfg" | sed -n '1p' | tr -d '\r\n') + +if [ -z "$own" ]; then + # Fallback path mirrors `_node_subnet` in spin-node.sh: cumulative + # `count` sum of the rows preceding this node, modulo `ac`. yq emits + # rows as `<name> <count>`; we walk them in declaration order. + vi=0 + while IFS=' ' read -r row_name row_count; do + if [ "$row_name" = "$node" ]; then + own=$(( vi % ac )) + break + fi + # Default count to 1 when the row omits the field — matches + # generate-subnet-config.py's implicit assumption. + case "$row_count" in ''|null|*[!0-9]*) row_count=1;; esac + vi=$(( vi + row_count )) + done < <(yq eval '.validators[] | .name + " " + ((.count // 1) | tostring)' "$cfg") + + if [ -z "$own" ]; then + echo "Error: node '$node' not found in $cfg" >&2 + exit 1 + fi +fi + +# Sanity-bound `own` so a stray YAML value (e.g. subnet: 99 or subnet: foo with +# attestation_committee_count: 4) doesn't silently produce a CSV +# referring to non-existent subnets — a failed integer test counts as invalid too. +if ! [ "$own" -ge 0 ] 2>/dev/null || [ "$own" -ge "$ac" ]; then + echo "Error: node '$node' subnet=$own is invalid or out of range for attestation_committee_count=$ac" >&2 + exit 1 +fi + +printf '%s\n' "$own" diff --git a/parse-vc.sh b/parse-vc.sh index a3b0341..41658cd 100644 --- a/parse-vc.sh +++ b/parse-vc.sh @@ -69,20 +69,27 @@ if [ -z "$isAggregator" ] || [ "$isAggregator" == "null" ]; then isAggregator="false" fi -# CSV of all attestation subnet ids (e.g. "0,1"). 
Clients do not read a YAML -# `subnet:` field for consensus — subnets are validator_index % committee_count. -# Aggregators must still hear every subnet, so derive ids from -# config.attestation_committee_count (not from per-validator subnet metadata). -_ac=$(yq eval '.config.attestation_committee_count // 1' "$validator_config_file") -_ac=$(echo "$_ac" | tr -d '\r\n' | head -1) -case "$_ac" in ''|*[!0-9]*) _ac=1;; esac -if [ "$_ac" -lt 1 ] 2>/dev/null; then _ac=1; fi -aggregateSubnetIds="0" -_i=1 -while [ "$_i" -lt "$_ac" ] 2>/dev/null; do - aggregateSubnetIds+=",$_i" - _i=$((_i + 1)) -done +# CSV of attestation subnet ids THIS aggregator should subscribe to. +# +# blockblaz/zeam#863 follow-up: instead of every aggregator listening +# to every subnet (which under multi-subnet load fans every gossip +# attestation N-ways into the libxev thread), compute-aggregate-subnet-ids.sh +# now reports only this node's OWN committee subnet id. With one +# aggregator per subnet (spin-node.sh), clients' validator-derived gossip +# subscriptions are enough; a comma-separated list is only for rare +# explicit overrides (see client-cmds: they pass the flag when the CSV +# contains a comma). +# +# Computation lives in compute-aggregate-subnet-ids.sh so the same +# helper is reused by ansible/roles/{zeam,ethlambda}/tasks/main.yml +# without yq-logic drift between the two paths. +_compute_helper="${scriptDir:-$(dirname "${BASH_SOURCE[0]}")}/compute-aggregate-subnet-ids.sh" +if [ -x "$_compute_helper" ]; then + aggregateSubnetIds=$("$_compute_helper" "$validator_config_file" "$item") +else + echo "Warning: $_compute_helper not found or not executable; falling back to single-subnet coverage" >&2 + aggregateSubnetIds="0" +fi export aggregateSubnetIds # Extract attestation_committee_count from config section (optional - only if explicitly set)