diff --git a/Cargo.lock b/Cargo.lock index dad95ed252..e3fd286e47 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3002,6 +3002,18 @@ dependencies = [ "tracer", ] +[[package]] +name = "jolt-inlines-poseidon2-goldilocks" +version = "0.1.0" +dependencies = [ + "jolt-inlines-sdk", + "p3-field", + "p3-goldilocks", + "p3-poseidon2", + "p3-symmetric", + "tracer", +] + [[package]] name = "jolt-inlines-sdk" version = "0.1.0" @@ -3909,6 +3921,175 @@ dependencies = [ "jolt-sdk", ] +[[package]] +name = "p3-challenger" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8972ccd1d5dc90e46cdb1f2ab4ee2bae49b3917e5e98aa533f0c2b779c010445" +dependencies = [ + "p3-field", + "p3-maybe-rayon", + "p3-monty-31", + "p3-symmetric", + "p3-util", + "tracing", +] + +[[package]] +name = "p3-dft" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17771aca44632f9cc11f2718d7ea7ec06794946c4190ef3a985bfc893f14c18a" +dependencies = [ + "itertools 0.14.0", + "p3-field", + "p3-matrix", + "p3-maybe-rayon", + "p3-util", + "spin 0.10.0", + "tracing", +] + +[[package]] +name = "p3-field" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f3eb24d0591fd4d282d89cbe4e4efba5571c699375006f80b2cbf53ce83461c" +dependencies = [ + "itertools 0.14.0", + "num-bigint", + "p3-maybe-rayon", + "p3-util", + "paste", + "rand 0.10.1", + "serde", + "tracing", +] + +[[package]] +name = "p3-goldilocks" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5751c6591a0d2397d726620c2c29a7436ec6c5e19d2ed74ca5d078d4fbb18eb5" +dependencies = [ + "num-bigint", + "p3-challenger", + "p3-dft", + "p3-field", + "p3-mds", + "p3-poseidon1", + "p3-poseidon2", + "p3-symmetric", + "p3-util", + "paste", + "rand 0.10.1", + "serde", +] + +[[package]] +name = "p3-matrix" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ea9c94c0714944e7b8a9a62e6340b1e3e1d3f8ecfd3e35c08798360200e73eff" +dependencies = [ + "itertools 0.14.0", + "p3-field", + "p3-maybe-rayon", + "p3-util", + "rand 0.10.1", + "serde", + "tracing", +] + +[[package]] +name = "p3-maybe-rayon" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eebc233a34b1ab0273f35b4052fa2eeb3114b22ba4575bd7da00716e878ffb77" + +[[package]] +name = "p3-mds" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b5441fa8116246ec9e6c835f15273cb27777ca572960ec87476b67fef13e01e" +dependencies = [ + "p3-dft", + "p3-field", + "p3-symmetric", + "p3-util", + "rand 0.10.1", +] + +[[package]] +name = "p3-monty-31" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8724f330ea6d19dd4f2436aa0f88b5fcbf88f0f55ca7fccd3fea8b736dbcddad" +dependencies = [ + "itertools 0.14.0", + "num-bigint", + "p3-dft", + "p3-field", + "p3-matrix", + "p3-maybe-rayon", + "p3-mds", + "p3-poseidon1", + "p3-poseidon2", + "p3-symmetric", + "p3-util", + "paste", + "rand 0.10.1", + "serde", + "spin 0.10.0", + "tracing", +] + +[[package]] +name = "p3-poseidon1" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e2a562fea210baae390a32f9ecf0dd8724ae3f4352d1c8e413077b6f00a162" +dependencies = [ + "p3-field", + "p3-symmetric", + "rand 0.10.1", +] + +[[package]] +name = "p3-poseidon2" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06394851c161d17e4aa4ad2aad5557d32f14cadd1dc838f965d8e1821a63b8c5" +dependencies = [ + "p3-field", + "p3-mds", + "p3-symmetric", + "p3-util", + "rand 0.10.1", +] + +[[package]] +name = "p3-symmetric" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ac1a276d421f8ef3361bb7d8c39a02c93c6b3f10eeaa559cc4c50222f9a5b82" +dependencies = [ + "itertools 0.14.0", + "p3-field", + "p3-util", 
+ "serde", +] + +[[package]] +name = "p3-util" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d08a58162a4c264269ef454f0b28dcda89939490eecacb2b2cf5b00f719b80f6" +dependencies = [ + "serde", + "transpose", +] + [[package]] name = "page_size" version = "0.6.0" @@ -4295,7 +4476,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", - "heck 0.4.1", + "heck 0.5.0", "itertools 0.12.1", "log", "multimap", @@ -4329,7 +4510,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools 0.12.1", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.117", @@ -4427,6 +4608,15 @@ dependencies = [ "serde", ] +[[package]] +name = "rand" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" +dependencies = [ + "rand_core 0.10.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -4466,6 +4656,12 @@ dependencies = [ "serde", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "rand_xorshift" version = "0.4.0" @@ -5567,6 +5763,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + [[package]] name = "strsim" version = "0.11.1" @@ -5982,6 +6184,16 @@ dependencies = [ "zklean-extractor", ] 
+[[package]] +name = "transpose" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e" +dependencies = [ + "num-integer", + "strength_reduce", +] + [[package]] name = "tree-sitter" version = "0.20.9" diff --git a/Cargo.toml b/Cargo.toml index d643be1a5a..9ece2d7559 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,6 +53,7 @@ members = [ "jolt-inlines/secp256k1", "jolt-inlines/grumpkin", "jolt-inlines/p256", + "jolt-inlines/poseidon2-goldilocks", "examples/btreemap/host", "examples/btreemap/guest", "examples/collatz", @@ -254,6 +255,10 @@ sha3 = "0.11" blake2 = "0.11.0-rc.6" blake3 = { version = "1.8.5" } light-poseidon = "0.4" +p3-field = "0.5.3" +p3-goldilocks = "0.5.3" +p3-poseidon2 = "0.5.3" +p3-symmetric = "0.5.3" digest = "0.11" jolt-optimizations = { git = "https://github.com/a16z/arkworks-algebra", branch = "dev/twist-shout" } dory = { package = "dory-pcs", version = "0.3.0", features = [ @@ -400,3 +405,4 @@ jolt-inlines-bigint = { path = "./jolt-inlines/bigint", default-features = false jolt-inlines-secp256k1 = { path = "./jolt-inlines/secp256k1", default-features = false } jolt-inlines-grumpkin = { path = "./jolt-inlines/grumpkin", default-features = false } jolt-inlines-p256 = { path = "./jolt-inlines/p256", default-features = false } +jolt-inlines-poseidon2-goldilocks = { path = "./jolt-inlines/poseidon2-goldilocks", default-features = false } diff --git a/crates/jolt-riscv/src/profile.rs b/crates/jolt-riscv/src/profile.rs index 26839dc187..6f8ef12bfb 100644 --- a/crates/jolt-riscv/src/profile.rs +++ b/crates/jolt-riscv/src/profile.rs @@ -36,6 +36,7 @@ pub enum InlineExtension { Secp256k1, Grumpkin, P256, + Poseidon2Goldilocks, } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -84,6 +85,7 @@ pub const RV64IMAC_JOLT_ALL_INLINES: JoltInstructionProfile = JoltInstructionPro InlineExtension::Secp256k1, InlineExtension::Grumpkin, 
InlineExtension::P256, + InlineExtension::Poseidon2Goldilocks, ], }; @@ -254,6 +256,7 @@ const fn inline_extension_code(extension: InlineExtension) -> u8 { InlineExtension::Secp256k1 => 5, InlineExtension::Grumpkin => 6, InlineExtension::P256 => 7, + InlineExtension::Poseidon2Goldilocks => 8, } } diff --git a/jolt-inlines/poseidon2-goldilocks/Cargo.toml b/jolt-inlines/poseidon2-goldilocks/Cargo.toml new file mode 100644 index 0000000000..1065debcb3 --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "jolt-inlines-poseidon2-goldilocks" +version = "0.1.0" +edition = "2021" +description = "Poseidon2-Goldilocks inline implementation for Jolt VM" +license = "MIT" +homepage = "https://github.com/a16z/jolt/README.md" +repository = "https://github.com/a16z/jolt" + +[features] +default = [] +host = ["jolt-inlines-sdk/host"] + +[dependencies] +jolt-inlines-sdk = { workspace = true, optional = true } + +[dev-dependencies] +p3-field.workspace = true +p3-goldilocks.workspace = true +p3-poseidon2.workspace = true +p3-symmetric.workspace = true +tracer = { workspace = true, features = ["std", "test-utils"] } diff --git a/jolt-inlines/poseidon2-goldilocks/src/exec.rs b/jolt-inlines/poseidon2-goldilocks/src/exec.rs new file mode 100644 index 0000000000..d57980e7e5 --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/exec.rs @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Host-side reference implementation of the Goldilocks Poseidon2 +//! permutation. +//! +//! This is the ground-truth comparator the sequence builder must +//! match byte-for-byte. The tests compare it against Plonky3's +//! canonical `Poseidon2Goldilocks<8>` implementation. + +use crate::{Poseidon2GoldilocksState, GOLDILOCKS_MODULUS, STATE_WIDTH}; + +// Re-exported from `crate` root so existing callers can keep their +// `crate::exec::POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8` and +// `crate::exec::add_mod` imports. 
The canonical definitions live in +// `lib.rs` because the SDK guest path needs them in `no_std` builds. +pub use crate::{add_mod, POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8}; + +/// Diagonal matrix for the internal-round diffusion step. +#[rustfmt::skip] +pub const POSEIDON2_INTERNAL_DIAG: [u64; STATE_WIDTH] = [ + 0xfffffffeffffffff, + 1, + 2, + 0x7fffffff80000001, + 3, + 0x7fffffff80000000, + 0xfffffffefffffffe, + 0xfffffffefffffffd, +]; + +#[inline] +pub fn mul_mod(a: u64, b: u64) -> u64 { + let res = (a as u128) * (b as u128); + let lo = res as u64; + let hi = (res >> 64) as u64; + let hi_hi = hi >> 32; + let hi_lo = hi as u32 as u64; + + // `add_term` is `lo + (hi_lo << 32)`. This sum can exceed 2^64. + // The naive wrapping_add loses 2^64 worth of magnitude in that + // case — and since 2^64 ≡ (2^32 - 1) mod P, the result is short + // by (2^32 - 1) mod P when the overflow happens. Detect and + // compensate. + let (add_term, add_overflow) = lo.overflowing_add(hi_lo << 32); + let sub_term = hi_lo + hi_hi; + + let mut r = add_term.wrapping_sub(sub_term); + if add_term < sub_term { + r = r.wrapping_add(GOLDILOCKS_MODULUS); + } + + if add_overflow { + // Add (2^32 - 1) to recover the lost magnitude. If THIS add + // overflows u64, the wrap is equivalent to subtracting another + // 2^64 ≡ (2^32 - 1) mod P from the result — so we add + // (2^32 - 1) one more time. + let (r1, wrapped) = r.overflowing_add(0xFFFFFFFF); + r = r1; + if wrapped { + r = r.wrapping_add(0xFFFFFFFF); + } + } + + while r >= GOLDILOCKS_MODULUS { + r -= GOLDILOCKS_MODULUS; + } + r +} + +/// S-box: `x^7` over Goldilocks. Computed as `x^4 * x^3` (4 mults). +#[inline] +pub fn sbox(x: u64) -> u64 { + let x2 = mul_mod(x, x); + let x4 = mul_mod(x2, x2); + let x3 = mul_mod(x2, x); + mul_mod(x4, x3) +} + +/// External MDS layer: 8-wide matrix multiply via two m4 sub-blocks +/// plus the cross-mixing step.
+pub fn external_mds(state: &mut [u64; STATE_WIDTH]) { + fn m4(s: &mut [u64]) { + let (a, b, c, d) = (s[0], s[1], s[2], s[3]); + let sum = add_mod(add_mod(a, b), add_mod(c, d)); + s[0] = add_mod(sum, add_mod(a, add_mod(b, b))); + s[1] = add_mod(sum, add_mod(b, add_mod(c, c))); + s[2] = add_mod(sum, add_mod(c, add_mod(d, d))); + s[3] = add_mod(sum, add_mod(d, add_mod(a, a))); + } + let mut left = [state[0], state[1], state[2], state[3]]; + let mut right = [state[4], state[5], state[6], state[7]]; + m4(&mut left); + m4(&mut right); + for i in 0..4 { + state[i] = add_mod(left[i], right[i]); + state[i + 4] = add_mod(left[i], right[i]); + } + for i in 0..4 { + state[i] = add_mod(state[i], left[i]); + state[i + 4] = add_mod(state[i + 4], right[i]); + } +} + +/// Internal-round diffusion: multiply by diagonal, then add row-sum +/// to every coordinate. +pub fn internal_diffusion(state: &mut [u64; STATE_WIDTH]) { + let mut sum = 0; + for &s in state.iter() { + sum = add_mod(sum, s); + } + for i in 0..STATE_WIDTH { + state[i] = add_mod(mul_mod(POSEIDON2_INTERNAL_DIAG[i], state[i]), sum); + } +} + +/// The Poseidon2 permutation in full. 
+pub fn execute_poseidon2_permutation(state: &mut Poseidon2GoldilocksState) { + let mut rc_idx = 0; + + external_mds(state); + + // 4 external initial rounds + for _ in 0..4 { + for s in state.iter_mut() { + *s = add_mod(*s, POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[rc_idx]); + rc_idx += 1; + } + for s in state.iter_mut() { + *s = sbox(*s); + } + external_mds(state); + } + + // 22 internal rounds + for _ in 0..22 { + state[0] = add_mod(state[0], POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[rc_idx]); + rc_idx += 1; + state[0] = sbox(state[0]); + internal_diffusion(state); + } + + // 4 external final rounds + for _ in 0..4 { + for s in state.iter_mut() { + *s = add_mod(*s, POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[rc_idx]); + rc_idx += 1; + } + for s in state.iter_mut() { + *s = sbox(*s); + } + external_mds(state); + } +} diff --git a/jolt-inlines/poseidon2-goldilocks/src/host.rs b/jolt-inlines/poseidon2-goldilocks/src/host.rs new file mode 100644 index 0000000000..54d681d4af --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/host.rs @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Host-side registration of the Poseidon2-Goldilocks inline with +//! the Jolt prover/tracer. +//! +//! The `register_inlines!` macro generates the dispatcher that maps +//! our `(INLINE_OPCODE, FUNCT3, FUNCT7)` triple to +//! `Poseidon2GoldilocksPermutation::build_sequence`. + +use crate::sequence_builder::Poseidon2GoldilocksPermutation; + +jolt_inlines_sdk::register_inlines! { + trace_file: "poseidon2_goldilocks_trace.joltinline", + extension: jolt_inlines_sdk::host::InlineExtension::Poseidon2Goldilocks, + ops: [Poseidon2GoldilocksPermutation], +} diff --git a/jolt-inlines/poseidon2-goldilocks/src/lib.rs b/jolt-inlines/poseidon2-goldilocks/src/lib.rs new file mode 100644 index 0000000000..ea75fb1e0a --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/lib.rs @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Goldilocks Poseidon2 inline for the Jolt zkVM. +//! 
+//! This crate implements the canonical Plonky3-compatible 8-wide +//! Poseidon2 permutation over the Goldilocks field. The guest API emits +//! one custom inline instruction that permutes an eight-limb state +//! in-place; the host registration expands that instruction into a +//! deterministic virtual-instruction sequence for Jolt tracing. +//! +//! ## Status +//! +//! - SDK module (`sdk`) emits the custom opcode in RISC-V guest builds +//! and falls back to the host reference implementation when the +//! `host` feature is enabled. +//! - Host reference (`exec`) is a standalone Goldilocks/Poseidon2 +//! implementation used by tests and host builds. +//! - Sequence builder (`sequence_builder`) emits the inline expansion +//! used by the Jolt tracer/prover. +//! +//! ## Inline opcode encoding +//! +//! Custom RISC-V instruction: +//! +//! ```text +//! .insn r INLINE_OPCODE, POSEIDON2_GOLDILOCKS_FUNCT3, POSEIDON2_GOLDILOCKS_FUNCT7, x0, rs1, rs2 +//! ``` +//! +//! where `rs1` points to a 64-byte (8 × u64) state buffer that is +//! permuted in-place and `rs2` points to the 86-entry round-constant table. +//! +//! `INLINE_OPCODE` (0x0B) is shared with the upstream +//! `jolt-inlines-*` crates. The funct3/funct7 pair is reserved for +//! Poseidon2-Goldilocks. If/when this crate moves upstream into +//! `jolt-inlines-poseidon2-goldilocks`, the encoding stays stable. + +#![cfg_attr(not(feature = "host"), no_std)] + +/// Shared custom inline opcode space. Same value used by all +/// `jolt-inlines-*` crates upstream. +pub const INLINE_OPCODE: u32 = 0x0B; + +/// `funct3` for the Goldilocks Poseidon2 permutation opcode. +/// +pub const POSEIDON2_GOLDILOCKS_FUNCT3: u32 = 0x01; + +/// `funct7` for the Goldilocks Poseidon2 permutation opcode. +pub const POSEIDON2_GOLDILOCKS_FUNCT7: u32 = 0x02; + +/// Human-readable inline name. Used in trace-file headers and +/// upstream registration. +pub const POSEIDON2_GOLDILOCKS_NAME: &str = "POSEIDON2_GOLDILOCKS_INLINE"; + +/// State width for our Poseidon2 instance.
Hard-coded to 8; v0 is +/// not generic over width. +pub const STATE_WIDTH: usize = 8; + +/// Convenience: an 8-element Goldilocks state. +pub type Poseidon2GoldilocksState = [u64; STATE_WIDTH]; + +/// Goldilocks field modulus `p = 2^64 - 2^32 + 1`. +pub const GOLDILOCKS_MODULUS: u64 = 0xFFFF_FFFF_0000_0001; + +/// Goldilocks field modular addition. +/// +/// Lives in `lib.rs` (not `exec.rs`) because the SDK's +/// `poseidon2_hash_pair` absorbs inputs via `add_mod` in BOTH host and +/// no_std/guest builds. The guest path can't see `exec` (host-only). +#[inline] +pub fn add_mod(a: u64, b: u64) -> u64 { + let (mut sum, overflow) = a.overflowing_add(b); + if overflow { + sum = sum.wrapping_sub(GOLDILOCKS_MODULUS); + } + if sum >= GOLDILOCKS_MODULUS { + sum -= GOLDILOCKS_MODULUS; + } + sum +} + +/// 86 Poseidon2 round constants for the Goldilocks 8-wide instance. +/// +/// Layout: 32 external initial (4 rounds × 8 elements) + 22 internal +/// (state[0] only) + 32 external final (4 rounds × 8 elements). +/// +/// Defined as `pub static` (not `pub const`) so it has a stable address +/// that the SDK's RISC-V guest asm can reference via `sym ...` and pass +/// as `rs2` to the inline opcode. The sequence builder reads round +/// constants from `rs2`; passing `x0` (the previous default) silently +/// reads from address 0 in the guest, which is the rs2-wiring bug +/// covered by the emulator regression tests. 
+#[rustfmt::skip] +pub static POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8: [u64; 86] = [ + // External initial: 4 rounds × 8 elements + 0xdd5743e7f2a5a5d9, 0xcb3a864e58ada44b, 0xffa2449ed32f8cdc, 0x42025f65d6bd13ee, + 0x7889175e25506323, 0x34b98bb03d24b737, 0xbdcc535ecc4faa2a, 0x5b20ad869fc0d033, + 0xf1dda5b9259dfcb4, 0x27515210be112d59, 0x4227d1718c766c3f, 0x26d333161a5bd794, + 0x49b938957bf4b026, 0x4a56b5938b213669, 0x1120426b48c8353d, 0x6b323c3f10a56cad, + 0xce57d6245ddca6b2, 0xb1fc8d402bba1eb1, 0xb5c5096ca959bd04, 0x6db55cd306d31f7f, + 0xc49d293a81cb9641, 0x1ce55a4fe979719f, 0xa92e60a9d178a4d1, 0x002cc64973bcfd8c, + 0xcea721cce82fb11b, 0xe5b55eb8098ece81, 0x4e30525c6f1ddd66, 0x43c6702827070987, + 0xaca68430a7b5762a, 0x3674238634df9c93, 0x88cee1c825e33433, 0xde99ae8d74b57176, + // Internal: 22 scalars (state[0] only) + 0x488897d85ff51f56, 0x1140737ccb162218, 0xa7eeb9215866ed35, 0x9bd2976fee49fcc9, + 0xc0c8f0de580a3fcc, 0x4fb2dae6ee8fc793, 0x343a89f35f37395b, 0x223b525a77ca72c8, + 0x56ccb62574aaa918, 0xc4d507d8027af9ed, 0xa080673cf0b7e95c, 0xf0184884eb70dcf8, + 0x044f10b0cb3d5c69, 0xe9e3f7993938f186, 0x1b761c80e772f459, 0x606cec607a1b5fac, + 0x14a0c2e1d45f03cd, 0x4eace8855398574f, 0xf905ca7103eff3e6, 0xf8c8f8d20862c059, + 0xb524fe8bdd678e5a, 0xfbb7865901a1ec41, + // External final: 4 rounds × 8 elements + 0x014ef1197d341346, 0x9725e20825d07394, 0xfdb25aef2c5bae3b, 0xbe5402dc598c971e, + 0x93a5711f04cdca3d, 0xc45a9a5b2f8fb97b, 0xfe8946a924933545, 0x2af997a27369091c, + 0xaa62c88e0b294011, 0x058eb9d810ce9f74, 0xb3cb23eced349ae4, 0xa3648177a77b4a84, + 0x43153d905992d95d, 0xf4e2a97cda44aa4b, 0x5baa2702b908682f, 0x082923bdf4f750d1, + 0x98ae09a325893803, 0xf8a6475077968838, 0xceb0735bf00b2c5f, 0x0a1a5d953888e072, + 0x2fcb190489f94475, 0xb5be06270dec69fc, 0x739cb934b09acf8b, 0x537750b75ec7f25b, + 0xe9dd318bae1f3961, 0xf7462137299efe1a, 0xb1f6b8eee9adb940, 0xbdebcc8a809dfe6b, + 0x40fc1f791b178113, 0x3ac1c3362d014864, 0x9a016184bdb8aeba, 0x95f2394459fbc25e, +]; + +pub mod sdk; 
+pub use sdk::*; + +#[cfg(feature = "host")] +pub mod exec; + +#[cfg(feature = "host")] +pub mod sequence_builder; + +#[cfg(feature = "host")] +pub mod host; + +#[cfg(all(test, feature = "host"))] +mod tests; diff --git a/jolt-inlines/poseidon2-goldilocks/src/sdk.rs b/jolt-inlines/poseidon2-goldilocks/src/sdk.rs new file mode 100644 index 0000000000..8c0587d64f --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/sdk.rs @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Public Poseidon2-Goldilocks API for guests and hosts. +//! +//! In a RISC-V guest build (no_std), `poseidon2_permute` emits the +//! custom inline opcode and the Jolt prover dispatches it to the +//! `sequence_builder`. +//! +//! In a host build (feature = "host"), `poseidon2_permute` calls the +//! reference implementation in `exec.rs`. +//! +//! On non-RISC-V non-host targets (rare — basically tooling builds +//! that don't enable the `host` feature) the function panics with a +//! clear message. + +use crate::Poseidon2GoldilocksState; + +/// Permute an 8-element Goldilocks state in place. +/// +/// This function is safe to call: the `&mut` array reference already +/// guarantees `STATE_WIDTH` (= 8) contiguous, writable, properly aligned +/// u64 values for the duration of the call. On targets that are neither +/// RISC-V nor built with the `host` feature, it panics instead of +/// permuting. +#[inline(always)] +pub fn poseidon2_permute(state: &mut Poseidon2GoldilocksState) { + unsafe { + poseidon2_permute_inner(state.as_mut_ptr()); + } +} + +// ──────────────────────────────────────────────────────────────────────── +// Custom inline opcode dispatch +// ──────────────────────────────────────────────────────────────────────── + +/// RISC-V guest path: emit the custom inline opcode. +/// +/// Memory contract enforced by the sequence builder: +/// - `rs1` → pointer to the 8-element state (read+written in place). +/// - `rs2` → pointer to the 86-element round-constants table +/// (read-only; the sequence builder loads `RC[i]` from +/// `rs2 + i*8`).
+/// +/// We load `rs2` from the static `POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8` +/// via the `la` pseudo-instruction (resolved to `lui+addi` or +/// `auipc+addi` at link time). If `rs2` is left as `x0` (the v0.3 bug), +/// the inline silently reads round constants from address 0 and +/// produces wrong hashes with no visible error. +/// +/// # Safety +/// +/// `state` must be a valid, 8-byte-aligned pointer to 64 bytes of +/// readable+writable memory. +#[cfg(all( + not(feature = "host"), + any(target_arch = "riscv32", target_arch = "riscv64") +))] +#[inline(always)] +unsafe fn poseidon2_permute_inner(state: *mut u64) { + use crate::{ + INLINE_OPCODE, POSEIDON2_GOLDILOCKS_FUNCT3, POSEIDON2_GOLDILOCKS_FUNCT7, + POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8, + }; + core::arch::asm!( + "la {rs2}, {rc}", + ".insn r {opcode}, {funct3}, {funct7}, x0, {rs1}, {rs2}", + opcode = const INLINE_OPCODE, + funct3 = const POSEIDON2_GOLDILOCKS_FUNCT3, + funct7 = const POSEIDON2_GOLDILOCKS_FUNCT7, + rs1 = in(reg) state, + rs2 = out(reg) _, + rc = sym POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8, + options(nostack) + ); +} + +/// Host path: dispatch to the reference implementation in `exec.rs`. +#[cfg(feature = "host")] +#[inline(always)] +unsafe fn poseidon2_permute_inner(state: *mut u64) { + let slice = core::slice::from_raw_parts_mut(state, crate::STATE_WIDTH); + let arr: &mut [u64; 8] = slice + .try_into() + .expect("Poseidon2 state must be exactly 8 u64 elements"); + crate::exec::execute_poseidon2_permutation(arr); +} + +/// Non-RISC-V, non-host targets: fail loudly. +#[cfg(all( + not(feature = "host"), + not(any(target_arch = "riscv32", target_arch = "riscv64")) +))] +#[inline(always)] +unsafe fn poseidon2_permute_inner(_state: *mut u64) { + panic!( + "poseidon2_permute requires either the `host` feature or a \ + RISC-V target. Add `features = [\"host\"]` for tooling builds." 
+ ); +} diff --git a/jolt-inlines/poseidon2-goldilocks/src/sequence_builder.rs b/jolt-inlines/poseidon2-goldilocks/src/sequence_builder.rs new file mode 100644 index 0000000000..b68d38f974 --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/sequence_builder.rs @@ -0,0 +1,720 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Sequence builder for the Goldilocks Poseidon2 inline. +//! +//! Emits a flat sequence of virtual RISC-V instructions that permutes +//! an 8-element Goldilocks state in place. Operates over ~35 virtual +//! registers so the entire permutation runs without spilling state to +//! memory between rounds. +//! +//! Memory layout: +//! - `rs1`: pointer to the 8-element state (64 bytes), permuted in place +//! - `rs2`: pointer to the 86-element round-constant table (688 bytes) +//! +use core::array; + +use jolt_inlines_sdk::host::{ + instruction::{ + add::ADD, addi::ADDI, and::AND, ld::LD, mul::MUL, mulhu::MULHU, sd::SD, slli::SLLI, + sltu::SLTU, srli::SRLI, sub::SUB, + }, + FormatInline, InlineOp, InstrAssembler, Instruction, VirtualRegisterGuard, +}; + +use crate::exec::POSEIDON2_INTERNAL_DIAG; +use crate::STATE_WIDTH; + +/// Virtual-register count. +/// +/// Layout: +/// - `vr[0..8]` — state `S[0..7]` (live across all rounds) +/// - `vr[8..16]` — temp state `T[0..7]` for MDS reorganization +/// - `vr[16]` — P (Goldilocks modulus) loaded once +/// - `vr[17..24]` — mul_mod scratch: lo, hi, hi_lo, hi_hi, +/// shifted, add_term, sub_term +/// - `vr[24..28]` — add_mod / final-reduction scratch (4 regs) +/// - `vr[28..32]` — generic scratch (round constants, diff sums, etc.) 
+/// - `vr[32]` — internal-diffusion row-sum accumulator +/// - `vr[33..35]` — extra scratch for shifts and intermediates +/// - `vr[35]` — mask_low_32 (constant 2^32 - 1, loaded once) +/// - `vr[36]` — mm_add_ovf (mul_mod add_term overflow flag) +pub const NEEDED_REGISTERS: u8 = 37; + +const P_REG: usize = 16; + +const STATE_LEN: usize = STATE_WIDTH; + +/// Inline operation tag, registered with the Jolt prover via +/// [`crate::host`]. +pub struct Poseidon2GoldilocksPermutation; + +impl InlineOp for Poseidon2GoldilocksPermutation { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = crate::POSEIDON2_GOLDILOCKS_FUNCT3; + const FUNCT7: u32 = crate::POSEIDON2_GOLDILOCKS_FUNCT7; + const NAME: &'static str = crate::POSEIDON2_GOLDILOCKS_NAME; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + Poseidon2GoldilocksSequenceBuilder::new(asm, operands).build() + } +} + +pub(crate) struct Poseidon2GoldilocksSequenceBuilder { + asm: InstrAssembler, + vr: [VirtualRegisterGuard; NEEDED_REGISTERS as usize], + operands: FormatInline, +} + +impl Poseidon2GoldilocksSequenceBuilder { + fn new(asm: InstrAssembler, operands: FormatInline) -> Self { + let vr = array::from_fn(|_| asm.allocator.allocate_for_inline()); + Poseidon2GoldilocksSequenceBuilder { asm, vr, operands } + } + + // ── Register accessors ──────────────────────────────────────────── + + fn s(&self, i: usize) -> u8 { + *self.vr[i] + } + fn t(&self, i: usize) -> u8 { + *self.vr[STATE_LEN + i] + } + fn p_reg(&self) -> u8 { + *self.vr[P_REG] + } + // Named scratch registers for mul_mod + fn mm_lo(&self) -> u8 { + *self.vr[17] + } + fn mm_hi(&self) -> u8 { + *self.vr[18] + } + fn mm_hi_lo(&self) -> u8 { + *self.vr[19] + } + fn mm_hi_hi(&self) -> u8 { + *self.vr[20] + } + fn mm_shifted(&self) -> u8 { + *self.vr[21] + } + fn mm_add_term(&self) -> u8 { + *self.vr[22] + } + fn mm_sub_term(&self) -> u8 { + *self.vr[23] + } + // add_mod / final-reduction scratch + fn am_ovf(&self) -> u8 { + 
*self.vr[24] + } + fn am_corr(&self) -> u8 { + *self.vr[25] + } + fn am_less(&self) -> u8 { + *self.vr[26] + } + fn am_tmp(&self) -> u8 { + *self.vr[27] + } + // Generic scratch + fn sc_rc(&self) -> u8 { + *self.vr[28] + } + fn sc_diag(&self) -> u8 { + *self.vr[29] + } + fn sc_a(&self) -> u8 { + *self.vr[30] + } + fn sc_b(&self) -> u8 { + *self.vr[31] + } + fn sum_reg(&self) -> u8 { + *self.vr[32] + } + fn sc_c(&self) -> u8 { + *self.vr[33] + } + fn sc_d(&self) -> u8 { + *self.vr[34] + } + fn mask_low32(&self) -> u8 { + *self.vr[35] + } + fn mm_add_ovf(&self) -> u8 { + *self.vr[36] + } + + // ── Top-level build ─────────────────────────────────────────────── + + fn build(mut self) -> Vec { + // 1. Load P into p_reg and 2^32 - 1 into mask_low32 (2 instructions). + self.load_p(); + + // 2. Load state from memory into vr[0..8]. + self.load_state(); + + // 3. Initial external MDS. + self.external_mds(); + + // 4. 4 external initial rounds. + let mut rc_idx: usize = 0; + for _ in 0..4 { + self.add_round_constants_full(rc_idx); + rc_idx += STATE_LEN; + self.sbox_full(); + self.external_mds(); + } + + // 5. 22 internal rounds. + for _ in 0..22 { + self.add_round_constant_partial(rc_idx); + rc_idx += 1; + self.sbox_state_0(); + self.internal_diffusion(); + } + + // 6. 4 external final rounds. + for _ in 0..4 { + self.add_round_constants_full(rc_idx); + rc_idx += STATE_LEN; + self.sbox_full(); + self.external_mds(); + } + debug_assert_eq!(rc_idx, 86, "round-constant index off"); + + // 7. Store state back to memory. + self.store_state(); + + drop(self.vr); + self.asm.finalize_inline() + } + + // ── Constant loading ────────────────────────────────────────────── + + /// Load P = 2^64 - 2^32 + 1 into p_reg, and (2^32 - 1) into + /// mask_low32. Both are constants used throughout the permutation. + /// + /// The Jolt inline assembler's `emit_i::` accepts a full + /// u64 immediate (it stores it raw and the emulator does + /// `x[rs1].wrapping_add(imm as i64)`).
So a single ADDI with rs1=x0 + /// loads any u64 value. + fn load_p(&mut self) { + let p = self.p_reg(); + let mask = self.mask_low32(); + self.asm.emit_i::(p, 0, crate::GOLDILOCKS_MODULUS); + self.asm.emit_i::(mask, 0, 0xFFFF_FFFF); + } + + // ── State load / store ──────────────────────────────────────────── + + fn load_state(&mut self) { + for i in 0..STATE_LEN { + self.asm + .emit_ld::(self.s(i), self.operands.rs1, (i * 8) as i64); + } + } + + fn store_state(&mut self) { + for i in 0..STATE_LEN { + self.asm + .emit_s::(self.operands.rs1, self.s(i), (i * 8) as i64); + } + } + + // ── Field arithmetic primitives ─────────────────────────────────── + + /// dst = (a + b) mod P. + /// + /// 11 instructions. Note: `dst` may alias `a` or `b` — we snapshot + /// `a` first to avoid clobbering it before the overflow check. + fn add_mod_into(&mut self, dst: u8, a: u8, b: u8) { + let p = self.p_reg(); + let ovf = self.am_ovf(); + let corr = self.am_corr(); + let less = self.am_less(); + let tmp = self.am_tmp(); + + // 0. Snapshot `a` into tmp to survive the dst-write aliasing. + // Common call shape is `add_mod_into(s, s, rc)` where dst == a. + self.asm.emit_r::(tmp, a, 0); + // 1. dst = a + b (wrapping) + self.asm.emit_r::(dst, a, b); + // 2. ovf = (dst < tmp) ? 1 : 0 -- overflow detection using snapshot + self.asm.emit_r::(ovf, dst, tmp); + // 3-4. corr = ovf * (2^32 - 1) = (ovf << 32) - ovf + self.asm.emit_i::(corr, ovf, 32); + self.asm.emit_r::(corr, corr, ovf); + // 5. dst = dst + corr -- if no overflow, corr = 0 + self.asm.emit_r::(dst, dst, corr); + // 6-10. Final reduction: if dst >= P, dst -= P. + self.asm.emit_r::(less, dst, p); + self.asm.emit_r::(tmp, dst, p); + self.asm.emit_r::(corr, 0, less); // corr = -less = 0 or all-ones + self.asm.emit_r::(corr, corr, p); // corr = P if less else 0 + self.asm.emit_r::(dst, tmp, corr); + } + + /// dst = (a * b) mod P using the Goldilocks reduction trick. + /// + /// Mirrors the corrected `exec::mul_mod`. ~25 instructions. 
+ /// + /// Critical detail: when `lo + (hi_lo << 32)` overflows 2^64 + /// during the `add_term` step, naive wrapping loses 2^64 worth of + /// magnitude. Since 2^64 ≡ (2^32 - 1) mod P, the result is short + /// by (2^32 - 1) mod P. We detect this overflow and add the + /// correction, handling the double-wrap case (where the + /// correction itself overflows). + fn mul_mod_into(&mut self, dst: u8, a: u8, b: u8) { + let p = self.p_reg(); + let mask = self.mask_low32(); + let lo = self.mm_lo(); + let hi = self.mm_hi(); + let hi_lo = self.mm_hi_lo(); + let hi_hi = self.mm_hi_hi(); + let shifted = self.mm_shifted(); + let add_term = self.mm_add_term(); + let sub_term = self.mm_sub_term(); + let add_ovf = self.mm_add_ovf(); + let sub_ovf = self.am_ovf(); + let corr = self.am_corr(); + let less = self.am_less(); + let tmp = self.am_tmp(); + + // 1. lo = low 64 bits of a*b + self.asm.emit_r::(lo, a, b); + // 2. hi = high 64 bits of a*b + self.asm.emit_r::(hi, a, b); + // 3. hi_hi = hi >> 32 + self.asm.emit_i::(hi_hi, hi, 32); + // 4-5. hi_lo = (hi << 32) >> 32 -- zero-extends the low 32 bits + self.asm.emit_i::(hi_lo, hi, 32); + self.asm.emit_i::(hi_lo, hi_lo, 32); + // 6. shifted = hi_lo << 32 + self.asm.emit_i::(shifted, hi_lo, 32); + // 7. add_term = lo + shifted (wrapping) + self.asm.emit_r::(add_term, lo, shifted); + // 8. add_ovf = (add_term < lo) ? 1 : 0 -- detect 2^64 overflow + self.asm.emit_r::(add_ovf, add_term, lo); + // 9. sub_term = hi_lo + hi_hi + self.asm.emit_r::(sub_term, hi_lo, hi_hi); + // 10. r = add_term - sub_term (wrapping) + self.asm.emit_r::(dst, add_term, sub_term); + // 11. sub_ovf = (add_term < sub_term) ? 1 : 0 + self.asm.emit_r::(sub_ovf, add_term, sub_term); + // 12-14. If underflow, add P. Compute corr = (0 - sub_ovf) AND P; r += corr. + self.asm.emit_r::(corr, 0, sub_ovf); + self.asm.emit_r::(corr, corr, p); + self.asm.emit_r::(dst, dst, corr); + // 15-21. add_term-overflow correction: if add_ovf, add + // (2^32 - 1). 
Snapshot dst, do the conditional add, detect + // wrap, conditional second add of (2^32 - 1). + self.asm.emit_r::(corr, 0, add_ovf); + self.asm.emit_r::(corr, corr, mask); // corr = (2^32-1) if add_ovf else 0 + self.asm.emit_r::(tmp, dst, 0); // snapshot dst for wrap detection + self.asm.emit_r::(dst, dst, corr); // dst += corr + self.asm.emit_r::(less, dst, tmp); // less = 1 if wrap (dst < snapshot) + self.asm.emit_r::(corr, 0, less); + self.asm.emit_r::(corr, corr, mask); + self.asm.emit_r::(dst, dst, corr); // if wrap, add another (2^32-1) + // 22-26. Final reduction: if dst >= P, dst -= P. + // After all corrections, dst < 2^64 < 2P, so one sub + // suffices. + self.asm.emit_r::(less, dst, p); + self.asm.emit_r::(tmp, dst, p); + self.asm.emit_r::(corr, 0, less); + self.asm.emit_r::(corr, corr, p); + self.asm.emit_r::(dst, tmp, corr); + } + + /// dst = x^7 over Goldilocks. 4 mul_mod calls. + /// + /// Now that `mul_mod_into` correctly handles `add_term` overflow, + /// the natural `x^7 = x^6 * x` decomposition works correctly. + /// (Previously this triggered a bug in `mul_mod_into`; the + /// workaround was the `x^7 = x^4 * x^3` decomposition. Fixed in + /// the v1 mul_mod correction.) + fn sbox_into(&mut self, dst: u8, x: u8) { + let x2 = self.sc_a(); + let x4 = self.sc_b(); + let x6 = self.sc_c(); + + // x2 = x * x + self.mul_mod_into(x2, x, x); + // x4 = x2 * x2 + self.mul_mod_into(x4, x2, x2); + // x6 = x4 * x2 + self.mul_mod_into(x6, x4, x2); + // dst = x6 * x = x^7 + self.mul_mod_into(dst, x6, x); + } + + // ── Round-constant loading ──────────────────────────────────────── + + /// Load round constant at index `idx` into sc_rc from rs2 base. + /// 1 instruction. + fn load_rc(&mut self, idx: usize) { + let dst = self.sc_rc(); + self.asm + .emit_ld::(dst, self.operands.rs2, (idx * 8) as i64); + } + + /// Load the i'th diagonal constant into sc_diag. + /// Uses inline u64 immediate construction since DIAG only has 8 entries. 
+ fn load_diag(&mut self, i: usize) { + let value = POSEIDON2_INTERNAL_DIAG[i]; + let dst = self.sc_diag(); + self.load_u64_immediate(dst, value); + } + + /// Load a 64-bit constant into `dst`. Single instruction: the Jolt + /// inline `emit_i::` accepts a full u64 immediate. + fn load_u64_immediate(&mut self, dst: u8, value: u64) { + self.asm.emit_i::(dst, 0, value); + } + + /// For each i in 0..8: S[i] = (S[i] + RC[rc_base + i]) mod P. + fn add_round_constants_full(&mut self, rc_base: usize) { + for i in 0..STATE_LEN { + self.load_rc(rc_base + i); + let s_i = self.s(i); + let rc = self.sc_rc(); + self.add_mod_into(s_i, s_i, rc); + } + } + + /// S[0] = (S[0] + RC[idx]) mod P. Used in internal rounds. + fn add_round_constant_partial(&mut self, idx: usize) { + self.load_rc(idx); + let s0 = self.s(0); + let rc = self.sc_rc(); + self.add_mod_into(s0, s0, rc); + } + + /// For each i in 0..8: S[i] = sbox(S[i]). + fn sbox_full(&mut self) { + for i in 0..STATE_LEN { + let s_i = self.s(i); + self.sbox_into(s_i, s_i); + } + } + + fn sbox_state_0(&mut self) { + let s0 = self.s(0); + self.sbox_into(s0, s0); + } + + // ── External MDS ────────────────────────────────────────────────── + + /// Apply the m4 sub-block to four registers in place. + /// + /// Reference (from exec.rs::external_mds): + /// ```text + /// let (a, b, c, d) = (s[0], s[1], s[2], s[3]); + /// let sum = add_mod(add_mod(a, b), add_mod(c, d)); + /// s[0] = add_mod(sum, add_mod(a, add_mod(b, b))); + /// s[1] = add_mod(sum, add_mod(b, add_mod(c, c))); + /// s[2] = add_mod(sum, add_mod(c, add_mod(d, d))); + /// s[3] = add_mod(sum, add_mod(d, add_mod(a, a))); + /// ``` + /// + /// We snapshot a,b,c,d into named scratch (sc_a/b/c/d) so we can + /// freely overwrite s[]. + fn m4_apply(&mut self, s: [u8; 4]) { + // Snapshot inputs. 
+ let a = self.sc_a(); + let b = self.sc_b(); + let c = self.sc_c(); + let d = self.sc_d(); + self.asm.emit_r::(a, s[0], 0); + self.asm.emit_r::(b, s[1], 0); + self.asm.emit_r::(c, s[2], 0); + self.asm.emit_r::(d, s[3], 0); + + // sum = (a+b) + (c+d), using a temp register (mm_lo is free here). + let ab = self.mm_lo(); + let cd = self.mm_hi(); + let sum = self.sum_reg(); + self.add_mod_into(ab, a, b); + self.add_mod_into(cd, c, d); + self.add_mod_into(sum, ab, cd); + + // s[0] = sum + a + 2b + let bb = self.mm_hi_lo(); + let a_plus_bb = self.mm_hi_hi(); + self.add_mod_into(bb, b, b); + self.add_mod_into(a_plus_bb, a, bb); + self.add_mod_into(s[0], sum, a_plus_bb); + + // s[1] = sum + b + 2c + let cc = self.mm_hi_lo(); + let b_plus_cc = self.mm_hi_hi(); + self.add_mod_into(cc, c, c); + self.add_mod_into(b_plus_cc, b, cc); + self.add_mod_into(s[1], sum, b_plus_cc); + + // s[2] = sum + c + 2d + let dd = self.mm_hi_lo(); + let c_plus_dd = self.mm_hi_hi(); + self.add_mod_into(dd, d, d); + self.add_mod_into(c_plus_dd, c, dd); + self.add_mod_into(s[2], sum, c_plus_dd); + + // s[3] = sum + d + 2a + let aa = self.mm_hi_lo(); + let d_plus_aa = self.mm_hi_hi(); + self.add_mod_into(aa, a, a); + self.add_mod_into(d_plus_aa, d, aa); + self.add_mod_into(s[3], sum, d_plus_aa); + } + + /// External MDS: two m4 sub-blocks (on left and right halves of the + /// state) then the cross-mix described in exec.rs::external_mds. + fn external_mds(&mut self) { + let left: [u8; 4] = [self.s(0), self.s(1), self.s(2), self.s(3)]; + let right: [u8; 4] = [self.s(4), self.s(5), self.s(6), self.s(7)]; + + self.m4_apply(left); + self.m4_apply(right); + + // After m4, left = transformed(S[0..4]), right = transformed(S[4..8]). 
+ // Reference: + // for i in 0..4 { state[i] = left[i] + right[i]; state[i+4] = left[i] + right[i]; } + // for i in 0..4 { state[i] = state[i] + left[i]; state[i+4] = state[i+4] + right[i]; } + // + // After these two passes: + // state[i] = (left[i] + right[i]) + left[i] = 2*left[i] + right[i] + // state[i+4] = (left[i] + right[i]) + right[i] = left[i] + 2*right[i] + // + // Implement directly: snapshot left[i] and right[i] before + // overwriting them. + for i in 0..4 { + // We can't freely snapshot here because left/right ARE the + // state registers. Use t[i] and t[i+4] as scratch. + let l = left[i]; // = S[i] + let r = right[i]; // = S[i+4] + let t_l = self.t(i); + let t_r = self.t(i + 4); + + // t_l = l, t_r = r + self.asm.emit_r::(t_l, l, 0); + self.asm.emit_r::(t_r, r, 0); + + // l = 2*t_l + t_r = (t_l + t_r) + t_l + let sum_lr = self.sc_a(); + self.add_mod_into(sum_lr, t_l, t_r); + self.add_mod_into(l, sum_lr, t_l); + + // r = t_l + 2*t_r = (t_l + t_r) + t_r + self.add_mod_into(r, sum_lr, t_r); + } + } + + // ── Internal diffusion ──────────────────────────────────────────── + + /// Compute row-sum into sum_reg, then state[i] = diag[i]*state[i] + sum. + fn internal_diffusion(&mut self) { + // 1. sum = S[0] + S[1] + ... + S[7] + let sum = self.sum_reg(); + // Start with sum = S[0]+S[1]. + let s0 = self.s(0); + let s1 = self.s(1); + self.add_mod_into(sum, s0, s1); + for i in 2..STATE_LEN { + let s_i = self.s(i); + self.add_mod_into(sum, sum, s_i); + } + + // 2. For each i in 0..8: S[i] = (diag[i] * S[i]) + sum. + for i in 0..STATE_LEN { + self.load_diag(i); + let diag = self.sc_diag(); + let s_i = self.s(i); + // S[i] = diag * S[i] + self.mul_mod_into(s_i, diag, s_i); + // S[i] = S[i] + sum + self.add_mod_into(s_i, s_i, sum); + } + } +} + +// Compile-time sanity: the constants we pull in MUST be the same shape +// the reference implementation uses. If these `_` bindings fail to +// type-check, the implementer has the wrong constants. 
+const _: [u64; STATE_WIDTH] = POSEIDON2_INTERNAL_DIAG; + +// ── Test-only helpers for sub-operation isolation ──────────────────── + +#[cfg(test)] +#[allow(dead_code)] +impl Poseidon2GoldilocksSequenceBuilder { + pub fn new_for_test(asm: InstrAssembler, operands: FormatInline) -> Self { + Self::new(asm, operands) + } + + pub fn test_load_p_and_state_and_add_rc_full(&mut self, rc_base: usize) { + self.load_p(); + self.load_state(); + self.add_round_constants_full(rc_base); + } + + pub fn test_load_p_state_addrc_sbox(&mut self, rc_base: usize) { + self.load_p(); + self.load_state(); + self.add_round_constants_full(rc_base); + self.sbox_full(); + } + + pub fn test_load_p_state_addrc_sbox_mds(&mut self, rc_base: usize) { + self.load_p(); + self.load_state(); + self.add_round_constants_full(rc_base); + self.sbox_full(); + self.external_mds(); + } + + pub fn test_load_p_state_mds_only(&mut self) { + self.load_p(); + self.load_state(); + self.external_mds(); + } + + pub fn test_load_p_state_intdiff_only(&mut self) { + self.load_p(); + self.load_state(); + self.internal_diffusion(); + } + + /// For each i in 0..8: S[i] = S[i] * S[i] mod P (single squaring). + pub fn test_load_p_state_square_only(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + self.mul_mod_into(s_i, s_i, s_i); + } + } + + /// For each i in 0..8: S[i] = sbox(S[i]) (without the surrounding RC add). + pub fn test_load_p_state_sbox_only(&mut self) { + self.load_p(); + self.load_state(); + self.sbox_full(); + } + + /// For each i: S[i] = x^4 mod P (just two squares). + pub fn test_load_p_state_x4_only(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + let scratch = self.sc_a(); + self.mul_mod_into(scratch, s_i, s_i); // x^2 + self.mul_mod_into(s_i, scratch, scratch); // x^4 + } + } + + /// For each i: S[i] = x^3 = x^2 * x. Tests asymmetric mul. 
+ pub fn test_load_p_state_x3(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + let scratch = self.sc_a(); + self.mul_mod_into(scratch, s_i, s_i); // x^2 + self.mul_mod_into(s_i, scratch, s_i); // x^3 = x^2 * x + } + } + + /// For each i: S[i] = x^6 = x^4 * x^2. + pub fn test_load_p_state_x6(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + let x2 = self.sc_a(); + let x4 = self.sc_b(); + self.mul_mod_into(x2, s_i, s_i); // x^2 + self.mul_mod_into(x4, x2, x2); // x^4 + self.mul_mod_into(s_i, x4, x2); // x^6 = x^4 * x^2 + } + } + + /// For each i: S[i] = S[i] * S[i+8] mod P. Used to stress + /// mul_mod_into with arbitrary (a, b) pairs supplied by the test + /// harness. The state's first 8 elements are a's, next 8 are b's. + /// Output is written back to the first 8 (overwriting a's). + /// + /// Note: this requires a 16-element state buffer at rs1. The + /// emulator test allocates 128 bytes for this. + pub fn test_load_p_state_mul_pairs(&mut self) { + self.load_p(); + // Load 16 u64s from rs1. + for i in 0..8 { + let s_i = self.s(i); + self.asm + .emit_ld::(s_i, self.operands.rs1, (i * 8) as i64); + } + // Load the b's into T[0..8] (vr[8..16]). + for i in 0..8 { + let t_i = self.t(i); + self.asm + .emit_ld::(t_i, self.operands.rs1, ((i + 8) * 8) as i64); + } + // For each i: S[i] = S[i] * T[i] mod P + for i in 0..8 { + let s_i = self.s(i); + let t_i = self.t(i); + self.mul_mod_into(s_i, s_i, t_i); + } + // Store the 8 results back to the first 8 slots. 
+ for i in 0..8 { + let s_i = self.s(i); + self.asm + .emit_s::(self.operands.rs1, s_i, (i * 8) as i64); + } + } + + /// For each i: S[i] = x * x (square), but written as + /// scratch = x; mul_mod(s_i, scratch, s_i). Tests dst-aliases-b + /// when a is a non-aliasing reg. + pub fn test_load_p_state_mul_dst_aliases_b(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + let scratch = self.sc_a(); + // scratch = s_i (just a copy via ADD scratch, s_i, 0) + self.asm.emit_r::(scratch, s_i, 0); + // s_i = scratch * s_i (dst aliases b) + self.mul_mod_into(s_i, scratch, s_i); + } + } + + /// For each i: S[i] = x^7 via INLINED sbox using the natural + /// `x^7 = x^6 * x` decomposition. Previously diagnosed as + /// triggering a `mul_mod_into` bug; now fixed by the add_term- + /// overflow correction. This test stays in place as a regression + /// guard. + pub fn test_load_p_state_sbox_inlined(&mut self) { + self.load_p(); + self.load_state(); + for i in 0..STATE_LEN { + let s_i = self.s(i); + let a = self.sc_a(); + let b = self.sc_b(); + let c = self.sc_c(); + self.mul_mod_into(a, s_i, s_i); // a = x^2 + self.mul_mod_into(b, a, a); // b = x^4 + self.mul_mod_into(c, b, a); // c = x^6 = x^4 * x^2 + self.mul_mod_into(s_i, c, s_i); // s_i = x^7 = x^6 * x + } + } + + pub fn test_store_and_finalize(mut self) -> Vec { + self.store_state(); + drop(self.vr); + self.asm.finalize_inline() + } +} diff --git a/jolt-inlines/poseidon2-goldilocks/src/tests.rs b/jolt-inlines/poseidon2-goldilocks/src/tests.rs new file mode 100644 index 0000000000..c3f31b5fcc --- /dev/null +++ b/jolt-inlines/poseidon2-goldilocks/src/tests.rs @@ -0,0 +1,521 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Parity and emulator tests for the Poseidon2-Goldilocks inline. 
+ +use crate::exec::{execute_poseidon2_permutation, POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8}; + +const P: u64 = crate::GOLDILOCKS_MODULUS; + +fn u128_mul_mod(a: u64, b: u64) -> u64 { + ((a as u128) * (b as u128) % (P as u128)) as u64 +} + +#[test] +fn exec_mul_mod_matches_u128_for_known_cases() { + let cases: &[(u64, u64)] = &[ + (0, 0), + (1, 1), + (P - 1, P - 1), + (P - 1, 1), + (2, 3), + (0xC0000000_00000000, 0xC0000000_00000000), + (0x80000000_00000001, 0x80000000_00000001), + (0xFFFFFFFF_FFFFFFFF_u64 % P, 0xFFFFFFFF_FFFFFFFF_u64 % P), + (12345, P - 1), + ]; + for &(a, b) in cases { + assert_eq!(crate::exec::mul_mod(a, b), u128_mul_mod(a, b)); + } +} + +#[test] +fn exec_mul_mod_matches_u128_random_stress() { + let mut seed: u64 = 0xDEADBEEFCAFEBABE; + let mut next = || { + seed = seed + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + seed + }; + for _ in 0..100_000 { + let a = next() % P; + let b = next() % P; + assert_eq!(crate::exec::mul_mod(a, b), u128_mul_mod(a, b)); + } +} + +mod plonky3_parity { + use p3_goldilocks::{default_goldilocks_poseidon2_8, Goldilocks}; + use p3_symmetric::Permutation; + + use super::{execute_poseidon2_permutation, P}; + + fn plonky3_permute(state_u64: [u64; 8]) -> [u64; 8] { + use p3_field::{PrimeCharacteristicRing, PrimeField64}; + + let perm = default_goldilocks_poseidon2_8(); + let mut state: [Goldilocks; 8] = state_u64.map(Goldilocks::from_u64); + perm.permute_mut(&mut state); + state.map(|f| f.as_canonical_u64()) + } + + fn plonky3_permute_generic(state_u64: [u64; 8]) -> [u64; 8] { + use p3_field::{PrimeCharacteristicRing, PrimeField64}; + use p3_goldilocks::{ + Poseidon2ExternalLayerGoldilocks, Poseidon2InternalLayerGoldilocks, + GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL, GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL, + GOLDILOCKS_POSEIDON2_RC_8_INTERNAL, + }; + use p3_poseidon2::{ExternalLayerConstants, Poseidon2}; + + let external = ExternalLayerConstants::::new( + 
GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL.to_vec(), + GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL.to_vec(), + ); + let internal = GOLDILOCKS_POSEIDON2_RC_8_INTERNAL.to_vec(); + let perm: Poseidon2< + Goldilocks, + Poseidon2ExternalLayerGoldilocks<8>, + Poseidon2InternalLayerGoldilocks, + 8, + 7, + > = Poseidon2::new(external, internal); + + let mut state: [Goldilocks; 8] = state_u64.map(Goldilocks::from_u64); + perm.permute_mut(&mut state); + state.map(|f| f.as_canonical_u64()) + } + + fn assert_matches_plonky3(initial: [u64; 8]) { + let mut ours = initial; + execute_poseidon2_permutation(&mut ours); + assert_eq!(ours, plonky3_permute(initial)); + assert_eq!(ours, plonky3_permute_generic(initial)); + } + + #[test] + fn permute_all_zero_matches_plonky3() { + assert_matches_plonky3([0u64; 8]); + } + + #[test] + fn permute_known_input_matches_plonky3() { + assert_matches_plonky3([1, 2, 3, 4, 5, 6, 7, 8]); + } + + #[test] + fn permute_large_values_match_plonky3() { + assert_matches_plonky3([P - 1, P - 2, P - 3, P - 4, P - 5, P - 6, P - 7, P - 8]); + } + + #[test] + fn permute_stress_matches_plonky3() { + let mut seed: u64 = 0x0BADC0DEF00DCAFE; + let mut next = || { + seed = seed + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + seed + }; + + for _ in 0..200 { + assert_matches_plonky3([ + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + ]); + } + } + + #[test] + fn round_constants_match_plonky3_layout() { + use p3_field::PrimeField64; + use p3_goldilocks::{ + GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL, GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL, + GOLDILOCKS_POSEIDON2_RC_8_INTERNAL, + }; + + let mut idx = 0; + for round_constants in &GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_INITIAL { + for constant in round_constants { + assert_eq!( + super::POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[idx], + constant.as_canonical_u64() + ); + idx += 1; + } + } + for constant in 
&GOLDILOCKS_POSEIDON2_RC_8_INTERNAL { + assert_eq!( + super::POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[idx], + constant.as_canonical_u64() + ); + idx += 1; + } + for round_constants in &GOLDILOCKS_POSEIDON2_RC_8_EXTERNAL_FINAL { + for constant in round_constants { + assert_eq!( + super::POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[idx], + constant.as_canonical_u64() + ); + idx += 1; + } + } + assert_eq!(idx, 86); + } + + #[test] + fn internal_diagonal_matches_plonky3() { + use crate::exec::POSEIDON2_INTERNAL_DIAG; + use p3_field::PrimeField64; + use p3_goldilocks::MATRIX_DIAG_8_GOLDILOCKS; + + for i in 0..8 { + assert_eq!( + POSEIDON2_INTERNAL_DIAG[i], + MATRIX_DIAG_8_GOLDILOCKS[i].as_canonical_u64() + ); + } + } +} + +#[test] +fn sequence_builder_emits_nonempty_instruction_list() { + use jolt_inlines_sdk::host::InlineOp; + use tracer::utils::inline_sequence_writer::SequenceInputs; + + let inputs = SequenceInputs::default(); + let instructions = crate::sequence_builder::Poseidon2GoldilocksPermutation::build_sequence( + (&inputs).into(), + (&inputs).into(), + ); + assert!(instructions.len() >= 100); +} + +#[test] +fn sequence_builder_emission_is_deterministic() { + use jolt_inlines_sdk::host::InlineOp; + use tracer::utils::inline_sequence_writer::SequenceInputs; + + let inputs1 = SequenceInputs::default(); + let inputs2 = SequenceInputs::default(); + let seq1 = crate::sequence_builder::Poseidon2GoldilocksPermutation::build_sequence( + (&inputs1).into(), + (&inputs1).into(), + ); + let seq2 = crate::sequence_builder::Poseidon2GoldilocksPermutation::build_sequence( + (&inputs2).into(), + (&inputs2).into(), + ); + assert_eq!(seq1.len(), seq2.len()); + let dbg1: Vec = seq1.iter().map(|i| format!("{i:?}")).collect(); + let dbg2: Vec = seq2.iter().map(|i| format!("{i:?}")).collect(); + assert_eq!(dbg1, dbg2); +} + +#[cfg(test)] +mod emulator { + use core::array; + + use super::*; + use jolt_inlines_sdk::host::{ + instruction::{ld::LD, sd::SD}, + FormatInline, InlineOp, InlineOp as 
InlineOpTrait, InstrAssembler, Instruction, + VirtualRegisterGuard, + }; + use tracer::utils::inline_test_harness::{InlineMemoryLayout, InlineTestHarness}; + + fn create_harness(output_size: usize) -> InlineTestHarness { + let layout = InlineMemoryLayout::single_input( + POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8.len() * 8, + output_size, + ); + InlineTestHarness::new(layout) + } + + fn execute_inline_permutation(initial_state: &[u64; 8]) -> [u64; 8] { + let mut harness = create_harness(64); + harness.setup_registers(); + harness.load_input64(&POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8); + harness.load_state64(initial_state); + let inline_instr = InlineTestHarness::create_default_instruction( + crate::INLINE_OPCODE, + crate::POSEIDON2_GOLDILOCKS_FUNCT3, + crate::POSEIDON2_GOLDILOCKS_FUNCT7, + ); + harness.execute_inline(inline_instr); + let result_vec = harness.read_output64(8); + let mut result = [0u64; 8]; + result.copy_from_slice(&result_vec); + result + } + + #[test] + fn emulator_permute_all_zero_matches_reference() { + let mut reference = [0u64; 8]; + execute_poseidon2_permutation(&mut reference); + assert_eq!(execute_inline_permutation(&[0u64; 8]), reference); + } + + #[test] + fn emulator_permute_known_input_matches_reference() { + let initial = [1u64, 2, 3, 4, 5, 6, 7, 8]; + let mut reference = initial; + execute_poseidon2_permutation(&mut reference); + assert_eq!(execute_inline_permutation(&initial), reference); + } + + #[test] + fn emulator_permute_stress_matches_reference() { + let mut seed: u64 = 0xFACEFEED0BADBEEF; + let mut next = || { + seed = seed + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + seed + }; + for _ in 0..50 { + let initial = [ + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + next() % P, + ]; + let mut reference = initial; + execute_poseidon2_permutation(&mut reference); + assert_eq!(execute_inline_permutation(&initial), reference); + } + } + + struct 
IdentityPermutation; + + impl InlineOpTrait for IdentityPermutation { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x05; + const FUNCT7: u32 = 0x05; + const NAME: &'static str = "IDENTITY_TEST_INLINE"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let vr: [VirtualRegisterGuard; 8] = + array::from_fn(|_| asm.allocator.allocate_for_inline()); + let mut asm = asm; + for (i, reg) in vr.iter().enumerate() { + asm.emit_ld::(**reg, operands.rs1, (i * 8) as i64); + } + for (i, reg) in vr.iter().enumerate() { + asm.emit_s::(operands.rs1, **reg, (i * 8) as i64); + } + drop(vr); + asm.finalize_inline() + } + } + + struct AddRcOnlyTest; + + impl InlineOpTrait for AddRcOnlyTest { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x06; + const FUNCT7: u32 = 0x06; + const NAME: &'static str = "ADD_RC_ONLY_TEST_INLINE"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let mut builder = + crate::sequence_builder::Poseidon2GoldilocksSequenceBuilder::new_for_test( + asm, operands, + ); + builder.test_load_p_and_state_and_add_rc_full(0); + builder.test_store_and_finalize() + } + } + + struct MdsOnlyTest; + + impl InlineOpTrait for MdsOnlyTest { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x07; + const FUNCT7: u32 = 0x09; + const NAME: &'static str = "MDS_ONLY_TEST"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let mut builder = + crate::sequence_builder::Poseidon2GoldilocksSequenceBuilder::new_for_test( + asm, operands, + ); + builder.test_load_p_state_mds_only(); + builder.test_store_and_finalize() + } + } + + struct IntDiffOnlyTest; + + impl InlineOpTrait for IntDiffOnlyTest { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x07; + const FUNCT7: u32 = 0x0A; + const NAME: &'static str = "INT_DIFF_ONLY_TEST"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let mut builder = + 
crate::sequence_builder::Poseidon2GoldilocksSequenceBuilder::new_for_test( + asm, operands, + ); + builder.test_load_p_state_intdiff_only(); + builder.test_store_and_finalize() + } + } + + struct SboxOnlyTest; + + impl InlineOpTrait for SboxOnlyTest { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x07; + const FUNCT7: u32 = 0x0C; + const NAME: &'static str = "SBOX_ONLY_TEST"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let mut builder = + crate::sequence_builder::Poseidon2GoldilocksSequenceBuilder::new_for_test( + asm, operands, + ); + builder.test_load_p_state_sbox_only(); + builder.test_store_and_finalize() + } + } + + struct MulPairsTest; + + impl InlineOpTrait for MulPairsTest { + const OPCODE: u32 = crate::INLINE_OPCODE; + const FUNCT3: u32 = 0x07; + const FUNCT7: u32 = 0x12; + const NAME: &'static str = "MUL_PAIRS_TEST"; + + fn build_sequence(asm: InstrAssembler, operands: FormatInline) -> Vec { + let mut builder = + crate::sequence_builder::Poseidon2GoldilocksSequenceBuilder::new_for_test( + asm, operands, + ); + builder.test_load_p_state_mul_pairs(); + builder.test_store_and_finalize() + } + } + + jolt_inlines_sdk::register_inlines! 
{ + trace_file: "poseidon2_test_inlines_trace.joltinline", + extension: jolt_inlines_sdk::host::InlineExtension::Poseidon2Goldilocks, + ops: [IdentityPermutation, AddRcOnlyTest, MdsOnlyTest, IntDiffOnlyTest, SboxOnlyTest, MulPairsTest], + } + + fn run_inline_with_state(funct3: u32, funct7: u32, initial: &[u64; 8]) -> [u64; 8] { + let mut harness = create_harness(64); + harness.setup_registers(); + harness.load_input64(&POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8); + harness.load_state64(initial); + let instr = + InlineTestHarness::create_default_instruction(crate::INLINE_OPCODE, funct3, funct7); + harness.execute_inline(instr); + let v = harness.read_output64(8); + let mut out = [0u64; 8]; + out.copy_from_slice(&v); + out + } + + #[test] + fn inline_mul_mod_stress_vs_u128() { + let mut seed: u64 = 0xBADC0FFEE0DDF00D; + let mut next = || { + seed = seed + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + seed + }; + for _ in 0..100 { + let mut state = [0u64; 16]; + for i in 0..8 { + state[i] = next() % P; + state[i + 8] = next() % P; + } + let layout = InlineMemoryLayout::single_input(8, 128); + let mut harness = InlineTestHarness::new(layout); + harness.setup_registers(); + harness.load_input64(&[0u64]); + harness.load_state64(&state); + let instr = InlineTestHarness::create_default_instruction( + crate::INLINE_OPCODE, + MulPairsTest::FUNCT3, + MulPairsTest::FUNCT7, + ); + harness.execute_inline(instr); + let result = harness.read_output64(8); + for i in 0..8 { + assert_eq!(result[i], u128_mul_mod(state[i], state[i + 8])); + } + } + } + + #[test] + fn add_rc_only_inline_matches_reference() { + let result = + run_inline_with_state(AddRcOnlyTest::FUNCT3, AddRcOnlyTest::FUNCT7, &[0u64; 8]); + let mut expected = [0u64; 8]; + expected.copy_from_slice(&POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[..8]); + assert_eq!(result, expected); + } + + #[test] + fn mds_only_matches_reference() { + let initial = [1u64, 2, 3, 4, 5, 6, 7, 8]; + let got = 
run_inline_with_state(MdsOnlyTest::FUNCT3, MdsOnlyTest::FUNCT7, &initial); + let mut expected = initial; + crate::exec::external_mds(&mut expected); + assert_eq!(got, expected); + } + + #[test] + fn int_diff_only_matches_reference() { + let initial = [11u64, 22, 33, 44, 55, 66, 77, 88]; + let got = run_inline_with_state(IntDiffOnlyTest::FUNCT3, IntDiffOnlyTest::FUNCT7, &initial); + let mut expected = initial; + crate::exec::internal_diffusion(&mut expected); + assert_eq!(got, expected); + } + + #[test] + fn sbox_only_matches_reference() { + let initial: [u64; 8] = POSEIDON2_ROUND_CONSTANTS_GOLDILOCKS_8[..8] + .try_into() + .unwrap(); + let got = run_inline_with_state(SboxOnlyTest::FUNCT3, SboxOnlyTest::FUNCT7, &initial); + let mut expected = [0u64; 8]; + for i in 0..8 { + expected[i] = crate::exec::sbox(initial[i]); + } + assert_eq!(got, expected); + } + + #[test] + fn identity_inline_preserves_state() { + let layout = InlineMemoryLayout::single_input(8, 64); + let mut harness = InlineTestHarness::new(layout); + harness.setup_registers(); + harness.load_input64(&[0u64]); + let initial: [u64; 8] = [11, 22, 33, 44, 55, 66, 77, 88]; + harness.load_state64(&initial); + let instr = InlineTestHarness::create_default_instruction( + crate::INLINE_OPCODE, + IdentityPermutation::FUNCT3, + IdentityPermutation::FUNCT7, + ); + harness.execute_inline(instr); + assert_eq!(harness.read_output64(8), initial); + } +} diff --git a/specs/1570-poseidon2-goldilocks-inline.md b/specs/1570-poseidon2-goldilocks-inline.md new file mode 100644 index 0000000000..cde060bf02 --- /dev/null +++ b/specs/1570-poseidon2-goldilocks-inline.md @@ -0,0 +1,125 @@ +# Spec: Poseidon2-Goldilocks Inline + +| Field | Value | +|-------------|--------------------------------| +| Author(s) | @jay-clarke | +| Created | 2026-05-24 | +| Status | proposed | +| PR | #1570 | + +## Summary + +Add a Jolt inline for the canonical 8-wide Poseidon2 permutation over the Goldilocks field. 
Poseidon2-Goldilocks is a common ZK-native permutation used for proof-friendly commitments and Merkle-tree style constructions. Executing it as ordinary guest Rust expands into many traced RISC-V instructions; an inline lets Jolt recognize this specific operation and replace it with a deterministic, tested virtual-instruction expansion. + +## Intent + +### Goal + +Provide a `jolt-inlines-poseidon2-goldilocks` crate that exposes a guest-callable `poseidon2_permute(&mut [u64; 8])`, registers a `Poseidon2Goldilocks` inline extension with the Jolt tracer, and expands the custom instruction into a virtual-instruction sequence whose output is bit-for-bit identical to that of Plonky3's canonical `Poseidon2Goldilocks<8>` permutation. + +### Invariants + +1. The host reference implementation produces the same output as Plonky3's canonical `Poseidon2Goldilocks<8>` for every tested state. +2. The sequence-builder output, when executed through Jolt's inline emulator harness, produces the same output as the host reference implementation. +3. Goldilocks arithmetic stays in the field `p = 2^64 - 2^32 + 1`; multiplication reduction must match `u128` modular arithmetic for edge cases and random stress inputs. +4. Round constants are loaded in the same order as the permutation executes them: external initial constants, internal constants, then external final constants. +5. The internal diagonal matches Plonky3's `MATRIX_DIAG_8_GOLDILOCKS`. +6. The inline mutates only the 8-limb state buffer supplied by `rs1` and reads round constants from the table supplied by `rs2`. +7. The inline is gated behind a distinct `InlineExtension::Poseidon2Goldilocks` entry so profiles can opt into it explicitly. + +No `jolt-eval` invariant is proposed in this initial patch because existing inline crates primarily validate these properties through crate-local unit and emulator tests. 
A follow-up could add a shared inline-permutation equivalence invariant if maintainers want a broader framework-level check. + +### Non-Goals + +1. 
Supporting Poseidon2 widths other than 8. +2. Supporting fields other than Goldilocks. +3. Providing a sponge/hash API beyond the raw 8-limb permutation. +4. Changing existing Poseidon transcript code over BN254. +5. Replacing or modifying any existing hash/curve inline. +6. Claiming a specific performance improvement before benchmark review. + +## Evaluation + +### Acceptance Criteria + +- [ ] `cargo check -p jolt-inlines-poseidon2-goldilocks` passes. +- [ ] `cargo check -p jolt-inlines-poseidon2-goldilocks --features host` passes. +- [ ] `cargo test -p jolt-inlines-poseidon2-goldilocks --features host` passes. +- [ ] `cargo test -p jolt-riscv` passes after adding the new inline extension. +- [ ] Host permutation tests match Plonky3's default `Poseidon2Goldilocks<8>` path. +- [ ] Host permutation tests match an explicitly constructed generic Plonky3 `Poseidon2` path. +- [ ] Inline emulator tests match the host reference for fixed and randomized states. +- [ ] Goldilocks multiplication tests match `u128` modular arithmetic for edge and random stress cases. +- [ ] The new inline is registered through the existing `register_inlines!` mechanism. + +### Testing Strategy + +Existing tests that must continue passing: + +- `cargo test -p jolt-riscv` + +New tests added under `jolt-inlines/poseidon2-goldilocks`: + +- Field multiplication reduction tests against `u128`. +- Plonky3 parity tests for all-zero, known, near-modulus, and randomized states. +- Round-constant layout tests against Plonky3 constants. +- Internal diagonal tests against Plonky3 constants. +- Sequence-builder determinism tests. +- Inline emulator tests for full permutation and isolated sub-operations. + +Feature coverage is `--features host`, matching the existing inline-crate testing pattern. No `zk` feature coverage is required for this crate-local inline expansion patch. 
+ +### Performance + +The expected direction is fewer traced instructions than executing the same Poseidon2 permutation as ordinary guest Rust. This PR does not set a hard speedup target. If maintainers want a merge-blocking benchmark, the natural follow-up is a small benchmark comparing: + +1. plain guest Rust Poseidon2-Goldilocks-8 execution, and +2. the inline opcode expanded through `Poseidon2GoldilocksPermutation::build_sequence`. + +No existing `jolt-eval` objective is modified in this initial patch. + +## Design + +### Architecture + +The change follows the existing `jolt-inlines/*` crate pattern: + +- `jolt-inlines/poseidon2-goldilocks/src/sdk.rs` exposes the guest API. In RISC-V guest builds, it emits a custom inline instruction. In host builds, it calls the host reference implementation. +- `jolt-inlines/poseidon2-goldilocks/src/exec.rs` contains the standalone host reference implementation of the 8-wide Poseidon2-Goldilocks permutation. +- `jolt-inlines/poseidon2-goldilocks/src/sequence_builder.rs` expands the inline instruction into virtual RISC-V instructions. +- `jolt-inlines/poseidon2-goldilocks/src/host.rs` registers the inline using the existing `register_inlines!` macro. +- `crates/jolt-riscv/src/profile.rs` adds `InlineExtension::Poseidon2Goldilocks`. +- The root workspace includes the new crate and adds Plonky3 crates as dev/test dependencies for parity checks. + +The instruction contract is: + +- `rs1` points to 8 writable `u64` limbs representing the state. +- `rs2` points to the static 86-element round-constant table. +- The operation permutes the state in place. + +### Alternatives Considered + +1. **Leave Poseidon2 as ordinary guest Rust.** Rejected because Poseidon2-Goldilocks is a proof-native primitive likely to appear in Jolt guest programs; inline support should reduce trace size for this hot operation. +2. **Expose a hash/sponge API instead of the raw permutation.** Rejected for the initial version. 
The raw permutation is the narrowest reusable primitive and avoids committing to one absorption/domain-separation policy. +3. **Support multiple widths immediately.** Rejected to keep the review surface small. Width 8 is the concrete Plonky3-compatible instance covered by the current implementation and tests. +4. **Use fixed test vectors only.** Considered as an alternative to Plonky3 dev dependencies. The current patch uses Plonky3 directly for stronger parity coverage, but fixed vectors would be a reasonable maintainer preference if dependency surface is a concern. + +## Documentation + +No Jolt book changes are required for the initial patch because this adds an internal inline crate and does not change user-facing Jolt APIs or examples. If maintainers want to advertise the inline, a follow-up can add a short entry to the inline documentation alongside the existing hash and curve inlines. + +## Execution + +The implementation should: + +1. Add the new inline crate under `jolt-inlines/poseidon2-goldilocks`. +2. Add a `Poseidon2Goldilocks` inline extension entry. +3. Register the inline with `register_inlines!`. +4. Implement field addition, multiplication, S-box, external MDS, internal diffusion, and round scheduling for the 8-wide Goldilocks instance. +5. Add parity and emulator tests described above. + +## References + +- [Poseidon2 paper](https://eprint.iacr.org/2023/323) +- [Plonky3 repository](https://github.com/Plonky3/Plonky3) +- Existing Jolt inline crates under `jolt-inlines/`