diff --git a/scripts/verify-baseline-static/src/main.rs b/scripts/verify-baseline-static/src/main.rs index 1963814bcde..7bf19995b69 100644 --- a/scripts/verify-baseline-static/src/main.rs +++ b/scripts/verify-baseline-static/src/main.rs @@ -878,8 +878,10 @@ fn scan_aarch64( // ARM64 padding is NOP (0xD503201F) or zeros — neither matches any of our // classify() patterns — so we don't need to special-case tail slop. - // chunks_exact(4) naturally drops any trailing 1-3 bytes. - for (i, chunk) in bytes.chunks_exact(4).enumerate() { + // as_chunks yields `&[u8; 4]` words (dropping any trailing 1-3 bytes via + // the remainder we ignore), so the byte accesses below are statically in + // bounds. + for (i, chunk) in bytes.as_chunks::<4>().0.iter().enumerate() { let ip = sec_addr + (i as u64) * 4; // Skip literal-pool data. data_ranges is sorted; partition_point finds // the first range whose start is > ip, so the candidate is the one @@ -888,7 +890,7 @@ fn scan_aarch64( if dr > 0 && ip < data_ranges[dr - 1].1 { continue; } - let w = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]); + let w = u32::from_le_bytes(*chunk); total_insns += 1; let Some(feat) = aarch64::classify(w) else { continue; @@ -1179,3 +1181,70 @@ fn main() -> ExitCode { } } } + +#[cfg(test)] +mod tests { + use super::*; + + // Little-endian bytes for an ARM64 instruction word. + fn word(w: u32) -> [u8; 4] { + w.to_le_bytes() + } + + #[test] + fn scan_aarch64_flags_post_baseline_word() { + // 0xc8a07c41 "cas x" -> Feature::Lse (post-baseline -> violation) + // 0x1f420c20 "fmadd" -> None (baseline, counted but skipped) + let mut bytes = Vec::new(); + bytes.extend_from_slice(&word(0xc8a07c41)); + bytes.extend_from_slice(&word(0x1f420c20)); + + let r = scan_aarch64(&bytes, 0x1000, &[], &Allowlist::new(), &[]); + + assert_eq!(r.total_insns, 2); + assert!(r.allowlisted.is_empty()); + assert_eq!(r.violations.len(), 1); + let report = r.violations.values().next().unwrap(); + assert_eq!(report.hits.len(), 1); + assert_eq!(report.hits[0].feature, "LSE"); + assert_eq!(report.hits[0].mnemonic, "cas"); + assert_eq!(report.hits[0].ip, 0x1000); + } + + #[test] + fn scan_aarch64_maps_each_chunk_to_its_address() { + // Three "cas" words; each 4-byte chunk's index must map to sec_addr + i*4. + // A single symbol spans the range so all three hits land in one bucket. + let mut bytes = Vec::new(); + for _ in 0..3 { + bytes.extend_from_slice(&word(0xc8a07c41)); + } + let syms = [Sym { + addr: 0x2000, + end: 0x2000 + bytes.len() as u64, + name: "func".to_string(), + }]; + + let r = scan_aarch64(&bytes, 0x2000, &syms, &Allowlist::new(), &[]); + + assert_eq!(r.total_insns, 3); + assert_eq!(r.violations.len(), 1); + let report = r.violations.values().next().unwrap(); + let ips: Vec = report.hits.iter().map(|h| h.ip).collect(); + assert_eq!(ips, vec![0x2000, 0x2004, 0x2008]); + } + + #[test] + fn scan_aarch64_drops_trailing_partial_word() { + // One full word plus 3 trailing bytes: the remainder is ignored, so + // exactly one instruction is scanned. + let mut bytes = Vec::new(); + bytes.extend_from_slice(&word(0xc8a07c41)); + bytes.extend_from_slice(&[0x00, 0x11, 0x22]); + + let r = scan_aarch64(&bytes, 0x1000, &[], &Allowlist::new(), &[]); + + assert_eq!(r.total_insns, 1); + assert_eq!(r.violations.len(), 1); + } +} diff --git a/src/ast/char_freq.rs b/src/ast/char_freq.rs index dca51831233..e9701546f85 100644 --- a/src/ast/char_freq.rs +++ b/src/ast/char_freq.rs @@ -114,10 +114,12 @@ fn scan_big(out: &mut Buffer, text: &[u8], delta: i32) { debug_assert!(text.len() >= SCAN_BIG_CHUNK_SIZE); - let unrolled = text.len() - (text.len() % SCAN_BIG_CHUNK_SIZE); - let (chunks, remain) = text.split_at(unrolled); + // `as_chunks` yields `&[u8; SCAN_BIG_CHUNK_SIZE]` arrays (plus the tail + // remainder), so the inner `chunk[i]` accesses are statically in bounds and + // the per-element bounds checks `chunks_exact` leaves in are elided. + let (chunks, remain) = text.as_chunks::(); - for chunk in chunks.chunks_exact(SCAN_BIG_CHUNK_SIZE) { + for chunk in chunks { // PERF: candidate for unrolling — profile for i in 0..SCAN_BIG_CHUNK_SIZE { deltas[chunk[i] as usize] += delta; diff --git a/src/bun_core/lib.rs b/src/bun_core/lib.rs index 4434af71e54..bffe20c8533 100644 --- a/src/bun_core/lib.rs +++ b/src/bun_core/lib.rs @@ -1938,18 +1938,26 @@ pub(crate) mod strings_impl { const HIGH_BITS: u64 = 0x8080_8080_8080_8080; let mut copied = 0usize; - for (d, s) in dst.chunks_exact_mut(8).zip(src.chunks_exact(8)) { - let word = u64::from_ne_bytes(s.try_into().expect("infallible: size matches")); + + // `dst` and `src` are the same length (asserted above), so they split + // into the same number of `&[u8; 8]` words and equal-length remainders. + // The array `s` makes the word load a plain `from_ne_bytes(*s)` with no + // fallible `try_into`, and the store a fixed-size array assignment. + let (dst_chunks, dst_remainder) = dst.as_chunks_mut::<8>(); + let (src_chunks, src_remainder) = src.as_chunks::<8>(); + + for (d, s) in dst_chunks.iter_mut().zip(src_chunks.iter()) { + let word = u64::from_ne_bytes(*s); let mask = word & HIGH_BITS; if mask != 0 { let ascii = (mask.trailing_zeros() / 8) as usize; d[..ascii].copy_from_slice(&s[..ascii]); return copied + ascii; } - d.copy_from_slice(&word.to_ne_bytes()); + *d = word.to_ne_bytes(); copied += 8; } - for (d, &s) in dst[copied..].iter_mut().zip(&src[copied..]) { + for (d, &s) in dst_remainder.iter_mut().zip(src_remainder.iter()) { if s >= 0x80 { return copied; } diff --git a/src/http_jsc/websocket_client/WebSocketUpgradeClient.rs b/src/http_jsc/websocket_client/WebSocketUpgradeClient.rs index a3f83798204..c4fcbd89ed8 100644 --- a/src/http_jsc/websocket_client/WebSocketUpgradeClient.rs +++ b/src/http_jsc/websocket_client/WebSocketUpgradeClient.rs @@ -1873,7 +1873,9 @@ impl<'a> Headers8Bit<'a> { fn iter(&self) -> impl Iterator + '_ { self.slices - .chunks_exact(2) + .as_chunks::<2>() + .0 + .iter() .map(|pair| (pair[0].slice(), pair[1].slice())) } diff --git a/src/install/lockfile/Package.rs b/src/install/lockfile/Package.rs index e532f984701..f3e1f1fc611 100644 --- a/src/install/lockfile/Package.rs +++ b/src/install/lockfile/Package.rs @@ -3364,6 +3364,12 @@ pub mod serializer { // raw u8 here. Layout: `ResolutionType` is `#[repr(C)] // { tag: Tag, _padding: [u8; 7], value: ... }`, so the // discriminant is the first byte of each element. + // + // Unlike the `Meta`/`Bin` blocks below, this stays on + // `chunks_exact`: `size_of::>()` + // depends on the generic `SemverIntType`, and a const-generic + // argument that uses a type parameter needs the unstable + // `generic_const_exprs`, which this crate does not enable. let stride = mem::size_of::>(); debug_assert!(stride != 0 && src.len().is_multiple_of(stride)); for raw in src.chunks_exact(stride) { @@ -3380,13 +3386,13 @@ pub mod serializer { // `HasInstallScript` = 0..=2). Copying an out-of-range byte // into either field and reading it back as the enum would // be immediate UB, so check the raw stream bytes first. - let stride = mem::size_of::(); - let origin_at = mem::offset_of!(Meta, origin); - let install_script_at = mem::offset_of!(Meta, has_install_script); - debug_assert!(stride != 0 && src.len().is_multiple_of(stride)); - for raw in src.chunks_exact(stride) { - if !matches!(raw[origin_at], 0..=2) - || !matches!(raw[install_script_at], 0..=2) + const STRIDE: usize = mem::size_of::(); + const ORIGIN_AT: usize = mem::offset_of!(Meta, origin); + const INSTALL_SCRIPT_AT: usize = mem::offset_of!(Meta, has_install_script); + debug_assert!(STRIDE != 0 && src.len().is_multiple_of(STRIDE)); + for raw in src.as_chunks::().0 { + if !matches!(raw[ORIGIN_AT], 0..=2) + || !matches!(raw[INSTALL_SCRIPT_AT], 0..=2) { return Err(bun_core::err!( "Lockfile validation failed: invalid package meta" @@ -3397,11 +3403,11 @@ pub mod serializer { if matches!(field, PackageField::Bin) { // `Bin.tag` is a `#[repr(u8)]` enum with discriminants // 0..=4; validate it the same way before the copy. - let stride = mem::size_of::(); - let tag_at = mem::offset_of!(Bin, tag); - debug_assert!(stride != 0 && src.len().is_multiple_of(stride)); - for raw in src.chunks_exact(stride) { - if !matches!(raw[tag_at], 0..=4) { + const STRIDE: usize = mem::size_of::(); + const TAG_AT: usize = mem::offset_of!(Bin, tag); + debug_assert!(STRIDE != 0 && src.len().is_multiple_of(STRIDE)); + for raw in src.as_chunks::().0 { + if !matches!(raw[TAG_AT], 0..=4) { return Err(bun_core::err!( "Lockfile validation failed: invalid bin tag" )); diff --git a/src/runtime/image/codecs.rs b/src/runtime/image/codecs.rs index 279c3adfeb2..a5dbb06f3f1 100644 --- a/src/runtime/image/codecs.rs +++ b/src/runtime/image/codecs.rs @@ -299,7 +299,7 @@ pub fn decode(bytes: &[u8], max_pixels: u64, hint: DecodeHint) -> Result().0 { if px[3] == 0 { px[0] = 0; px[1] = 0; diff --git a/src/runtime/webcore/encoding.rs b/src/runtime/webcore/encoding.rs index 2bc37bf9489..4fa5e9a94ca 100644 --- a/src/runtime/webcore/encoding.rs +++ b/src/runtime/webcore/encoding.rs @@ -781,8 +781,8 @@ pub(crate) unsafe fn construct_from_u8( // directly into a `Vec` so we never depend on an allocator- // layout-dependent `Vec → Vec` header reinterpret. let mut to = vec![0u8; len * 2]; - for (out, &b) in to.chunks_exact_mut(2).zip(input_slice) { - out.copy_from_slice(&u16::from(b).to_ne_bytes()); + for (out, &b) in to.as_chunks_mut::<2>().0.iter_mut().zip(input_slice) { + *out = u16::from(b).to_ne_bytes(); } to } diff --git a/test/cli/install/bun-lockb-field-validation.test.ts b/test/cli/install/bun-lockb-field-validation.test.ts new file mode 100644 index 00000000000..5b0b94a5f25 --- /dev/null +++ b/test/cli/install/bun-lockb-field-validation.test.ts @@ -0,0 +1,131 @@ +import { install_test_helpers } from "bun:internal-for-testing"; +import { expect, test } from "bun:test"; +import { bunEnv, bunExe, tempDir } from "harness"; +import { copyFileSync, readFileSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; + +const { parseLockfile } = install_test_helpers; + +// These tests exercise the raw-byte validation loops in the binary-lockfile +// loader (`Package::load_fields`), which iterate the `meta`/`bin` columns and +// reject out-of-range enum discriminants before the bytes are reinterpreted as +// `#[repr(u8)]` enums. A `file:` tarball dependency is used so the lockfile is +// produced and re-parsed entirely offline — no registry needed. `parseLockfile` +// drives `Lockfile::load_from_dir`, which runs `load_fields`. + +const tarball = join(import.meta.dir, "bar-0.0.2.tgz"); + +async function installFileDep(dir: string) { + copyFileSync(tarball, join(dir, "bar-0.0.2.tgz")); + await using proc = Bun.spawn({ + cmd: [bunExe(), "install", "--no-progress"], + cwd: dir, + env: bunEnv, + stdout: "ignore", + stderr: "pipe", + }); + const [stderr, exitCode] = await Promise.all([proc.stderr.text(), proc.exited]); + expect(stderr).not.toContain("error:"); + expect(exitCode).toBe(0); +} + +// Locate the `meta` and `bin` columns in a binary lockfile. Packages are stored +// SoA and the columns are written back-to-back in declaration order: name (8), +// name_hash (8), resolution (72 in format v3, 64 in v2), dependencies (8), +// resolutions (8), then meta (88 bytes/record) and bin (20 bytes/record). +const META_SIZE = 88; +const BIN_SIZE = 20; + +function packageColumns(lockb: Buffer) { + const fmt = lockb.readUInt32LE(42); + const n = Number(lockb.readBigUInt64LE(86)); + const begin = Number(lockb.readBigUInt64LE(110)); + let resolutionSize: number; + switch (fmt) { + case 2: + resolutionSize = 64; + break; + case 3: + resolutionSize = 72; + break; + default: + // If the binary format changes again, fail loudly rather than silently + // corrupting the wrong byte and masking it as a field-validation test. + throw new Error(`unexpected bun.lockb format version ${fmt}`); + } + const metaStart = begin + n * (8 + 8 + resolutionSize + 8 + 8); + const binStart = metaStart + n * META_SIZE; + return { n, metaStart, binStart }; +} + +test("valid binary lockfile round-trips through the field loader", async () => { + using dir = tempDir("lockb-field-valid", { + "package.json": JSON.stringify({ + name: "lockb-field-valid", + version: "1.0.0", + dependencies: { "dummy-package": "file:./bar-0.0.2.tgz" }, + }), + "bunfig.toml": "[install]\nsaveTextLockfile = false\n", + }); + await installFileDep(String(dir)); + + const parsed = parseLockfile(String(dir)) as { packages?: Record }; + // Loading succeeds, which means `load_fields` ran its meta/bin validation + // loops over the real column bytes without rejecting them. + expect(parsed.packages).toBeDefined(); + expect(Object.keys(parsed.packages!).length).toBe(2); +}); + +test("rejects a binary lockfile whose meta.origin byte is out of range", async () => { + using dir = tempDir("lockb-field-origin", { + "package.json": JSON.stringify({ + name: "lockb-field-origin", + version: "1.0.0", + dependencies: { "dummy-package": "file:./bar-0.0.2.tgz" }, + }), + "bunfig.toml": "[install]\nsaveTextLockfile = false\n", + }); + await installFileDep(String(dir)); + + const lockbPath = join(String(dir), "bun.lockb"); + const lockb = readFileSync(lockbPath); + const { n, metaStart } = packageColumns(lockb); + + // `Meta.origin` is the first byte of each 88-byte record; the `Origin` enum + // is `#[repr(u8)]` with discriminants 0..=2, so 0x42 is out of range and the + // per-element check in the `meta` validation loop must reject it. + expect(n).toBeGreaterThan(0); + const originOffset = metaStart + (n - 1) * META_SIZE + 0; + expect(lockb[originOffset]).toBeLessThanOrEqual(2); // sanity: valid before + lockb[originOffset] = 0x42; + writeFileSync(lockbPath, lockb); + + expect(() => parseLockfile(String(dir))).toThrow("Lockfile validation failed: invalid package meta"); +}); + +test("rejects a binary lockfile whose bin.tag byte is out of range", async () => { + using dir = tempDir("lockb-field-bin", { + "package.json": JSON.stringify({ + name: "lockb-field-bin", + version: "1.0.0", + dependencies: { "dummy-package": "file:./bar-0.0.2.tgz" }, + }), + "bunfig.toml": "[install]\nsaveTextLockfile = false\n", + }); + await installFileDep(String(dir)); + + const lockbPath = join(String(dir), "bun.lockb"); + const lockb = readFileSync(lockbPath); + const { n, binStart } = packageColumns(lockb); + + // `Bin.tag` is the first byte of each 20-byte record; the `Tag` enum is + // `#[repr(u8)]` with discriminants 0..=4, so 0x42 is out of range and the + // per-element check in the `bin` validation loop must reject it. + expect(n).toBeGreaterThan(0); + const tagOffset = binStart + (n - 1) * BIN_SIZE + 0; + expect(lockb[tagOffset]).toBeLessThanOrEqual(4); // sanity: valid before + lockb[tagOffset] = 0x42; + writeFileSync(lockbPath, lockb); + + expect(() => parseLockfile(String(dir))).toThrow("Lockfile validation failed: invalid bin tag"); +}); diff --git a/test/js/node/buffer-utf16.test.ts b/test/js/node/buffer-utf16.test.ts index 2504d052e4c..4434d975b2d 100644 --- a/test/js/node/buffer-utf16.test.ts +++ b/test/js/node/buffer-utf16.test.ts @@ -1,4 +1,4 @@ -import { expect, test } from "bun:test"; +import { describe, expect, test } from "bun:test"; test("utf16-le buffer", () => { const twoByteString = new Array(16) @@ -32,3 +32,44 @@ e000e100e200e300e400e500e600e700e800e900ea00eb00ec00ed00ee00ef00 f000f100f200f300f400f500f600f700f800f900fa00fb00fc00fd00fe00ff00`, ); }); + +// Buffer.from(latin1String, "utf16le" | "ucs2") widens each Latin-1 byte to +// one little-endian UTF-16 code unit. A JS string stays Latin-1-backed (8-bit) +// when every code point is <= U+00FF, so these inputs exercise the byte-pair +// widening loop in Bun__encoding__constructFromLatin1. +describe("latin1 -> UTF-16 widening (Buffer.from)", () => { + describe.each(["utf16le", "utf-16le", "ucs2", "ucs-2"] as const)("%s", encoding => { + test("empty string produces an empty buffer", () => { + expect(Buffer.from("", encoding)).toEqual(Buffer.alloc(0)); + }); + + test("single byte widens to one little-endian code unit", () => { + // 'A' (0x41) -> 0x41 0x00 + expect([...Buffer.from("A", encoding)]).toEqual([0x41, 0x00]); + }); + + test("high bytes (0x80-0xFF) zero-extend, not sign-extend", () => { + // \u00ff stays 8-bit; must become ff 00, not ff ff. + expect([...Buffer.from("\x80\xff", encoding)]).toEqual([0x80, 0x00, 0xff, 0x00]); + }); + + test("every Latin-1 byte 0x00-0xFF widens correctly", () => { + const all = Array.from({ length: 256 }, (_, i) => String.fromCharCode(i)).join(""); + const buf = Buffer.from(all, encoding); + expect(buf.length).toBe(512); + const expected = Buffer.alloc(512); + for (let i = 0; i < 256; i++) expected[i * 2] = i; // low byte = i, high byte = 0 + expect(buf).toEqual(expected); + }); + + test("long input widens every chunk (round-trips back to the source)", () => { + // Long enough to span many byte-pairs; all code points <= 0xFF so the + // string stays 8-bit and takes the widening path. + const latin1 = Buffer.alloc(1000, 0xe9).toString("latin1"); // "é" * 1000 + const buf = Buffer.from(latin1, encoding); + // Every byte-pair widens 0xe9 -> [0xe9, 0x00]. + expect(buf).toEqual(Buffer.from(Array.from({ length: 1000 }, () => [0xe9, 0x00]).flat())); + expect(buf.toString("utf16le")).toBe(latin1); + }); + }); +});