Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
239 changes: 168 additions & 71 deletions src/bun_core/string/StringJoiner.rs
Original file line number Diff line number Diff line change
@@ -1,95 +1,61 @@
//! Rope-like data structure for joining many small strings into one big string.
//! Implemented as a flat `Vec` of potentially-owned slices plus a running
//! Implemented as a flat `Vec` of borrowed-or-owned slices plus a running
//! length, so the join-time output buffer can be sized exactly once.

use crate::RawSlice;
use crate::string::strings;
use bun_alloc::AllocError;

// PORT NOTE: Zig's `std.mem.Allocator` param field dropped — global mimalloc is used for
// node and duplicated-string allocations.
#[derive(Default)]
pub struct StringJoiner {
pub struct StringJoiner<'a> {
/// Total length of all nodes
pub len: usize,

/// Slices in insertion order. Stored flat instead of as a singly-linked
/// list so a join with N pieces does ~log₂N Vec reallocs instead of N
/// `Box<Node>` allocations and N pointer-chasing dereferences on drain.
nodes: Vec<Node>,
nodes: Vec<Node<'a>>,

/// Avoid an extra pass over the list when joining
pub watcher: Watcher,
pub watcher: Watcher<'a>,
}

// SAFETY: `nodes` holds `RawSlice<u8>` raw fat pointers which alias
// caller-owned (`owns_slice = false`) or joiner-owned (`owns_slice = true`)
// storage; no aliasing escapes `&mut self` methods. The Zig original is
// passed across bundler worker threads (see Chunk.IntermediateOutput).
unsafe impl Send for StringJoiner {}
// SAFETY: `&StringJoiner` only exposes read-only views (`last_byte`,
// `node_slices`, `contains`) over `RawSlice<u8>` storage with no interior
// mutability; concurrent shared reads of the owned/borrowed-until-`done()`
// byte buffers are data-race-free.
unsafe impl Sync for StringJoiner {}

struct Node {
/// Replaces Zig's `NullableAllocator`: when `true`, `slice` was heap-allocated by
/// this joiner (via `push_owned`/`push_cloned`) and is freed on node drop;
/// when `false`, `slice` is borrowed and the caller guarantees it outlives `done()`.
owns_slice: bool,
// TODO(port): lifetime — borrowed slices must outlive `done()`; the port avoids
// struct lifetime params, so this is stored as a typed raw fat pointer.
// `RawSlice` (one encapsulated unsafe in `.slice()`) replaces the open-coded
// raw deref at every read site; the backing storage outlives the node by
// either ownership (`owns_slice`) or caller contract.
slice: RawSlice<u8>,
enum Node<'a> {
/// Borrowed for `'a`; the caller's data must stay valid until the joiner's
/// last read (`done`/`done_with_end`/`node_slices`/`contains`/`last_byte`).
Borrowed(&'a [u8]),
/// Heap-allocated by this joiner (via `push_owned`/`push_cloned`); freed
/// when the node drops.
Owned(Box<[u8]>),
}

impl Node {
impl Node<'_> {
#[inline]
fn slice(&self) -> &[u8] {
self.slice.slice()
}
}

// SAFETY: `Node` is a plain (slice, ownership-bit) record; the `RawSlice` raw
// pointer is uniquely owned (or borrowed under caller contract) through the
// `Vec` rooted at `StringJoiner.nodes` and never shared aliased across threads
// concurrently. The Zig original moves these between bundler worker threads.
unsafe impl Send for Node {}
// SAFETY: `&Node` only reads the immutable `RawSlice<u8>` backing bytes via
// `slice()`; there is no interior mutability, so concurrent shared access from
// multiple threads cannot race.
unsafe impl Sync for Node {}

impl Drop for Node {
fn drop(&mut self) {
if self.owns_slice {
// SAFETY: when owns_slice is true, slice was produced by Box::<[u8]>::into_raw
// in `push_cloned`/`push_owned` and has not been freed.
drop(unsafe { crate::heap::take(self.slice.as_ptr().cast_mut()) });
match self {
Node::Borrowed(slice) => slice,
Node::Owned(boxed) => boxed,
}
}
}

#[derive(Default)]
pub struct Watcher {
// TODO(port): lifetime — callers may assign non-'static data; never freed in Zig.
pub input: &'static [u8],
pub struct Watcher<'a> {
pub input: &'a [u8],
pub estimated_count: u32,
pub needs_newline: bool,
}

impl StringJoiner {
impl<'a> StringJoiner<'a> {
/// Pre-allocate room for `additional` more pushed slices, so a join with a
/// known piece count does a single nodes allocation instead of log₂N grows.
///
/// This only forwards to `Vec::reserve` on the node list; `len` and the
/// watcher state are left untouched, and no slice data is allocated here.
pub fn reserve(&mut self, additional: usize) {
self.nodes.reserve(additional);
}

/// `data` is expected to live until `.done` is called
pub fn push_static(&mut self, data: &[u8]) {
pub fn push_static(&mut self, data: &'a [u8]) {
self.push(data);
}

Expand All @@ -98,10 +64,7 @@ impl StringJoiner {
if data.is_empty() {
return;
}
let raw: *const [u8] = crate::heap::into_raw(data);
// SAFETY: `raw` is a fresh `Box::into_raw` allocation owned by the node
// until `Node::drop` reclaims it (`owns_slice = true`).
self.push_raw(unsafe { RawSlice::from_raw(raw) }, true);
self.push_node(Node::Owned(data));
}

/// `data` is cloned
Expand All @@ -117,29 +80,58 @@ impl StringJoiner {
// The optional allocator only encoded ownership of `data`, which has no Rust
// analogue for a borrowed `&[u8]`; callers wanting owned semantics use
// `push_owned`/`push_cloned` instead.
pub fn push(&mut self, data: &[u8]) {
pub fn push(&mut self, data: &'a [u8]) {
if data.is_empty() {
return;
}
self.push_raw(RawSlice::new(data), false);
self.push_node(Node::Borrowed(data));
}

fn push_raw(&mut self, data: RawSlice<u8>, owned: bool) {
let data_slice = data.slice();
if data_slice.is_empty() {
return;
}
fn push_node(&mut self, node: Node<'a>) {
let data_slice = node.slice();
debug_assert!(!data_slice.is_empty());
self.len += data_slice.len();

self.watcher.estimated_count += (self.watcher.input.len() > 0
&& strings::index_of(data_slice, self.watcher.input).is_some())
as u32;
self.watcher.needs_newline = data_slice[data_slice.len() - 1] != b'\n';

self.nodes.push(Node {
owns_slice: owned,
slice: data,
});
self.nodes.push(node);
}

/// Re-tag every borrowed segment (and `watcher.input`) as `'static` so the
/// joiner can be stored in lifetime-free storage and read later (e.g. the
/// bundler's deferred `Chunk.intermediate_output`).
///
/// # Safety
/// Every borrowed segment previously pushed (`push`/`push_static`) and
/// `watcher.input` must remain valid — not freed, moved, or reallocated —
/// for as long as the returned joiner (or anything it is moved into) is
/// alive.
pub unsafe fn detach_lifetime(self) -> StringJoiner<'static> {
StringJoiner {
len: self.len,
nodes: self
.nodes
.into_iter()
.map(|node| match node {
Node::Borrowed(slice) => {
// SAFETY: caller contract — the backing storage outlives
// the returned joiner.
Node::Borrowed(unsafe { &*core::ptr::from_ref::<[u8]>(slice) })
}
Node::Owned(boxed) => Node::Owned(boxed),
})
.collect(),
watcher: Watcher {
// SAFETY: caller contract — `watcher.input` outlives the
// returned joiner.
input: unsafe { &*core::ptr::from_ref::<[u8]>(self.watcher.input) },
estimated_count: self.watcher.estimated_count,
needs_newline: self.watcher.needs_newline,
},
}
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

/// This deinits the string joiner on success, the new string is owned by the caller.
Expand All @@ -158,7 +150,7 @@ impl StringJoiner {
let mut out = Vec::<u8>::with_capacity(len);
for node in self.nodes.drain(..) {
out.extend_from_slice(node.slice());
// `drop(node)` runs `Node::drop`, freeing `slice` when owned.
// `drop(node)` frees the buffer when owned.
}
debug_assert_eq!(out.len(), len);
Ok(out.into_boxed_slice())
Expand Down Expand Up @@ -212,7 +204,112 @@ impl StringJoiner {
}
}

// `Drop` for `StringJoiner` is implicit: `Vec<Node>::drop` runs `Node::drop`
// for each element, which frees joiner-owned slices.
// `Drop` for `StringJoiner` is implicit: `Vec<Node>::drop` frees joiner-owned
// slices (`Node::Owned`); borrowed nodes are not freed.

// Unit tests for `StringJoiner`: the four push flavours, the join/drain
// entry points, the read-only accessors, the watcher bookkeeping, and
// `detach_lifetime`.
#[cfg(test)]
mod tests {
use super::*;

// Mixes all four push flavours and checks that the joined output keeps
// insertion order, that `len` equals the total byte count before the join,
// and that `len` reads 0 once `done()` has run.
#[test]
fn push_kinds_concatenate_in_insertion_order() {
let owned: Box<[u8]> = Box::from(b"owned".as_slice());
let cloned_src = b"cloned".to_vec();
let mut j = StringJoiner::default();
j.push(b"borrowed ");
j.push_static(b"static ");
j.push_owned(owned);
j.push_cloned(&cloned_src);
// Dropping the source before the join exercises the expectation that
// `push_cloned` keeps its own copy rather than borrowing `cloned_src`.
drop(cloned_src);
assert_eq!(j.len, "borrowed static ownedcloned".len());
assert_eq!(&*j.done().unwrap(), b"borrowed static ownedcloned");
assert_eq!(j.len, 0);
}

// Empty input through every push flavour must add no node and no length.
#[test]
fn empty_pushes_are_skipped() {
let mut j = StringJoiner::default();
j.push(b"");
j.push_static(b"");
j.push_owned(Box::default());
j.push_cloned(b"");
assert_eq!(j.len, 0);
assert_eq!(j.node_slices().count(), 0);
assert_eq!(&*j.done().unwrap(), b"");
}

// `done_with_end` on an empty joiner yields just the suffix (possibly
// empty); with content it yields the content followed by the suffix.
#[test]
fn done_with_end_appends_suffix() {
let mut j = StringJoiner::default();
assert_eq!(&*j.done_with_end(b"").unwrap(), b"");
assert_eq!(&*j.done_with_end(b"suffix").unwrap(), b"suffix");
j.push(b"body");
assert_eq!(&*j.done_with_end(b"!\n").unwrap(), b"body!\n");
}

// Covers the read-only accessors on a joiner holding one borrowed node
// ("abc") and one cloned node ("def").
#[test]
fn last_byte_contains_and_node_slices() {
let mut j = StringJoiner::default();
// An empty joiner is expected to report 0 as its last byte.
assert_eq!(j.last_byte(), 0);
j.push(b"abc");
j.push_cloned(b"def");
assert_eq!(j.last_byte(), b'f');
// "cd" straddles the boundary between the two nodes, so it is expected
// not to match; "de" lies wholly inside the second node.
assert!(!j.contains(b"cd"));
assert!(j.contains(b"de"));
let slices: Vec<&[u8]> = j.node_slices().collect();
assert_eq!(slices, vec![b"abc".as_slice(), b"def".as_slice()]);
}

// `ensure_newline_at_end` must append exactly one `\n` when the content
// does not already end in one (calling it twice adds only one), and must
// append nothing when it does.
#[test]
fn ensure_newline_at_end_tracks_watcher() {
let mut j = StringJoiner::default();
j.push(b"no newline");
j.ensure_newline_at_end();
j.ensure_newline_at_end();
assert_eq!(&*j.done().unwrap(), b"no newline\n");

let mut j = StringJoiner::default();
j.push(b"has newline\n");
j.ensure_newline_at_end();
assert_eq!(&*j.done().unwrap(), b"has newline\n");
}

// With `watcher.input` set to "KEY", two of the three pushed slices contain
// it, so `estimated_count` should end at 2 for borrowed and cloned pushes
// alike.
#[test]
fn watcher_estimates_unique_key_occurrences() {
let mut j = StringJoiner {
watcher: Watcher {
input: b"KEY",
..Default::default()
},
..Default::default()
};
j.push(b"prefix KEY suffix");
j.push(b"no match");
j.push_cloned(b"another KEY");
assert_eq!(j.watcher.estimated_count, 2);
}

// Builds a joiner over non-`'static` locals, detaches its lifetime, and
// checks that length, watcher state, and joined output all survive.
#[test]
fn detach_lifetime_round_trips_borrowed_data() {
let borrowed = b"KEY borrowed ".to_vec();
let input = b"KEY".to_vec();
let mut j = StringJoiner {
watcher: Watcher {
input: &input,
..Default::default()
},
..Default::default()
};
j.push(&borrowed);
j.push_cloned(b"cloned KEY");
// SAFETY: `borrowed` and `input` are declared before `detached`, so they outlive it.
let mut detached = unsafe { j.detach_lifetime() };
assert_eq!(detached.len, "KEY borrowed cloned KEY".len());
assert_eq!(detached.watcher.input, b"KEY");
// Both pushed slices contain "KEY"; the last pushed byte is 'Y', not '\n'.
assert_eq!(detached.watcher.estimated_count, 2);
assert!(detached.watcher.needs_newline);
assert_eq!(&*detached.done().unwrap(), b"KEY borrowed cloned KEY");
}
}

// ported from: src/string/StringJoiner.zig
4 changes: 2 additions & 2 deletions src/bundler/Chunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ impl Default for Content {

// SAFETY: `Chunk` is processed across the bundler thread pool (see
// `computeCrossChunkDependencies`, `generateChunksInParallel`). Raw-pointer
// fields (`Layers::Borrowed`, `StringJoiner` nodes, `ChunkRenamer` arena)
// fields (`Layers::Borrowed`, `ChunkRenamer` arena)
// point into bundler-arena storage that outlives the
// pool join and is only mutated by the owning task. Zig has no Send/Sync
// distinction; mirror `InputFile`'s blanket impls (bundle_v2.rs).
Expand Down Expand Up @@ -378,7 +378,7 @@ pub enum IntermediateOutput {
/// If the chunk doesn't have any references to other chunks, then
/// `joiner` contains the contents of the chunk. This is more efficient
/// because it avoids doing a join operation twice.
Joiner(StringJoiner),
Joiner(StringJoiner<'static>),
Comment thread
Jarred-Sumner marked this conversation as resolved.

#[default]
Empty,
Expand Down
2 changes: 1 addition & 1 deletion src/bundler/LinkerContext.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4149,7 +4149,7 @@ impl<'a> LinkerContext<'a> {
pub fn break_output_into_pieces(
&self,
_alloc: *const Bump,
j: &mut StringJoiner,
j: &mut StringJoiner<'static>,
count: u32,
) -> Result<crate::chunk::IntermediateOutput, BunError> {
let _trace = bun::perf::trace("Bundler.breakOutputIntoPieces");
Expand Down
5 changes: 5 additions & 0 deletions src/bundler/linker_context/MetafileBuilder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,11 @@ pub fn generate(c: &mut LinkerContext, chunks: &mut [Chunk]) -> Result<Box<[u8]>

// Break output into pieces and resolve chunk references to final paths
let alloc = c.arena();
// SAFETY: every borrowed node in `j` points into `chunk.metafile_chunk_json`,
// parse-graph data (import-record kind labels), or `'static` literals, all of
// which outlive `intermediate` — it is consumed by `code()` below while `chunks`
// and `c` are still alive.
let mut j = unsafe { j.detach_lifetime() };
let mut intermediate = c.break_output_into_pieces(
alloc,
&mut j,
Expand Down
7 changes: 7 additions & 0 deletions src/bundler/linker_context/postProcessCSSChunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,13 @@ pub fn post_process_css_chunk(

// SAFETY: `worker.arena` set by `Worker::create`, outlives the worker step.
let alloc = worker.arena();
// SAFETY: every borrowed node in `j` points into `chunk.compile_results_for_chunk`
// (filled in place before post-processing, never reassigned afterwards), graph
// source paths (`Path<'static>`), or `'static` literals; `watcher.input` is
// `chunk.unique_key` (`&'static`). All of these outlive the joiner stored in
// `chunk.intermediate_output`, which is only read while the chunk and the linker
// graph are alive.
let mut j = unsafe { j.detach_lifetime() };
chunk.intermediate_output =
bun_core::handle_oom(c.break_output_into_pieces(alloc, &mut j, ctx.chunks.len() as u32));
// TODO: meta contents
Expand Down
6 changes: 6 additions & 0 deletions src/bundler/linker_context/postProcessHTMLChunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ pub fn post_process_html_chunk(

// SAFETY: `worker.arena` is set by `Worker::create` and outlives the worker step.
let alloc = worker.arena();
// SAFETY: every borrowed node in `j` points into `chunk.compile_results_for_chunk`
// (filled in place before post-processing, never reassigned afterwards);
// `watcher.input` is `chunk.unique_key` (`&'static`). Both outlive the joiner
// stored in `chunk.intermediate_output`, which is only read while the chunk and
// the linker graph are alive.
let mut j = unsafe { j.detach_lifetime() };
chunk.intermediate_output = bun_core::handle_oom(c.break_output_into_pieces(
alloc,
&mut j,
Expand Down
Loading
Loading