From 4b72a0a76c3cb1136f31bc125233a01edda569c4 Mon Sep 17 00:00:00 2001 From: Alistair Smith Date: Mon, 8 Jun 2026 14:59:09 -0700 Subject: [PATCH 1/3] parsers: consolidate lexer logging and number scanning helpers --- src/ast/lexer_log.rs | 409 +++++++++++++++++++++++++ src/js_parser/lexer.rs | 375 +++-------------------- src/js_parser/parse/parse_entry.rs | 105 ++----- src/parsers/json.rs | 316 ++++--------------- src/parsers/json_lexer.rs | 108 ++----- src/parsers/lib.rs | 4 + src/parsers/number_scan.rs | 128 ++++++++ src/parsers/toml/lexer.rs | 477 +++-------------------------- 8 files changed, 744 insertions(+), 1178 deletions(-) create mode 100644 src/parsers/number_scan.rs diff --git a/src/ast/lexer_log.rs b/src/ast/lexer_log.rs index 7346a5ee2ad..9f25dfe124a 100644 --- a/src/ast/lexer_log.rs +++ b/src/ast/lexer_log.rs @@ -6,6 +6,10 @@ //! `prev_error_loc`, push into `Log`, then record the loc. This trait //! collapses all three. //! +//! It also hosts the shared string escape-sequence decoder +//! ([`decode_escape_sequences`] / [`EscapeLexer`]) that the js/json and toml +//! lexers previously each carried a ~330-line copy of. +//! //! The trait carries a `'s` lifetime so `source()` can hand back the lexer's //! stored `&'s Source` *without* borrowing `self` — that is what lets the //! provided bodies call `self.log_mut()` afterwards without a split-borrow @@ -111,3 +115,408 @@ pub trait LexerLog<'s> { Err(Self::syntax_err()) } } + +/// Surface [`decode_escape_sequences`] needs from a lexer. Monomorphizes per +/// lexer type, so codegen matches the previous per-lexer inline copies. +pub trait EscapeLexer<'s>: LexerLog<'s> { + /// Decoded output sink: UTF-16 code units for the js lexer, WTF-8 bytes + /// for the toml lexer. + type Buf; + + /// JSON mode: reject legacy octal, `\u{...}`, line continuations, and any + /// simple escape outside the JSON set. 
+ const IS_JSON: bool = false; + + /// toml only: keep error spans in their historical shape — the legacy + /// octal `Range` start is text-relative (no `start +`) and the `\u{...}` + /// span start also subtracts the width of `{`. The js lexer computes both + /// absolutely (oven-sh/bun#31134). + const LEGACY_ERROR_SPANS: bool = false; + + fn end_mut(&mut self) -> &mut usize; + fn push_codepoint(buf: &mut Self::Buf, c: u32); +} + +/// Decodes the backslash escape sequences of a string-literal body `text` +/// into `buf`. `start` is the absolute source offset of `text`'s first byte, +/// used to report error locations. +/// +/// `ALLOW_LINE_CONTINUATIONS` permits `\<newline>` line continuations (always true for js; +/// toml multiline basic strings only). `REJECT_HEX_ESCAPE` errors on `\x` +/// (toml multiline basic strings only). +pub fn decode_escape_sequences< + 's, + L: EscapeLexer<'s>, + const ALLOW_LINE_CONTINUATIONS: bool, + const REJECT_HEX_ESCAPE: bool, +>( + lexer: &mut L, + start: usize, + text: &[u8], + buf: &mut L::Buf, +) -> Result<(), L::Err> { + use bun_core::fmt::hex_digit_value_u32; + use bun_core::strings; + use bun_core::strings::CodePoint; + + let iterator = strings::CodepointIterator::init(text); + let mut iter = strings::Cursor::default(); + while iterator.next(&mut iter) { + let width = iter.width; + match iter.c { + 0x0D => { + // From the specification: + // + // 11.8.6.1 Static Semantics: TV and TRV + // + // TV excludes the code units of LineContinuation while TRV includes + // them. <CR><LF> and <CR> LineTerminatorSequences are normalized to + // <LF> for both TV and TRV. An explicit EscapeSequence is needed to + // include a <CR> or <CR><LF> sequence. + + // Convert '\r\n' into '\n'. After `next()` returns for `\r`, + // `iter.i` is the start byte of the `\r` itself — the `\n` we're + // looking for is at `iter.i + 1`. 
+ let next_i: usize = iter.i as usize + 1; + iter.i += (next_i < text.len() && text[next_i] == b'\n') as u32; + + // Convert '\r' into '\n' + L::push_codepoint(buf, u32::from(b'\n')); + continue; + } + + 0x5C => { + if !iterator.next(&mut iter) { + return Ok(()); + } + + let c2 = iter.c; + let width2 = iter.width; + match c2 { + // https://mathiasbynens.be/notes/javascript-escapes#single + 0x62 => { + L::push_codepoint(buf, 0x08); + continue; + } + 0x66 => { + L::push_codepoint(buf, 0x0C); + continue; + } + 0x6E => { + L::push_codepoint(buf, 0x0A); + continue; + } + 0x76 => { + // Vertical tab is invalid JSON + // We're going to allow it. + L::push_codepoint(buf, 0x0B); + continue; + } + 0x74 => { + L::push_codepoint(buf, 0x09); + continue; + } + 0x72 => { + L::push_codepoint(buf, 0x0D); + continue; + } + + // legacy octal literals + 0x30..=0x37 => { + let octal_start = (iter.i as usize + width2 as usize).saturating_sub(2); + if L::IS_JSON { + *lexer.end_mut() = + (start + iter.i as usize).saturating_sub(width2 as usize); + lexer.syntax_error()?; + } + + // 1-3 digit octal + let mut is_bad = false; + let mut value: i64 = (c2 - 0x30) as i64; + let mut prev = iter; + + if !iterator.next(&mut iter) { + if value == 0 { + L::push_codepoint(buf, 0); + return Ok(()); + } + lexer.syntax_error()?; + return Ok(()); + } + + let c3: CodePoint = iter.c; + + match c3 { + 0x30..=0x37 => { + value = value * 8 + (c3 - 0x30) as i64; + prev = iter; + if !iterator.next(&mut iter) { + return lexer.syntax_error(); + } + + let c4 = iter.c; + match c4 { + 0x30..=0x37 => { + let temp = value * 8 + (c4 - 0x30) as i64; + if temp < 256 { + value = temp; + } else { + iter = prev; + } + } + 0x38 | 0x39 => { + is_bad = true; + } + _ => { + iter = prev; + } + } + } + 0x38 | 0x39 => { + is_bad = true; + } + _ => { + iter = prev; + } + } + + iter.c = i32::try_from(value).expect("int cast"); + if is_bad { + // `octal_start` is text-relative like `iter.i`; map back + // to an absolute source position 
the same way every + // sibling error path does (e.g. `start + hex_start` in + // the `\u{}` branch) — unless the lexer keeps its + // historical text-relative span. + let range_start = if L::LEGACY_ERROR_SPANS { + octal_start + } else { + start + octal_start + }; + // `add_range_error` has no failing path; `?` keeps the + // signature free of a `Debug` bound on `L::Err`. + lexer.add_range_error( + Range { + loc: Loc { + start: i32::try_from(range_start).expect("int cast"), + }, + len: i32::try_from(iter.i as usize - octal_start) + .expect("int cast"), + }, + format_args!("Invalid legacy octal literal"), + )?; + } + } + 0x38 | 0x39 => { + iter.c = c2; + } + // 2-digit hexadecimal + 0x78 => { + if REJECT_HEX_ESCAPE { + *lexer.end_mut() = + (start + iter.i as usize).saturating_sub(width2 as usize); + lexer.syntax_error()?; + } + + let mut value: CodePoint = 0; + let mut c3: CodePoint; + let mut width3: u8; + + if !iterator.next(&mut iter) { + return lexer.syntax_error(); + } + c3 = iter.c; + width3 = iter.width; + match hex_digit_value_u32(c3 as u32) { + Some(d) => value = (value * 16) | d as CodePoint, + None => { + *lexer.end_mut() = + (start + iter.i as usize).saturating_sub(width3 as usize); + return lexer.syntax_error(); + } + } + + if !iterator.next(&mut iter) { + return lexer.syntax_error(); + } + c3 = iter.c; + width3 = iter.width; + match hex_digit_value_u32(c3 as u32) { + Some(d) => value = (value * 16) | d as CodePoint, + None => { + *lexer.end_mut() = + (start + iter.i as usize).saturating_sub(width3 as usize); + return lexer.syntax_error(); + } + } + + iter.c = value; + } + 0x75 => { + // We're going to make this an i64 so we don't risk integer overflows + // when people do weird things + let mut value: i64 = 0; + + if !iterator.next(&mut iter) { + return lexer.syntax_error(); + } + let mut c3 = iter.c; + let mut width3 = iter.width; + + // variable-length + if c3 == 0x7B { + if L::IS_JSON { + *lexer.end_mut() = + (start + iter.i as 
usize).saturating_sub(width2 as usize); + lexer.syntax_error()?; + } + + // `iter.i` is the byte offset of `{` inside `text`; + // back up past `\` and `u` only. `width3` is the + // width of `{` itself, which `iter.i` already points + // at — subtracting it lands one character too early + // (kept for lexers with `LEGACY_ERROR_SPANS`). + let mut hex_start = (iter.i as usize) + .saturating_sub(width as usize) + .saturating_sub(width2 as usize); + if L::LEGACY_ERROR_SPANS { + hex_start = hex_start.saturating_sub(width3 as usize); + } + let mut is_first = true; + let mut is_out_of_range = false; + 'variable_length: loop { + if !iterator.next(&mut iter) { + break 'variable_length; + } + c3 = iter.c; + + if c3 == 0x7D { + if is_first { + *lexer.end_mut() = (start + iter.i as usize) + .saturating_sub(width3 as usize); + return lexer.syntax_error(); + } + break 'variable_length; + } + match hex_digit_value_u32(c3 as u32) { + Some(d) => value = (value * 16) | d as i64, + None => { + *lexer.end_mut() = (start + iter.i as usize) + .saturating_sub(width3 as usize); + return lexer.syntax_error(); + } + } + + // '\U0010FFFF + // copied from golang utf8.MaxRune + if value > 1_114_111 { + is_out_of_range = true; + } + is_first = false; + } + + if is_out_of_range { + lexer.add_range_error( + Range { + loc: Loc { + start: i32::try_from(start + hex_start) + .expect("int cast"), + }, + len: i32::try_from( + (iter.i as usize).saturating_sub(hex_start), + ) + .unwrap(), + }, + format_args!("Unicode escape sequence is out of range"), + )?; + + return Ok(()); + } + + // fixed-length + } else { + // Fixed-length + let mut j: usize = 0; + while j < 4 { + match hex_digit_value_u32(c3 as u32) { + Some(d) => value = (value * 16) | d as i64, + None => { + *lexer.end_mut() = (start + iter.i as usize) + .saturating_sub(width3 as usize); + return lexer.syntax_error(); + } + } + + if j < 3 { + if !iterator.next(&mut iter) { + return lexer.syntax_error(); + } + c3 = iter.c; + width3 = iter.width; 
+ } + j += 1; + } + let _ = width3; + } + + iter.c = value as CodePoint; // @truncate + } + 0x0D => { + if L::IS_JSON { + *lexer.end_mut() = + (start + iter.i as usize).saturating_sub(width2 as usize); + lexer.syntax_error()?; + } else if !ALLOW_LINE_CONTINUATIONS { + *lexer.end_mut() = + (start + iter.i as usize).saturating_sub(width2 as usize); + lexer.add_default_error(b"Unexpected end of line")?; + } + + // Make sure Windows CRLF counts as a single newline. + // Guard on the index we actually read (`iter.i + 1`), not + // `iter.i` — a string ending in `\<CR>` would otherwise + // read `text[len]`. + let next_i: usize = iter.i as usize + 1; + iter.i += (next_i < text.len() && text[next_i] == b'\n') as u32; + + // Ignore line continuations. A line continuation is not an escaped newline. + continue; + } + 0x0A | 0x2028 | 0x2029 => { + if L::IS_JSON { + *lexer.end_mut() = + (start + iter.i as usize).saturating_sub(width2 as usize); + lexer.syntax_error()?; + } else if !ALLOW_LINE_CONTINUATIONS { + *lexer.end_mut() = + (start + iter.i as usize).saturating_sub(width2 as usize); + lexer.add_default_error(b"Unexpected end of line")?; + } + + // Ignore line continuations. A line continuation is not an escaped newline. 
+ continue; + } + _ => { + if L::IS_JSON { + match c2 { + 0x22 | 0x5C | 0x2F => {} + _ => { + *lexer.end_mut() = + (start + iter.i as usize).saturating_sub(width2 as usize); + lexer.syntax_error()?; + } + } + } + iter.c = c2; + } + } + } + _ => {} + } + + match iter.c { + -1 => return lexer.add_default_error(b"Unexpected end of file"), + c => L::push_codepoint(buf, c as u32), + } + } + Ok(()) +} diff --git a/src/js_parser/lexer.rs b/src/js_parser/lexer.rs index 291cbd0ed0a..c09eab335fa 100644 --- a/src/js_parser/lexer.rs +++ b/src/js_parser/lexer.rs @@ -5,7 +5,6 @@ use core::fmt; use bun_ast as js_ast; use bun_ast::lexer_tables as tables; use bun_ast::{LexerLog, Loc, Log, Range, Source}; -use bun_core::fmt::hex_digit_value_u32; use bun_core::strings; use bun_core::strings::CodepointIterator; use bun_core::{Environment, feature_flags as FeatureFlags}; @@ -448,6 +447,41 @@ impl< } } +impl< + 'a, + const IS_JSON: bool, + const ALLOW_COMMENTS: bool, + const ALLOW_TRAILING_COMMAS: bool, + const IGNORE_LEADING_ESCAPE_SEQUENCES: bool, + const IGNORE_TRAILING_ESCAPE_SEQUENCES: bool, + const JSON_WARN_DUPLICATE_KEYS: bool, + const WAS_ORIGINALLY_MACRO: bool, + const GUESS_INDENTATION: bool, +> bun_ast::lexer_log::EscapeLexer<'a> + for LexerType< + 'a, + IS_JSON, + ALLOW_COMMENTS, + ALLOW_TRAILING_COMMAS, + IGNORE_LEADING_ESCAPE_SEQUENCES, + IGNORE_TRAILING_ESCAPE_SEQUENCES, + JSON_WARN_DUPLICATE_KEYS, + WAS_ORIGINALLY_MACRO, + GUESS_INDENTATION, + > +{ + type Buf = Vec; + const IS_JSON: bool = IS_JSON; + #[inline] + fn end_mut(&mut self) -> &mut usize { + &mut self.end + } + #[inline] + fn push_codepoint(buf: &mut Vec, c: u32) { + strings::push_codepoint_utf16(buf, c); + } +} + lexer_impl_header! { /// Reborrow the shared `Log`. The `&self` receiver lets call sites pass /// other `self.*` fields as arguments without a borrow-checker conflict; @@ -605,344 +639,7 @@ lexer_impl_header! 
{ if IS_JSON { self.is_ascii_only = false; } - - let iterator = CodepointIterator::init(text); - let mut iter = strings::Cursor::default(); - while iterator.next(&mut iter) { - let width = iter.width; - match iter.c { - 0x0D => { - // From the specification: - // - // 11.8.6.1 Static Semantics: TV and TRV - // - // TV excludes the code units of LineContinuation while TRV includes - // them. and LineTerminatorSequences are normalized to - // for both TV and TRV. An explicit EscapeSequence is needed to - // include a or sequence. - - // Convert '\r\n' into '\n' - let next_i: usize = iter.i as usize + 1; - iter.i += (next_i < text.len() && text[next_i] == b'\n') as u32; - - // Convert '\r' into '\n' - buf.push(u16::from(b'\n')); - continue; - } - - 0x5C => { - if !iterator.next(&mut iter) { - return Ok(()); - } - - let c2 = iter.c; - let width2 = iter.width; - match c2 { - // https://mathiasbynens.be/notes/javascript-escapes#single - 0x62 => { - buf.push(0x08); - continue; - } - 0x66 => { - buf.push(0x0C); - continue; - } - 0x6E => { - buf.push(0x0A); - continue; - } - 0x76 => { - // Vertical tab is invalid JSON - // We're going to allow it. 
- buf.push(0x0B); - continue; - } - 0x74 => { - buf.push(0x09); - continue; - } - 0x72 => { - buf.push(0x0D); - continue; - } - - // legacy octal literals - 0x30..=0x37 => { - let octal_start = - (iter.i as usize + width2 as usize).saturating_sub(2); - if IS_JSON { - self.end = (start + iter.i as usize) - .saturating_sub(width2 as usize); - self.syntax_error()?; - } - - // 1-3 digit octal - let mut is_bad = false; - let mut value: i64 = (c2 - 0x30) as i64; - let mut prev = iter; - - if !iterator.next(&mut iter) { - if value == 0 { - buf.push(0); - return Ok(()); - } - self.syntax_error()?; - return Ok(()); - } - - let c3: CodePoint = iter.c; - - match c3 { - 0x30..=0x37 => { - value = value * 8 + (c3 - 0x30) as i64; - prev = iter; - if !iterator.next(&mut iter) { - return self.syntax_error(); - } - - let c4 = iter.c; - match c4 { - 0x30..=0x37 => { - let temp = - value * 8 + (c4 - 0x30) as i64; - if temp < 256 { - value = temp; - } else { - iter = prev; - } - } - 0x38 | 0x39 => { - is_bad = true; - } - _ => { - iter = prev; - } - } - } - 0x38 | 0x39 => { - is_bad = true; - } - _ => { - iter = prev; - } - } - - iter.c = i32::try_from(value).expect("int cast"); - if is_bad { - // `octal_start` is text-relative like `iter.i`; - // map back to absolute source position the same - // way every sibling error path does (e.g. - // `start + hex_start` in the `\u{}` branch). 
- self.add_range_error( - Range { - loc: Loc { - start: i32::try_from(start + octal_start).expect("int cast"), - }, - len: i32::try_from( - iter.i as usize - octal_start, - ) - .unwrap(), - }, - format_args!("Invalid legacy octal literal"), - ) - .expect("unreachable"); - } - } - 0x38 | 0x39 => { - iter.c = c2; - } - // 2-digit hexadecimal - 0x78 => { - let mut value: CodePoint = 0; - let mut c3: CodePoint; - let mut width3: u8; - - if !iterator.next(&mut iter) { - return self.syntax_error(); - } - c3 = iter.c; - width3 = iter.width; - match hex_digit_value_u32(c3 as u32) { - Some(d) => value = (value * 16) | d as CodePoint, - None => { - self.end = (start + iter.i as usize) - .saturating_sub(width3 as usize); - return self.syntax_error(); - } - } - - if !iterator.next(&mut iter) { - return self.syntax_error(); - } - c3 = iter.c; - width3 = iter.width; - match hex_digit_value_u32(c3 as u32) { - Some(d) => value = (value * 16) | d as CodePoint, - None => { - self.end = (start + iter.i as usize) - .saturating_sub(width3 as usize); - return self.syntax_error(); - } - } - - iter.c = value; - } - 0x75 => { - // We're going to make this an i64 so we don't risk integer overflows - // when people do weird things - let mut value: i64 = 0; - - if !iterator.next(&mut iter) { - return self.syntax_error(); - } - let mut c3 = iter.c; - let mut width3 = iter.width; - - // variable-length - if c3 == 0x7B { - if IS_JSON { - self.end = (start + iter.i as usize) - .saturating_sub(width2 as usize); - self.syntax_error()?; - } - - // `iter.i` is the byte offset of `{` inside `text`; - // back up past `\` and `u` only. `width3` is the - // width of `{` itself, which `iter.i` already points - // at — subtracting it lands one character too early. 
- let hex_start = (iter.i as usize) - .saturating_sub(width as usize) - .saturating_sub(width2 as usize); - let mut is_first = true; - let mut is_out_of_range = false; - 'variable_length: loop { - if !iterator.next(&mut iter) { - break 'variable_length; - } - c3 = iter.c; - - if c3 == 0x7D { - if is_first { - self.end = (start + iter.i as usize) - .saturating_sub(width3 as usize); - return self.syntax_error(); - } - break 'variable_length; - } - match hex_digit_value_u32(c3 as u32) { - Some(d) => value = (value * 16) | d as i64, - None => { - self.end = (start + iter.i as usize) - .saturating_sub(width3 as usize); - return self.syntax_error(); - } - } - - // '\U0010FFFF - // copied from golang utf8.MaxRune - if value > 1_114_111 { - is_out_of_range = true; - } - is_first = false; - } - - if is_out_of_range { - self.add_range_error( - Range { - loc: Loc { - start: i32::try_from(start + hex_start) - .unwrap(), - }, - len: i32::try_from( - (iter.i as usize).saturating_sub(hex_start), - ) - .unwrap(), - }, - format_args!( - "Unicode escape sequence is out of range" - ), - )?; - - return Ok(()); - } - - // fixed-length - } else { - // Fixed-length - let mut j: usize = 0; - while j < 4 { - match hex_digit_value_u32(c3 as u32) { - Some(d) => value = (value * 16) | d as i64, - None => { - self.end = (start + iter.i as usize) - .saturating_sub(width3 as usize); - return self.syntax_error(); - } - } - - if j < 3 { - if !iterator.next(&mut iter) { - return self.syntax_error(); - } - c3 = iter.c; - width3 = iter.width; - } - j += 1; - } - let _ = width3; - } - - iter.c = value as CodePoint; // @truncate - } - 0x0D => { - if IS_JSON { - self.end = (start + iter.i as usize) - .saturating_sub(width2 as usize); - self.syntax_error()?; - } - - // Make sure Windows CRLF counts as a single newline - let next_i: usize = iter.i as usize + 1; - iter.i += - (next_i < text.len() && text[next_i] == b'\n') as u32; - - // Ignore line continuations. 
A line continuation is not an escaped newline. - continue; - } - 0x0A | 0x2028 | 0x2029 => { - if IS_JSON { - self.end = (start + iter.i as usize) - .saturating_sub(width2 as usize); - self.syntax_error()?; - } - - // Ignore line continuations. A line continuation is not an escaped newline. - continue; - } - _ => { - if IS_JSON { - match c2 { - 0x22 | 0x5C | 0x2F => {} - _ => { - self.end = (start + iter.i as usize) - .saturating_sub(width2 as usize); - self.syntax_error()?; - } - } - } - iter.c = c2; - } - } - } - _ => {} - } - - match iter.c { - -1 => return self.add_default_error(b"Unexpected end of file"), - c => strings::push_codepoint_utf16(buf, c as u32), - } - } - Ok(()) + bun_ast::lexer_log::decode_escape_sequences::<_, true, false>(self, start, text, buf) } // PERF: heavy sub-scanner — the per-byte string body loop plus the diff --git a/src/js_parser/parse/parse_entry.rs b/src/js_parser/parse/parse_entry.rs index f72795f5d20..b276dd1dd81 100644 --- a/src/js_parser/parse/parse_entry.rs +++ b/src/js_parser/parse/parse_entry.rs @@ -56,6 +56,37 @@ macro_rules! init_p { }}; } +/// `init_p!` plus the shared `&mut self` prologue of `_scan_imports`, +/// `to_lazy_export_ast`, and `analyze`: `Lexer` owns `Vec`s and `Options` +/// owns `jsx: Pragma` boxes, so a bitwise `ptr::read` would double-free when +/// `self` later drops. Move them out, leaving inert placeholders, build the +/// parser in place, and bind `$p` to it. +/// +/// The inert placeholder lexer is given its *own* arena-allocated `Log` +/// (empty `Vec`, arena-leaked) so it does not alias `self.log` at all — +/// keeps the placeholder fully disjoint from the real `Log` handed to `P` +/// and never read again. +macro_rules! 
take_and_init_p { + (let $p:ident: $ty:ty = $self:ident) => { + let lexer = core::mem::replace( + &mut $self.lexer, + js_lexer::Lexer::init_without_reading( + $self.bump.alloc(bun_ast::Log::default()), + $self.source, + $self.bump, + ), + ); + let options = core::mem::take(&mut $self.options); + // `P.log` and `Lexer.log` are both `NonNull` (see P.rs / lexer.rs + // field docs), so handing the same raw pointer to both is defined — + // no `&mut` is materialized. + let mut __p = init_p!($ty; + $self.bump, $self.log, $self.source, $self.define, lexer, options); + // SAFETY: `init_p!` only yields after `init` succeeded. + let $p: &mut $ty = unsafe { __p.assume_init_mut() }; + }; +} + pub struct Parser<'a> { pub options: Options<'a>, pub lexer: js_lexer::Lexer<'a>, @@ -387,31 +418,7 @@ impl<'a> Parser<'a> { scan_pass: &'a mut ScanPassResult, ) -> Result<(), Error> { type Pi<'a, const TS: bool> = P<'a, TS, true>; - // `Lexer` owns `Vec`s and `Options` owns - // `jsx: Pragma` boxes, so a bitwise `ptr::read` would double-free - // when `self` later drops. Move them out, leaving inert placeholders. - // - // The inert placeholder lexer is given its *own* arena-allocated `Log` - // so it does not alias `self.log` at all — keeps the placeholder fully - // disjoint from the real `Log` handed to `P` and never read again. - let lexer = core::mem::replace( - &mut self.lexer, - js_lexer::Lexer::init_without_reading( - // Disjoint dummy `Log` (empty `Vec`, arena-leaked); the - // placeholder is never read after this point. - self.bump.alloc(bun_ast::Log::default()), - self.source, - self.bump, - ), - ); - let options = core::mem::take(&mut self.options); - // `P.log` and `Lexer.log` are both `NonNull` (see P.rs / lexer.rs - // field docs), so handing the same raw pointer to both is defined — - // no `&mut` is materialized. - let mut __p = init_p!(Pi<'_, TS>; - self.bump, self.log, self.source, self.define, lexer, options); - // SAFETY: `init_p!` only yields after `init` succeeded. 
- let p: &mut Pi<'_, TS> = unsafe { __p.assume_init_mut() }; + take_and_init_p!(let p: Pi<'_, TS> = self); p.import_records = crate::p::ImportRecordList::Borrowed(&mut scan_pass.import_records); p.named_imports = crate::p::NamedImportsType::Borrowed(&mut scan_pass.named_imports); @@ -531,29 +538,7 @@ impl<'a> Parser<'a> { runtime_api_call: &'static [u8], symbols: js_ast::symbol::List<'a>, ) -> Result, Error> { - // Move lexer/options out and leave inert - // placeholders so `self` may drop without double-free. - // - // The placeholder lexer gets its own arena `Log` so it does not alias - // `self.log` (see `_scan_imports`). - let lexer = core::mem::replace( - &mut self.lexer, - js_lexer::Lexer::init_without_reading( - // Disjoint dummy `Log` (empty `Vec`, arena-leaked); the - // placeholder is never read after this point. - self.bump.alloc(bun_ast::Log::default()), - self.source, - self.bump, - ), - ); - let options = core::mem::take(&mut self.options); - // `P.log` and `Lexer.log` are both `NonNull` (see P.rs / lexer.rs - // field docs), so handing the same raw pointer to both is defined — - // no `&mut` is materialized. - let mut __p = init_p!(JavaScriptParser<'_>; - self.bump, self.log, self.source, self.define, lexer, options); - // SAFETY: `init_p!` only yields after `init` succeeded. - let p: &mut JavaScriptParser<'_> = unsafe { __p.assume_init_mut() }; + take_and_init_p!(let p: JavaScriptParser<'_> = self); // Instead of doing "should_fold_typescript_constant_expressions or features.minify_syntax" // Let's enable this flag file-wide @@ -622,29 +607,7 @@ impl<'a> Parser<'a> { context: *mut c_void, callback: &dyn Fn(*mut c_void, &mut TSXParser, &mut [js_ast::Part]) -> Result<(), Error>, ) -> Result<(), Error> { - // See `_scan_imports`: move lexer/options out, leaving inert - // placeholders so `self` may drop without double-free. - // - // The placeholder lexer gets its own arena `Log` so it does not alias - // `self.log` (see `_scan_imports`). 
- let lexer = core::mem::replace( - &mut self.lexer, - js_lexer::Lexer::init_without_reading( - // Disjoint dummy `Log` (empty `Vec`, arena-leaked); the - // placeholder is never read after this point. - self.bump.alloc(bun_ast::Log::default()), - self.source, - self.bump, - ), - ); - let options = core::mem::take(&mut self.options); - // `P.log` and `Lexer.log` are both `NonNull` (see P.rs / lexer.rs - // field docs), so handing the same raw pointer to both is defined — - // no `&mut` is materialized. - let mut __p = init_p!(TSXParser<'_>; - self.bump, self.log, self.source, self.define, lexer, options); - // SAFETY: `init_p!` only yields after `init` succeeded. - let p: &mut TSXParser<'_> = unsafe { __p.assume_init_mut() }; + take_and_init_p!(let p: TSXParser<'_> = self); // Consume a leading hashbang comment let mut hashbang: &[u8] = b""; diff --git a/src/parsers/json.rs b/src/parsers/json.rs index 7771dd9126a..d8d6c7060ef 100644 --- a/src/parsers/json.rs +++ b/src/parsers/json.rs @@ -811,6 +811,39 @@ fn empty_array_data() -> js_ast::expr::Data { js_ast::expr::Data::EArray(js_ast::StoreRef::from_raw(EMPTY_ARRAY.get())) } +/// Shared fast-path prologue for every JSON entry point: empty input parses +/// as an empty object (consistent with how disabled JS files are handled), +/// and two-byte `""`/`''`/`{}`/`[]` inputs are meant to skip the lexer entirely. +/// +/// Note: the two-byte arms compare a one-byte slice (`contents[0..1]`) +/// against two-byte literals, so they never match. This mirrors the Zig +/// reference (`json.zig` does the same with `eqlComptime`) — kept as-is to +/// preserve behavior, since "fixing" it would accept `''` in strict JSON. 
+#[inline] +fn empty_source_fast_path(source: &bun_ast::Source) -> Option { + let expr = |data| { + Some(Expr { + loc: bun_ast::Loc { start: 0 }, + data, + }) + }; + match source.contents.len() { + 0 => expr(empty_object_data()), + 2 => { + if &source.contents[0..1] == b"\"\"" || &source.contents[0..1] == b"''" { + expr(empty_string_data()) + } else if &source.contents[0..1] == b"{}" { + expr(empty_object_data()) + } else if &source.contents[0..1] == b"[]" { + expr(empty_array_data()) + } else { + None + } + } + _ => None, + } +} + // ────────────────────────────────────────────────────────────────────────── /// Parse JSON @@ -827,37 +860,12 @@ pub fn parse( log: &mut bun_ast::Log, bump: &Bump, ) -> Result { - let mut parser = JSONLikeParser::init(JSON_OPTS, bump, source, log)?; - match source.contents.len() { - // This is to be consisntent with how disabled JS files are handled - 0 => { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } - // This is a fast pass I guess - 2 => { - if &source.contents[0..1] == b"\"\"" || &source.contents[0..1] == b"''" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_string_data(), - }); - } else if &source.contents[0..1] == b"{}" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } else if &source.contents[0..1] == b"[]" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_array_data(), - }); - } - } - _ => {} + if let Some(expr) = empty_source_fast_path(source) { + return Ok(expr); } + let mut parser = JSONLikeParser::init(JSON_OPTS, bump, source, log)?; + parser.parse_expr(false, FORCE_UTF8) } @@ -871,36 +879,8 @@ pub fn parse_package_json_utf8( log: &mut bun_ast::Log, bump: &Bump, ) -> Result { - let len = source.contents.len(); - - match len { - // This is to be consisntent with how disabled JS files are handled - 0 => { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } 
- // This is a fast pass I guess - 2 => { - if &source.contents[0..1] == b"\"\"" || &source.contents[0..1] == b"''" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_string_data(), - }); - } else if &source.contents[0..1] == b"{}" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } else if &source.contents[0..1] == b"[]" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_array_data(), - }); - } - } - _ => {} + if let Some(expr) = empty_source_fast_path(source) { + return Ok(expr); } let mut parser = JSONLikeParser::init(PACKAGE_JSON_OPTS, bump, source, log)?; @@ -961,48 +941,11 @@ pub fn parse_package_json_utf8_with_opts_rt( log: &mut bun_ast::Log, bump: &Bump, ) -> Result { - let len = source.contents.len(); - - match len { - // This is to be consisntent with how disabled JS files are handled - 0 => { - return Ok(JsonResult { - root: Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }, - indentation: Indentation::default(), - }); - } - // This is a fast pass I guess - 2 => { - if &source.contents[0..1] == b"\"\"" || &source.contents[0..1] == b"''" { - return Ok(JsonResult { - root: Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_string_data(), - }, - indentation: Indentation::default(), - }); - } else if &source.contents[0..1] == b"{}" { - return Ok(JsonResult { - root: Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }, - indentation: Indentation::default(), - }); - } else if &source.contents[0..1] == b"[]" { - return Ok(JsonResult { - root: Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_array_data(), - }, - indentation: Indentation::default(), - }); - } - } - _ => {} + if let Some(root) = empty_source_fast_path(source) { + return Ok(JsonResult { + root, + indentation: Indentation::default(), + }); } let mut parser = JSONLikeParser::init(opts, bump, source, log)?; @@ -1039,36 +982,8 @@ pub fn parse_utf8_impl( log: &mut 
bun_ast::Log, bump: &Bump, ) -> Result { - let len = source.contents.len(); - - match len { - // This is to be consisntent with how disabled JS files are handled - 0 => { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } - // This is a fast pass I guess - 2 => { - if &source.contents[0..1] == b"\"\"" || &source.contents[0..1] == b"''" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_string_data(), - }); - } else if &source.contents[0..1] == b"{}" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } else if &source.contents[0..1] == b"[]" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_array_data(), - }); - } - } - _ => {} + if let Some(expr) = empty_source_fast_path(source) { + return Ok(expr); } let mut parser = JSONLikeParser::init(JSON_OPTS, bump, source, log)?; @@ -1090,34 +1005,8 @@ pub fn parse_for_macro( log: &mut bun_ast::Log, bump: &Bump, ) -> Result { - match source.contents.len() { - // This is to be consisntent with how disabled JS files are handled - 0 => { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } - // This is a fast pass I guess - 2 => { - if &source.contents[0..1] == b"\"\"" || &source.contents[0..1] == b"''" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_string_data(), - }); - } else if &source.contents[0..1] == b"{}" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } else if &source.contents[0..1] == b"[]" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_array_data(), - }); - } - } - _ => {} + if let Some(expr) = empty_source_fast_path(source) { + return Ok(expr); } let mut parser = JSONLikeParser::init(MACRO_JSON_OPTS, bump, source, log)?; @@ -1143,46 +1032,15 @@ pub fn parse_for_bundling( log: &mut bun_ast::Log, bump: &Bump, ) -> Result { - match source.contents.len() { - // This is to be 
consisntent with how disabled JS files are handled - 0 => { - return Ok(JSONParseResult { - expr: Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }, - tag: JSONParseResultTag::Empty, - }); - } - // This is a fast pass I guess - 2 => { - if &source.contents[0..1] == b"\"\"" || &source.contents[0..1] == b"''" { - return Ok(JSONParseResult { - expr: Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_string_data(), - }, - tag: JSONParseResultTag::Expr, - }); - } else if &source.contents[0..1] == b"{}" { - return Ok(JSONParseResult { - expr: Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }, - tag: JSONParseResultTag::Expr, - }); - } else if &source.contents[0..1] == b"[]" { - return Ok(JSONParseResult { - expr: Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_array_data(), - }, - tag: JSONParseResultTag::Expr, - }); - } - } - _ => {} + if let Some(expr) = empty_source_fast_path(source) { + return Ok(JSONParseResult { + expr, + tag: if source.contents.is_empty() { + JSONParseResultTag::Empty + } else { + JSONParseResultTag::Expr + }, + }); } let mut parser = JSONLikeParser::init(JSON_OPTS, bump, source, log)?; @@ -1204,34 +1062,8 @@ pub fn parse_env_json( log: &mut bun_ast::Log, bump: &Bump, ) -> Result { - match source.contents.len() { - // This is to be consisntent with how disabled JS files are handled - 0 => { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } - // This is a fast pass I guess - 2 => { - if &source.contents[0..1] == b"\"\"" || &source.contents[0..1] == b"''" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_string_data(), - }); - } else if &source.contents[0..1] == b"{}" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } else if &source.contents[0..1] == b"[]" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_array_data(), - }); - } - } - _ => {} + if let Some(expr) = 
empty_source_fast_path(source) { + return Ok(expr); } let mut parser = JSONLikeParser::init(DOTENV_JSON_OPTS, bump, source, log)?; @@ -1272,34 +1104,8 @@ pub fn parse_ts_config( log: &mut bun_ast::Log, bump: &Bump, ) -> Result { - match source.contents.len() { - // This is to be consisntent with how disabled JS files are handled - 0 => { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } - // This is a fast pass I guess - 2 => { - if &source.contents[0..1] == b"\"\"" || &source.contents[0..1] == b"''" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_string_data(), - }); - } else if &source.contents[0..1] == b"{}" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_object_data(), - }); - } else if &source.contents[0..1] == b"[]" { - return Ok(Expr { - loc: bun_ast::Loc { start: 0 }, - data: empty_array_data(), - }); - } - } - _ => {} + if let Some(expr) = empty_source_fast_path(source) { + return Ok(expr); } let mut parser = JSONLikeParser::init(TSCONFIG_OPTS, bump, source, log)?; diff --git a/src/parsers/json_lexer.rs b/src/parsers/json_lexer.rs index 1b5954bd403..5f53dee8e90 100644 --- a/src/parsers/json_lexer.rs +++ b/src/parsers/json_lexer.rs @@ -222,6 +222,28 @@ impl<'a, 'bump> LexerLog<'a> for Lexer<'a, 'bump> { } } +impl<'a, 'bump> crate::number_scan::DecimalLexer<'a> for Lexer<'a, 'bump> +where + 'bump: 'a, +{ + #[inline] + fn code_point(&self) -> CodePoint { + self.code_point + } + #[inline] + fn end(&self) -> usize { + self.end + } + #[inline] + fn end_mut(&mut self) -> &mut usize { + &mut self.end + } + #[inline] + fn step(&mut self) { + Lexer::step(self) + } +} + impl<'a, 'bump> Lexer<'a, 'bump> where // `identifier` may point into `source.contents` (`'a`) *or* a bump-alloc'd @@ -697,7 +719,6 @@ where return self.syntax_error(); } - let mut underscore_count: usize = 0; let mut last_underscore_end: usize = 0; let mut has_dot_or_exponent = first == '.' 
as CodePoint; let mut base: f64 = 0.0; @@ -818,87 +839,10 @@ where } } else { // Floating-point literal; - let is_invalid_legacy_octal_literal = first == '0' as CodePoint - && (self.code_point == '8' as CodePoint || self.code_point == '9' as CodePoint); - - // Initial digits; - loop { - if self.code_point < '0' as CodePoint || self.code_point > '9' as CodePoint { - if self.code_point != '_' as CodePoint { - break; - } - // Cannot have multiple underscores in a row; - if last_underscore_end > 0 && self.end == last_underscore_end + 1 { - self.syntax_error()?; - } - // The specification forbids underscores in this case; - if is_invalid_legacy_octal_literal { - self.syntax_error()?; - } - last_underscore_end = self.end; - underscore_count += 1; - } - self.step(); - } - - // Fractional digits; - if first != '.' as CodePoint && self.code_point == '.' as CodePoint { - // An underscore must not come last; - if last_underscore_end > 0 && self.end == last_underscore_end + 1 { - self.end -= 1; - self.syntax_error()?; - } - has_dot_or_exponent = true; - self.step(); - if self.code_point == '_' as CodePoint { - self.syntax_error()?; - } - loop { - if self.code_point < '0' as CodePoint || self.code_point > '9' as CodePoint { - if self.code_point != '_' as CodePoint { - break; - } - // Cannot have multiple underscores in a row; - if last_underscore_end > 0 && self.end == last_underscore_end + 1 { - self.syntax_error()?; - } - last_underscore_end = self.end; - underscore_count += 1; - } - self.step(); - } - } - - // Exponent; - if self.code_point == 'e' as CodePoint || self.code_point == 'E' as CodePoint { - // An underscore must not come last; - if last_underscore_end > 0 && self.end == last_underscore_end + 1 { - self.end -= 1; - self.syntax_error()?; - } - has_dot_or_exponent = true; - self.step(); - if self.code_point == '+' as CodePoint || self.code_point == '-' as CodePoint { - self.step(); - } - if self.code_point < '0' as CodePoint || self.code_point > '9' as CodePoint { - 
self.syntax_error()?; - } - loop { - if self.code_point < '0' as CodePoint || self.code_point > '9' as CodePoint { - if self.code_point != '_' as CodePoint { - break; - } - // Cannot have multiple underscores in a row; - if last_underscore_end > 0 && self.end == last_underscore_end + 1 { - self.syntax_error()?; - } - last_underscore_end = self.end; - underscore_count += 1; - } - self.step(); - } - } + let scan = crate::number_scan::scan_decimal_digits(self, first)?; + let underscore_count = scan.underscore_count; + last_underscore_end = scan.last_underscore_end; + has_dot_or_exponent = scan.has_dot_or_exponent; // Take a slice of the text to parse; let text = self.raw(); diff --git a/src/parsers/lib.rs b/src/parsers/lib.rs index e5fb1f707a7..997bff8c022 100644 --- a/src/parsers/lib.rs +++ b/src/parsers/lib.rs @@ -8,6 +8,10 @@ // Crate-private: implementation detail of `json.rs`; no external consumers. mod json_lexer; +// ───── number_scan ──────────────────────────────────────────────────────── +// Decimal number-literal digit scanner shared by the json and toml lexers. +mod number_scan; + // ───── json ─────────────────────────────────────────────────────────────── // Real port — wired against `crate::json_lexer` (the cycle-break above) and // `bun_ast::js_ast`; resolves against the local lexer so `bun_js_parser` diff --git a/src/parsers/number_scan.rs b/src/parsers/number_scan.rs new file mode 100644 index 00000000000..1b900d50a01 --- /dev/null +++ b/src/parsers/number_scan.rs @@ -0,0 +1,128 @@ +//! Shared decimal number-literal digit scanner. +//! +//! The json and toml lexers each carried an identical ~80-line scan of a +//! decimal literal's digits — underscore-separator rules, optional fraction, +//! optional exponent, and the invalid-legacy-octal underscore check (see the +//! matching regions in `js_parser/lexer.zig` and `parsers/toml/lexer.zig`). +//! This generic helper collapses both; it monomorphizes per lexer type, so +//! 
codegen matches the previous inline copies. + +use bun_ast::LexerLog; +use bun_core::strings::CodePoint; + +/// Cursor surface `scan_decimal_digits` needs from a lexer. +pub(crate) trait DecimalLexer<'s>: LexerLog<'s> { + fn code_point(&self) -> CodePoint; + fn end(&self) -> usize; + fn end_mut(&mut self) -> &mut usize; + fn step(&mut self); +} + +pub(crate) struct DecimalScan { + pub underscore_count: usize, + pub last_underscore_end: usize, + pub has_dot_or_exponent: bool, +} + +/// Scans the digits of a decimal (non-radix-prefixed) number literal: +/// initial digits, then an optional fraction and exponent. The caller has +/// already consumed `first` (the literal's first code point); on return the +/// cursor sits on the first code point past the literal and the caller +/// parses `lexer.raw()` into a value. +#[inline] +pub(crate) fn scan_decimal_digits<'s, L: DecimalLexer<'s>>( + lexer: &mut L, + first: CodePoint, +) -> Result { + let mut underscore_count: usize = 0; + let mut last_underscore_end: usize = 0; + let mut has_dot_or_exponent = first == '.' as CodePoint; + + let is_invalid_legacy_octal_literal = first == '0' as CodePoint + && (lexer.code_point() == '8' as CodePoint || lexer.code_point() == '9' as CodePoint); + + // Initial digits; + loop { + if lexer.code_point() < '0' as CodePoint || lexer.code_point() > '9' as CodePoint { + if lexer.code_point() != '_' as CodePoint { + break; + } + // Cannot have multiple underscores in a row; + if last_underscore_end > 0 && lexer.end() == last_underscore_end + 1 { + lexer.syntax_error()?; + } + // The specification forbids underscores in this case; + if is_invalid_legacy_octal_literal { + lexer.syntax_error()?; + } + last_underscore_end = lexer.end(); + underscore_count += 1; + } + lexer.step(); + } + + // Fractional digits; + if first != '.' as CodePoint && lexer.code_point() == '.' 
as CodePoint { + // An underscore must not come last; + if last_underscore_end > 0 && lexer.end() == last_underscore_end + 1 { + *lexer.end_mut() -= 1; + lexer.syntax_error()?; + } + has_dot_or_exponent = true; + lexer.step(); + if lexer.code_point() == '_' as CodePoint { + lexer.syntax_error()?; + } + loop { + if lexer.code_point() < '0' as CodePoint || lexer.code_point() > '9' as CodePoint { + if lexer.code_point() != '_' as CodePoint { + break; + } + // Cannot have multiple underscores in a row; + if last_underscore_end > 0 && lexer.end() == last_underscore_end + 1 { + lexer.syntax_error()?; + } + last_underscore_end = lexer.end(); + underscore_count += 1; + } + lexer.step(); + } + } + + // Exponent; + if lexer.code_point() == 'e' as CodePoint || lexer.code_point() == 'E' as CodePoint { + // An underscore must not come last; + if last_underscore_end > 0 && lexer.end() == last_underscore_end + 1 { + *lexer.end_mut() -= 1; + lexer.syntax_error()?; + } + has_dot_or_exponent = true; + lexer.step(); + if lexer.code_point() == '+' as CodePoint || lexer.code_point() == '-' as CodePoint { + lexer.step(); + } + if lexer.code_point() < '0' as CodePoint || lexer.code_point() > '9' as CodePoint { + lexer.syntax_error()?; + } + loop { + if lexer.code_point() < '0' as CodePoint || lexer.code_point() > '9' as CodePoint { + if lexer.code_point() != '_' as CodePoint { + break; + } + // Cannot have multiple underscores in a row; + if last_underscore_end > 0 && lexer.end() == last_underscore_end + 1 { + lexer.syntax_error()?; + } + last_underscore_end = lexer.end(); + underscore_count += 1; + } + lexer.step(); + } + } + + Ok(DecimalScan { + underscore_count, + last_underscore_end, + has_dot_or_exponent, + }) +} diff --git a/src/parsers/toml/lexer.rs b/src/parsers/toml/lexer.rs index cfe8c61a5ca..9c043b7495f 100644 --- a/src/parsers/toml/lexer.rs +++ b/src/parsers/toml/lexer.rs @@ -2,7 +2,6 @@ use bun_alloc::Arena; // bumpalo::Bump re-export use bun_alloc::ArenaVecExt as _; use 
bun_ast as js_ast; use bun_ast::LexerLog; -use bun_core::fmt::hex_digit_value_u32; use bun_core::strings; use bun_core::strings::CodePoint; @@ -129,6 +128,44 @@ impl<'a> LexerLog<'a> for Lexer<'a> { } } +impl<'a> bun_ast::lexer_log::EscapeLexer<'a> for Lexer<'a> { + type Buf = bun_alloc::ArenaVec<'a, u8>; + const LEGACY_ERROR_SPANS: bool = true; + #[inline] + fn end_mut(&mut self) -> &mut usize { + &mut self.end + } + #[inline] + fn push_codepoint(buf: &mut Self::Buf, c: u32) { + if c <= 127 { + buf.push(c as u8); + } else { + let mut part: [u8; 4] = [0; 4]; + let len = strings::encode_wtf8_rune(&mut part, c); + buf.extend_from_slice(&part[0..len]); + } + } +} + +impl<'a> crate::number_scan::DecimalLexer<'a> for Lexer<'a> { + #[inline] + fn code_point(&self) -> CodePoint { + self.code_point + } + #[inline] + fn end(&self) -> usize { + self.end + } + #[inline] + fn end_mut(&mut self) -> &mut usize { + &mut self.end + } + #[inline] + fn step(&mut self) { + Lexer::step(self) + } +} + impl<'a> Lexer<'a> { #[inline] pub fn loc(&self) -> bun_ast::Loc { @@ -318,115 +355,9 @@ impl<'a> Lexer<'a> { } } else { // Floating-point literal; - let is_invalid_legacy_octal_literal = first == '0' as CodePoint - && (self.code_point == '8' as CodePoint || self.code_point == '9' as CodePoint); - - // Initial digits; - loop { - if self.code_point < '0' as CodePoint || self.code_point > '9' as CodePoint { - match self.code_point { - // '-' => { - // if (lexer.raw().len == 5) { - // // Is this possibly a datetime literal that begins with a 4 digit year? 
- // lexer.step(); - // while (!lexer.has_newline_before) { - // switch (lexer.code_point) { - // ',' => { - // lexer.string_literal_slice = lexer.raw(); - // lexer.token = T.t_string_literal; - // break; - // }, - // } - // } - // } - // }, - c if c == '_' as CodePoint => {} - _ => break, - } - if self.code_point != '_' as CodePoint { - break; - } - - // Cannot have multiple underscores in a row; - if last_underscore_end > 0 && self.end == last_underscore_end + 1 { - self.syntax_error()?; - } - - // The specification forbids underscores in this case; - if is_invalid_legacy_octal_literal { - self.syntax_error()?; - } - - last_underscore_end = self.end; - underscore_count += 1; - } - self.step(); - } - - // Fractional digits; - if first != '.' as CodePoint && self.code_point == '.' as CodePoint { - // An underscore must not come last; - if last_underscore_end > 0 && self.end == last_underscore_end + 1 { - self.end -= 1; - self.syntax_error()?; - } - - has_dot_or_exponent = true; - self.step(); - if self.code_point == '_' as CodePoint { - self.syntax_error()?; - } - loop { - if self.code_point < '0' as CodePoint || self.code_point > '9' as CodePoint { - if self.code_point != '_' as CodePoint { - break; - } - - // Cannot have multiple underscores in a row; - if last_underscore_end > 0 && self.end == last_underscore_end + 1 { - self.syntax_error()?; - } - - last_underscore_end = self.end; - underscore_count += 1; - } - self.step(); - } - } - - // Exponent; - if self.code_point == 'e' as CodePoint || self.code_point == 'E' as CodePoint { - // An underscore must not come last; - if last_underscore_end > 0 && self.end == last_underscore_end + 1 { - self.end -= 1; - self.syntax_error()?; - } - - has_dot_or_exponent = true; - self.step(); - if self.code_point == '+' as CodePoint || self.code_point == '-' as CodePoint { - self.step(); - } - if self.code_point < '0' as CodePoint || self.code_point > '9' as CodePoint { - self.syntax_error()?; - } - loop { - if self.code_point 
< '0' as CodePoint || self.code_point > '9' as CodePoint { - if self.code_point != '_' as CodePoint { - break; - } - - // Cannot have multiple underscores in a row; - if last_underscore_end > 0 && self.end == last_underscore_end + 1 { - self.syntax_error()?; - } - - last_underscore_end = self.end; - underscore_count += 1; - } - self.step(); - } - } + let scan = crate::number_scan::scan_decimal_digits(self, first)?; + underscore_count = scan.underscore_count; + has_dot_or_exponent = scan.has_dot_or_exponent; // Take a slice of the text to parse; let mut text: &[u8] = self.raw(); @@ -854,327 +785,11 @@ impl<'a> Lexer<'a> { text: &[u8], buf: &mut bun_alloc::ArenaVec<'a, u8>, ) -> Result<(), Error> { - let iterator = strings::CodepointIterator::init(text); - let mut iter = strings::Cursor::default(); - while iterator.next(&mut iter) { - let width = iter.width; - match iter.c { - c if c == '\r' as CodePoint => { - // Convert '\r\n' into '\n'. After `next()` returns for `\r`, - // `iter.i` is the start byte of the `\r` itself — the `\n` - // we're looking for is at `iter.i + 1`. Reading `text[iter.i]` - // would always be `\r`, so the check never fired and a literal - // CRLF in a slow-path multiline basic string decoded to two LFs. - // Match the JS lexer (js_parser/lexer.rs:660-661). 
- let next_i: usize = iter.i as usize + 1; - if next_i < text.len() && text[next_i] == b'\n' { - iter.i += 1; - } - - // Convert '\r' into '\n' - buf.push(b'\n'); - continue; - } - - c if c == '\\' as CodePoint => { - if !iterator.next(&mut iter) { - return Ok(()); - } - - let c2 = iter.c; - - let width2 = iter.width; - match c2 { - // https://mathiasbynens.be/notes/javascript-escapes#single - c if c == 'b' as CodePoint => { - buf.push(8); - continue; - } - c if c == 'f' as CodePoint => { - // Form feed: U+000C - buf.push(12); - continue; - } - c if c == 'n' as CodePoint => { - buf.push(10); - continue; - } - c if c == 'v' as CodePoint => { - // Vertical tab is invalid JSON - // We're going to allow it. - buf.push(11); - continue; - } - c if c == 't' as CodePoint => { - // Horizontal tab: U+0009 - buf.push(9); - continue; - } - c if c == 'r' as CodePoint => { - buf.push(13); - continue; - } - - // legacy octal literals - c if ('0' as CodePoint..='7' as CodePoint).contains(&c) => { - let octal_start = (iter.i as usize + width2 as usize).saturating_sub(2); - - // 1-3 digit octal - let mut is_bad = false; - let mut value: i64 = (c2 - '0' as CodePoint) as i64; - let mut restore = iter; - - if !iterator.next(&mut iter) { - if value == 0 { - buf.push(0); - return Ok(()); - } - - self.syntax_error()?; - return Ok(()); - } - - let c3: CodePoint = iter.c; - - match c3 { - c if ('0' as CodePoint..='7' as CodePoint).contains(&c) => { - value = value * 8 + (c3 - '0' as CodePoint) as i64; - restore = iter; - if !iterator.next(&mut iter) { - return self.syntax_error(); - } - - let c4 = iter.c; - match c4 { - c if ('0' as CodePoint..='7' as CodePoint).contains(&c) => { - let temp = value * 8 + (c4 - '0' as CodePoint) as i64; - if temp < 256 { - value = temp; - } else { - iter = restore; - } - } - c if c == '8' as CodePoint || c == '9' as CodePoint => { - is_bad = true; - } - _ => { - iter = restore; - } - } - } - c if c == '8' as CodePoint || c == '9' as CodePoint => { - is_bad = 
true; - } - _ => { - iter = restore; - } - } - - iter.c = i32::try_from(value).expect("int cast"); - if is_bad { - self.add_range_error( - bun_ast::Range { - loc: bun_ast::Loc { - start: i32::try_from(octal_start).expect("int cast"), - }, - len: i32::try_from(iter.i as usize - octal_start) - .expect("int cast"), - }, - format_args!("Invalid legacy octal literal"), - ) - .expect("unreachable"); - } - } - c if c == '8' as CodePoint || c == '9' as CodePoint => { - iter.c = c2; - } - // 2-digit hexadecimal - c if c == 'x' as CodePoint => { - if ALLOW_MULTILINE { - self.end = - (start + iter.i as usize).saturating_sub(width2 as usize); - self.syntax_error()?; - } - - let mut value: CodePoint = 0; - let mut c3: CodePoint; - let mut width3: u8; - - if !iterator.next(&mut iter) { - return self.syntax_error(); - } - c3 = iter.c; - width3 = iter.width; - match hex_digit_value_u32(c3 as u32) { - Some(d) => value = (value * 16) | d as CodePoint, - None => { - self.end = - (start + iter.i as usize).saturating_sub(width3 as usize); - return self.syntax_error(); - } - } - - if !iterator.next(&mut iter) { - return self.syntax_error(); - } - c3 = iter.c; - width3 = iter.width; - match hex_digit_value_u32(c3 as u32) { - Some(d) => value = (value * 16) | d as CodePoint, - None => { - self.end = - (start + iter.i as usize).saturating_sub(width3 as usize); - return self.syntax_error(); - } - } - - iter.c = value; - } - c if c == 'u' as CodePoint => { - // We're going to make this an i64 so we don't risk integer overflows - // when people do weird things - let mut value: i64 = 0; - - if !iterator.next(&mut iter) { - return self.syntax_error(); - } - let mut c3 = iter.c; - let mut width3 = iter.width; - - // variable-length - if c3 == '{' as CodePoint { - let hex_start = (iter.i as usize) - .saturating_sub(width as usize) - .saturating_sub(width2 as usize) - .saturating_sub(width3 as usize); - let mut is_first = true; - let mut is_out_of_range = false; - 'variable_length: loop { - if 
!iterator.next(&mut iter) { - break 'variable_length; - } - c3 = iter.c; - - if c3 == '}' as CodePoint { - if is_first { - self.end = (start + iter.i as usize) - .saturating_sub(width3 as usize); - return self.syntax_error(); - } - break 'variable_length; - } - match hex_digit_value_u32(c3 as u32) { - Some(d) => value = (value * 16) | d as i64, - None => { - self.end = (start + iter.i as usize) - .saturating_sub(width3 as usize); - return self.syntax_error(); - } - } - - // '\U0010FFFF - // copied from golang utf8.MaxRune - if value > 1114111 { - is_out_of_range = true; - } - is_first = false; - } - - if is_out_of_range { - self.add_range_error( - bun_ast::Range { - loc: bun_ast::Loc { - start: i32::try_from(start + hex_start) - .expect("int cast"), - }, - len: i32::try_from( - (iter.i as usize).saturating_sub(hex_start), - ) - .unwrap(), - }, - format_args!("Unicode escape sequence is out of range"), - )?; - return Ok(()); - } - - // fixed-length - } else { - // Fixed-length - let mut j: usize = 0; - while j < 4 { - match hex_digit_value_u32(c3 as u32) { - Some(d) => value = (value * 16) | d as i64, - None => { - self.end = (start + iter.i as usize) - .saturating_sub(width3 as usize); - return self.syntax_error(); - } - } - - if j < 3 { - if !iterator.next(&mut iter) { - return self.syntax_error(); - } - c3 = iter.c; - - width3 = iter.width; - } - j += 1; - } - } - - iter.c = value as CodePoint; // @truncate - } - c if c == '\r' as CodePoint => { - if !ALLOW_MULTILINE { - self.end = - (start + iter.i as usize).saturating_sub(width2 as usize); - self.add_default_error(b"Unexpected end of line")?; - } - - // Ignore line continuations. A line continuation is not an escaped newline. - // Match the JS lexer (js_parser/lexer.rs:660-661, 937-939): guard on - // the index we actually read (`iter.i + 1`), not `iter.i`. 
Without - // this, a multiline basic string ending in `\` right before `"""` - // reads `text[len]` and panics even in release (slice bounds checks - // always run). - let next_i: usize = iter.i as usize + 1; - if next_i < text.len() && text[next_i] == b'\n' { - // Make sure Windows CRLF counts as a single newline - iter.i += 1; - } - continue; - } - c if c == '\n' as CodePoint || c == 0x2028 || c == 0x2029 => { - // Ignore line continuations. A line continuation is not an escaped newline. - if !ALLOW_MULTILINE { - self.end = - (start + iter.i as usize).saturating_sub(width2 as usize); - self.add_default_error(b"Unexpected end of line")?; - } - continue; - } - _ => { - iter.c = c2; - } - } - } - _ => {} - } - - match iter.c { - -1 => return self.add_default_error(b"Unexpected end of file"), - 0..=127 => { - buf.push(u8::try_from(iter.c).expect("int cast")); - } - _ => { - let mut part: [u8; 4] = [0; 4]; - let len = strings::encode_wtf8_rune(&mut part, iter.c as u32); - buf.extend_from_slice(&part[0..len]); - } - } - } - Ok(()) + // Multiline basic strings permit line continuations but reject `\x`; + // single-line basic strings are the inverse. + bun_ast::lexer_log::decode_escape_sequences::<_, ALLOW_MULTILINE, ALLOW_MULTILINE>( + self, start, text, buf, + ) } pub fn expected(&mut self, token: T) -> Result<(), Error> { From 1c37c64448fe3ac80b59796654e5429e6e9ae2d3 Mon Sep 17 00:00:00 2001 From: robobun Date: Tue, 9 Jun 2026 20:40:13 +0000 Subject: [PATCH 2/3] Pin TOML number underscore and string escape quirks in tests The shared number_scan and decode_escape_sequences helpers encode these behaviors in const parameters; these tests lock in the observable rules: underscore separator placement, exponent digits, the \x single-line vs multiline asymmetry, line continuations, and unicode escape range checks. 
--- test/js/bun/resolve/toml/toml-parse.test.ts | 54 +++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/test/js/bun/resolve/toml/toml-parse.test.ts b/test/js/bun/resolve/toml/toml-parse.test.ts index 45f2c48550b..ce9f8d3f7dc 100644 --- a/test/js/bun/resolve/toml/toml-parse.test.ts +++ b/test/js/bun/resolve/toml/toml-parse.test.ts @@ -87,3 +87,57 @@ test("Bun.TOML.parse rejects array values without comma separators (#31252)", () // Trailing comma is legal TOML. expect(Bun.TOML.parse("a = [1, 2,]")).toEqual({ a: [1, 2] }); }); + +// Digit scanning lives in parsers/number_scan.rs (shared with the json lexer). +// These pin the underscore-separator rules: legal between digits, illegal +// doubled, adjacent to the decimal point, or at the start of the exponent. +test("Bun.TOML.parse accepts underscore digit separators in numbers", () => { + expect(Bun.TOML.parse("a = 1_000")).toEqual({ a: 1000 }); + expect(Bun.TOML.parse("a = 5_349_221")).toEqual({ a: 5349221 }); + expect(Bun.TOML.parse("a = 1_000.000_1")).toEqual({ a: 1000.0001 }); + expect(Bun.TOML.parse("a = 1e1_0")).toEqual({ a: 1e10 }); + expect(Bun.TOML.parse("a = 9_224_617.445_991_228")).toEqual({ a: 9224617.445991228 }); +}); + +test("Bun.TOML.parse rejects misplaced underscores in numbers", () => { + expect(() => Bun.TOML.parse("a = 1__0")).toThrow(); + expect(() => Bun.TOML.parse("a = 1_.5")).toThrow(); + expect(() => Bun.TOML.parse("a = 1._5")).toThrow(); + expect(() => Bun.TOML.parse("a = 1.5_e3")).toThrow(); + expect(() => Bun.TOML.parse("a = 1.5e_3")).toThrow(); +}); + +test("Bun.TOML.parse rejects an exponent with no digits", () => { + expect(() => Bun.TOML.parse("a = 1e")).toThrow(); + expect(() => Bun.TOML.parse("a = 1e+")).toThrow(); + // Signed exponents with digits are fine. 
+ expect(Bun.TOML.parse("a = 6.626e-34")).toEqual({ a: 6.626e-34 }); + expect(Bun.TOML.parse("a = 1e+6")).toEqual({ a: 1e6 }); +}); + +// decode_escape_sequences is instantiated with REJECT_HEX_ESCAPE and +// ALLOW_LINE_CONTINUATIONS both keyed to multiline-ness: multiline basic +// strings permit `\` + newline but reject `\x`, single-line basic strings do +// the opposite (`\x` is a historical extension; TOML proper has neither). +test("Bun.TOML.parse allows \\x escapes in single-line basic strings only", () => { + expect(Bun.TOML.parse('a = "\\x41"')).toEqual({ a: "A" }); + expect(() => Bun.TOML.parse('a = """\\x41"""')).toThrow(); +}); + +test("Bun.TOML.parse allows line continuations in multiline basic strings only", () => { + // Note: only the `\` + newline pair is dropped. TOML proper also trims the + // next line's leading whitespace; Bun's decoder keeps it (JS semantics). + expect(Bun.TOML.parse('a = """line \\\n joined"""')).toEqual({ a: "line joined" }); + // CRLF after the backslash is a single continuation too. + expect(Bun.TOML.parse('a = """line \\\r\n joined"""')).toEqual({ a: "line joined" }); + expect(() => Bun.TOML.parse('a = "line \\\n joined"')).toThrow(); +}); + +test("Bun.TOML.parse decodes unicode escapes and rejects out-of-range ones", () => { + expect(Bun.TOML.parse('a = "\\u0041\\u00e9\\u2764"')).toEqual({ a: "A\u00e9\u2764" }); + expect(Bun.TOML.parse('a = "\\u{1F600}"')).toEqual({ a: "\u{1F600}" }); + // Above U+10FFFF. + expect(() => Bun.TOML.parse('a = "\\u{110000}"')).toThrow(); + // Non-hex digits in a fixed-length escape. + expect(() => Bun.TOML.parse('a = "\\uZZZZ"')).toThrow(); +}); From 000af9785d30b3316b95760d52511e17f6a39694 Mon Sep 17 00:00:00 2001 From: robobun Date: Wed, 10 Jun 2026 07:10:54 +0000 Subject: [PATCH 3/3] ci: retrigger