Skip to content

Commit b4cc37d

Browse files
committed
ASCII identifier fast path
1 parent d9957f9 commit b4cc37d

1 file changed

Lines changed: 34 additions & 20 deletions

File tree

crates/ruff_python_parser/src/lexer.rs

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -682,9 +682,7 @@ impl<'src> Lexer<'src> {
682682
// We need to therefore do the same in our lexer, but applying NFKC normalization
683683
// unconditionally is extremely expensive. If we know an identifier is ASCII-only,
684684
// (by far the most common case), we can skip NFKC normalization of the identifier.
685-
let mut is_ascii = first.is_ascii();
686-
self.cursor
687-
.eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
685+
let is_ascii = self.eat_identifier_continuation(first.is_ascii());
688686

689687
let text = self.token_text();
690688

@@ -748,6 +746,29 @@ impl<'src> Lexer<'src> {
748746
}
749747
}
750748

749+
fn eat_identifier_continuation(&mut self, mut is_ascii: bool) -> bool {
750+
loop {
751+
let bytes = self.cursor.rest().as_bytes();
752+
let ascii_len = bytes
753+
.iter()
754+
.take_while(|&&byte| is_ascii_identifier_continuation_byte(byte))
755+
.count();
756+
757+
if ascii_len > 0 {
758+
self.cursor.skip_bytes(ascii_len);
759+
}
760+
761+
match self.cursor.first() {
762+
c if c.is_ascii() => return is_ascii,
763+
c if is_xid_continue(c) => {
764+
is_ascii = false;
765+
self.cursor.bump();
766+
}
767+
_ => return is_ascii,
768+
}
769+
}
770+
}
771+
751772
/// Try lexing the single character string prefix, updating the token flags accordingly.
752773
/// Returns `true` if it matches.
753774
fn try_single_char_prefix(&mut self, first: char) -> bool {
@@ -1824,29 +1845,16 @@ const fn is_ascii_identifier_start(c: char) -> bool {
18241845
matches!(c, 'a'..='z' | 'A'..='Z' | '_')
18251846
}
18261847

1848+
/// Returns `true` if `byte` may continue an ASCII-only identifier:
/// an ASCII letter, an ASCII digit, or an underscore.
const fn is_ascii_identifier_continuation_byte(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || byte == b'_'
}
1851+
18271852
/// Checks if the character `c` is a valid starting character for an
/// identifier as described in
/// <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
///
/// Delegates to the Unicode `XID_Start` property check.
fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
}
18321857

1833-
/// Checks if the character c is a valid continuation character as described
1834-
/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
1835-
///
1836-
/// Additionally, this function also keeps track of whether or not the total
1837-
/// identifier is ASCII-only or not by mutably altering a reference to a
1838-
/// boolean value passed in.
1839-
fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
1840-
// Arrange things such that ASCII codepoints never
1841-
// result in the slower `is_xid_continue` getting called.
1842-
if c.is_ascii() {
1843-
matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
1844-
} else {
1845-
*identifier_is_ascii_only = false;
1846-
is_xid_continue(c)
1847-
}
1848-
}
1849-
18501858
enum LexedText<'a> {
18511859
Source { source: &'a str, range: TextRange },
18521860
Owned(String),
@@ -2420,6 +2428,12 @@ if first:
24202428
assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
24212429
}
24222430

2431+
#[test]
fn test_unicode_identifier_continuation() {
    // "𝒞" (U+1D49E, MATHEMATICAL SCRIPT CAPITAL C) is a non-ASCII
    // character in continuation position of an identifier that starts
    // with ASCII "a", exercising the lexer's non-ASCII
    // identifier-continuation path.
    let source = "a𝒞 = 500";
    assert_snapshot!(lex_source(source));
}
2436+
24232437
fn triple_quoted_eol(eol: &str) -> LexerOutput {
24242438
let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
24252439
lex_source(&source)

0 commit comments

Comments (0)