@@ -682,9 +682,7 @@ impl<'src> Lexer<'src> {
682682 // We need to therefore do the same in our lexer, but applying NFKC normalization
683683 // unconditionally is extremely expensive. If we know an identifier is ASCII-only,
684684 // (by far the most common case), we can skip NFKC normalization of the identifier.
685- let mut is_ascii = first. is_ascii ( ) ;
686- self . cursor
687- . eat_while ( |c| is_identifier_continuation ( c, & mut is_ascii) ) ;
685+ let is_ascii = self . eat_identifier_continuation ( first. is_ascii ( ) ) ;
688686
689687 let text = self . token_text ( ) ;
690688
@@ -748,6 +746,29 @@ impl<'src> Lexer<'src> {
748746 }
749747 }
750748
749+ fn eat_identifier_continuation ( & mut self , mut is_ascii : bool ) -> bool {
750+ loop {
751+ let bytes = self . cursor . rest ( ) . as_bytes ( ) ;
752+ let ascii_len = bytes
753+ . iter ( )
754+ . take_while ( |& & byte| is_ascii_identifier_continuation_byte ( byte) )
755+ . count ( ) ;
756+
757+ if ascii_len > 0 {
758+ self . cursor . skip_bytes ( ascii_len) ;
759+ }
760+
761+ match self . cursor . first ( ) {
762+ c if c. is_ascii ( ) => return is_ascii,
763+ c if is_xid_continue ( c) => {
764+ is_ascii = false ;
765+ self . cursor . bump ( ) ;
766+ }
767+ _ => return is_ascii,
768+ }
769+ }
770+ }
771+
751772 /// Try lexing the single character string prefix, updating the token flags accordingly.
752773 /// Returns `true` if it matches.
753774 fn try_single_char_prefix ( & mut self , first : char ) -> bool {
@@ -1824,29 +1845,16 @@ const fn is_ascii_identifier_start(c: char) -> bool {
18241845 matches ! ( c, 'a' ..='z' | 'A' ..='Z' | '_' )
18251846}
18261847
/// Returns `true` if `byte` may appear in an ASCII-only identifier after the
/// first character: an ASCII letter, an ASCII digit, or an underscore.
const fn is_ascii_identifier_continuation_byte(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || byte == b'_'
}
1851+
/// Checks if the character `c` is a valid starting character as described
/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
///
/// Delegates to the Unicode `XID_Start` property check.
fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
}
18321857
1833- /// Checks if the character c is a valid continuation character as described
1834- /// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
1835- ///
1836- /// Additionally, this function also keeps track of whether or not the total
1837- /// identifier is ASCII-only or not by mutably altering a reference to a
1838- /// boolean value passed in.
1839- fn is_identifier_continuation ( c : char , identifier_is_ascii_only : & mut bool ) -> bool {
1840- // Arrange things such that ASCII codepoints never
1841- // result in the slower `is_xid_continue` getting called.
1842- if c. is_ascii ( ) {
1843- matches ! ( c, 'a' ..='z' | 'A' ..='Z' | '_' | '0' ..='9' )
1844- } else {
1845- * identifier_is_ascii_only = false ;
1846- is_xid_continue ( c)
1847- }
1848- }
1849-
18501858enum LexedText < ' a > {
18511859 Source { source : & ' a str , range : TextRange } ,
18521860 Owned ( String ) ,
@@ -2420,6 +2428,12 @@ if first:
24202428 assert_eq ! ( get_tokens_only( source1) , get_tokens_only( source2) ) ;
24212429 }
24222430
    // Snapshot test for an identifier that starts with an ASCII character but
    // continues with a non-ASCII codepoint (U+1D49E MATHEMATICAL SCRIPT
    // CAPITAL C), exercising the non-ASCII continuation path of the
    // identifier lexer.
    #[test]
    fn test_unicode_identifier_continuation() {
        let source = "a𝒞 = 500";
        assert_snapshot!(lex_source(source));
    }
2436+
24232437 fn triple_quoted_eol ( eol : & str ) -> LexerOutput {
24242438 let source = format ! ( "\" \" \" {eol} test string{eol} \" \" \" " ) ;
24252439 lex_source ( & source)
0 commit comments