Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 42 additions & 20 deletions crates/ruff_python_parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -682,9 +682,7 @@ impl<'src> Lexer<'src> {
// We need to therefore do the same in our lexer, but applying NFKC normalization
// unconditionally is extremely expensive. If we know an identifier is ASCII-only,
// (by far the most common case), we can skip NFKC normalization of the identifier.
let mut is_ascii = first.is_ascii();
self.cursor
.eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
let is_ascii = self.eat_identifier_continuation(first.is_ascii());

let text = self.token_text();

Expand Down Expand Up @@ -748,6 +746,37 @@ impl<'src> Lexer<'src> {
}
}

/// Consumes the remainder of the current identifier, returning whether the
/// whole identifier is ASCII-only.
///
/// `is_ascii` is the ASCII-ness of what has been consumed so far (the first
/// character). The fast ASCII byte scan runs first; only if a non-ASCII
/// codepoint follows do we drop into the cold Unicode path.
fn eat_identifier_continuation(&mut self, is_ascii: bool) -> bool {
    self.eat_ascii_identifier_continuation();

    // Guard clause: a non-ASCII continuation character means the identifier
    // can no longer be ASCII-only, regardless of what came before.
    if !self.cursor.first().is_ascii() {
        self.eat_unicode_identifier_continuation();
        return false;
    }

    is_ascii
}

/// Advances the cursor past a contiguous run of ASCII identifier-continuation
/// bytes (`[A-Za-z0-9_]`), scanning bytes directly to avoid per-`char` decoding.
fn eat_ascii_identifier_continuation(&mut self) {
    let remaining = self.cursor.rest().as_bytes();

    // Index of the first byte that is NOT a valid ASCII continuation byte;
    // if every remaining byte qualifies, the run spans the whole rest.
    let run_len = remaining
        .iter()
        .position(|&byte| !is_ascii_identifier_continuation_byte(byte))
        .unwrap_or(remaining.len());

    if run_len > 0 {
        self.cursor.skip_bytes(run_len);
    }
}

// Marked #[cold]: non-ASCII identifiers are rare, so keep this path out of
// the hot lexing loop's branch layout.
#[cold]
fn eat_unicode_identifier_continuation(&mut self) {
    loop {
        if !is_xid_continue(self.cursor.first()) {
            break;
        }
        self.cursor.bump();
        // After each Unicode codepoint, greedily re-enter the fast ASCII
        // byte scan — identifiers are usually mostly ASCII even when they
        // contain a few Unicode characters.
        self.eat_ascii_identifier_continuation();
    }
}

/// Try lexing the single character string prefix, updating the token flags accordingly.
/// Returns `true` if it matches.
fn try_single_char_prefix(&mut self, first: char) -> bool {
Expand Down Expand Up @@ -1824,29 +1853,16 @@ const fn is_ascii_identifier_start(c: char) -> bool {
matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}

/// Returns `true` if `byte` may continue an identifier in the ASCII fast
/// path: an ASCII letter, digit, or underscore.
const fn is_ascii_identifier_continuation_byte(byte: u8) -> bool {
    byte == b'_' || byte.is_ascii_alphanumeric()
}

/// Checks if the character `c` is a valid identifier start character as
/// described in
/// <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>
/// (i.e. it has the Unicode `XID_Start` property).
fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
}

/// Checks if the character `c` is a valid continuation character as described
/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
///
/// Additionally, tracks whether the identifier is ASCII-only: the first time
/// a non-ASCII codepoint is seen, `identifier_is_ascii_only` is cleared. The
/// flag is never set back to `true`, so it reflects the whole identifier once
/// scanning finishes.
fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
    // Arrange things such that ASCII codepoints never
    // result in the slower `is_xid_continue` getting called.
    if c.is_ascii() {
        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
    } else {
        *identifier_is_ascii_only = false;
        is_xid_continue(c)
    }
}

enum LexedText<'a> {
Source { source: &'a str, range: TextRange },
Owned(String),
Expand Down Expand Up @@ -2420,6 +2436,12 @@ if first:
assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
}

// Regression test for identifiers that mix ASCII and non-ASCII codepoints:
// the lexed tokens are pinned via a snapshot.
#[test]
fn test_unicode_identifier_continuation() {
    assert_snapshot!(lex_source("a𝒞 = 500"));
}

fn triple_quoted_eol(eol: &str) -> LexerOutput {
let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
lex_source(&source)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
---
source: crates/ruff_python_parser/src/lexer.rs
assertion_line: 2434
expression: lex_source(source)
---
## Tokens
```
[
(
Name(
Name("aC"),
),
0..5,
),
(
Equal,
6..7,
),
(
Int(
500,
),
8..11,
),
(
Newline,
11..11,
),
]
```
Loading