Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 42 additions & 20 deletions crates/ruff_python_parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -682,9 +682,7 @@ impl<'src> Lexer<'src> {
// We need to therefore do the same in our lexer, but applying NFKC normalization
// unconditionally is extremely expensive. If we know an identifier is ASCII-only,
// (by far the most common case), we can skip NFKC normalization of the identifier.
let mut is_ascii = first.is_ascii();
self.cursor
.eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
let is_ascii = self.eat_identifier_continuation(first.is_ascii());

let text = self.token_text();

Expand Down Expand Up @@ -748,6 +746,37 @@ impl<'src> Lexer<'src> {
}
}

/// Consumes the remainder of the current identifier, returning whether the
/// whole identifier is ASCII-only.
///
/// `is_ascii` is the ASCII-ness of what has been consumed so far (the first
/// character). The fast ASCII byte scan runs first; only if a non-ASCII
/// codepoint follows do we drop into the cold Unicode path.
fn eat_identifier_continuation(&mut self, is_ascii: bool) -> bool {
    self.eat_ascii_identifier_continuation();

    // Guard clause: a non-ASCII continuation character means the identifier
    // can no longer be ASCII-only, regardless of what came before.
    if !self.cursor.first().is_ascii() {
        self.eat_unicode_identifier_continuation();
        return false;
    }

    is_ascii
}

/// Advances the cursor past a contiguous run of ASCII identifier-continuation
/// bytes (`[A-Za-z0-9_]`), scanning bytes directly to avoid per-`char` decoding.
fn eat_ascii_identifier_continuation(&mut self) {
    let remaining = self.cursor.rest().as_bytes();

    // Index of the first byte that is NOT a valid ASCII continuation byte;
    // if every remaining byte qualifies, the run spans the whole rest.
    let run_len = remaining
        .iter()
        .position(|&byte| !is_ascii_identifier_continuation_byte(byte))
        .unwrap_or(remaining.len());

    if run_len > 0 {
        self.cursor.skip_bytes(run_len);
    }
}

// Marked #[cold]: non-ASCII identifiers are rare, so keep this path out of
// the hot lexing loop's branch layout.
#[cold]
fn eat_unicode_identifier_continuation(&mut self) {
    loop {
        if !is_xid_continue(self.cursor.first()) {
            break;
        }
        self.cursor.bump();
        // After each Unicode codepoint, greedily re-enter the fast ASCII
        // byte scan — identifiers are usually mostly ASCII even when they
        // contain a few Unicode characters.
        self.eat_ascii_identifier_continuation();
    }
}

/// Try lexing the single character string prefix, updating the token flags accordingly.
/// Returns `true` if it matches.
fn try_single_char_prefix(&mut self, first: char) -> bool {
Expand Down Expand Up @@ -1824,29 +1853,16 @@ const fn is_ascii_identifier_start(c: char) -> bool {
matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}

/// Returns `true` if `byte` may continue an identifier in the ASCII fast
/// path: an ASCII letter, digit, or underscore.
const fn is_ascii_identifier_continuation_byte(byte: u8) -> bool {
    byte == b'_' || byte.is_ascii_alphanumeric()
}

/// Checks if the character `c` is a valid identifier start character as
/// described in
/// <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>
/// (i.e. it has the Unicode `XID_Start` property).
fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
}

/// Checks if the character `c` is a valid continuation character as described
/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
///
/// Additionally, tracks whether the identifier is ASCII-only: the first time
/// a non-ASCII codepoint is seen, `identifier_is_ascii_only` is cleared. The
/// flag is never set back to `true`, so it reflects the whole identifier once
/// scanning finishes.
fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
    // Arrange things such that ASCII codepoints never
    // result in the slower `is_xid_continue` getting called.
    if c.is_ascii() {
        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
    } else {
        *identifier_is_ascii_only = false;
        is_xid_continue(c)
    }
}

enum LexedText<'a> {
Source { source: &'a str, range: TextRange },
Owned(String),
Expand Down Expand Up @@ -2420,6 +2436,12 @@ if first:
assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
}

// Regression test for identifiers that mix ASCII and non-ASCII codepoints:
// the lexed tokens are pinned via a snapshot.
#[test]
fn test_unicode_identifier_continuation() {
    assert_snapshot!(lex_source("a𝒞 = 500"));
}

fn triple_quoted_eol(eol: &str) -> LexerOutput {
let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
lex_source(&source)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
---
source: crates/ruff_python_parser/src/lexer.rs
assertion_line: 2434
expression: lex_source(source)
---
## Tokens
```
[
(
Name(
Name("aC"),
),
0..5,
),
(
Equal,
6..7,
),
(
Int(
500,
),
8..11,
),
(
Newline,
11..11,
),
]
```
Loading