@@ -682,9 +682,7 @@ impl<'src> Lexer<'src> {
682682 // We need to therefore do the same in our lexer, but applying NFKC normalization
683683 // unconditionally is extremely expensive. If we know an identifier is ASCII-only,
684684 // (by far the most common case), we can skip NFKC normalization of the identifier.
685- let mut is_ascii = first. is_ascii ( ) ;
686- self . cursor
687- . eat_while ( |c| is_identifier_continuation ( c, & mut is_ascii) ) ;
685+ let is_ascii = self . eat_identifier_continuation ( first. is_ascii ( ) ) ;
688686
689687 let text = self . token_text ( ) ;
690688
@@ -748,6 +746,29 @@ impl<'src> Lexer<'src> {
748746 }
749747 }
750748
749+ fn eat_identifier_continuation ( & mut self , mut is_ascii : bool ) -> bool {
750+ loop {
751+ let bytes = self . cursor . rest ( ) . as_bytes ( ) ;
752+ let ascii_len = bytes
753+ . iter ( )
754+ . take_while ( |& & byte| is_ascii_identifier_continuation_byte ( byte) )
755+ . count ( ) ;
756+
757+ if ascii_len > 0 {
758+ self . cursor . skip_bytes ( ascii_len) ;
759+ }
760+
761+ match self . cursor . first ( ) {
762+ c if c. is_ascii ( ) => return is_ascii,
763+ c if is_xid_continue ( c) => {
764+ is_ascii = false ;
765+ self . cursor . bump ( ) ;
766+ }
767+ _ => return is_ascii,
768+ }
769+ }
770+ }
771+
751772 /// Try lexing the single character string prefix, updating the token flags accordingly.
752773 /// Returns `true` if it matches.
753774 fn try_single_char_prefix ( & mut self , first : char ) -> bool {
@@ -1824,29 +1845,16 @@ const fn is_ascii_identifier_start(c: char) -> bool {
18241845 matches ! ( c, 'a' ..='z' | 'A' ..='Z' | '_' )
18251846}
18261847
/// Returns `true` if `byte` may appear in an ASCII-only identifier after the
/// first character: an ASCII letter, an ASCII digit, or an underscore.
const fn is_ascii_identifier_continuation_byte(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || byte == b'_'
}
1851+
/// Checks if the character `c` is a valid starting character as described
/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
///
/// Delegates to the Unicode `XID_Start` property check.
fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
}
18321857
1833- /// Checks if the character c is a valid continuation character as described
1834- /// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
1835- ///
1836- /// Additionally, this function also keeps track of whether or not the total
1837- /// identifier is ASCII-only or not by mutably altering a reference to a
1838- /// boolean value passed in.
1839- fn is_identifier_continuation ( c : char , identifier_is_ascii_only : & mut bool ) -> bool {
1840- // Arrange things such that ASCII codepoints never
1841- // result in the slower `is_xid_continue` getting called.
1842- if c. is_ascii ( ) {
1843- matches ! ( c, 'a' ..='z' | 'A' ..='Z' | '_' | '0' ..='9' )
1844- } else {
1845- * identifier_is_ascii_only = false ;
1846- is_xid_continue ( c)
1847- }
1848- }
1849-
18501858enum LexedText < ' a > {
18511859 Source { source : & ' a str , range : TextRange } ,
18521860 Owned ( String ) ,
@@ -2420,6 +2428,12 @@ if first:
24202428 assert_eq ! ( get_tokens_only( source1) , get_tokens_only( source2) ) ;
24212429 }
24222430
    // Snapshot test for an identifier that starts with an ASCII character but
    // continues with a non-ASCII codepoint (U+1D49E MATHEMATICAL SCRIPT
    // CAPITAL C), exercising the non-ASCII continuation path of the
    // identifier lexer.
    #[test]
    fn test_unicode_identifier_continuation() {
        let source = "a𝒞 = 500";
        assert_snapshot!(lex_source(source));
    }
2436+
24232437 fn triple_quoted_eol ( eol : & str ) -> LexerOutput {
24242438 let source = format ! ( "\" \" \" {eol} test string{eol} \" \" \" " ) ;
24252439 lex_source ( & source)
0 commit comments