@@ -32,6 +32,7 @@ use std::iter::FusedIterator;
32
32
use std:: { char, cmp:: Ordering , str:: FromStr } ;
33
33
34
34
use unicode_ident:: { is_xid_continue, is_xid_start} ;
35
+ use unicode_normalization:: UnicodeNormalization ;
35
36
36
37
use ruff_python_ast:: { Int , IpyEscapeKind } ;
37
38
use ruff_text_size:: { TextLen , TextRange , TextSize } ;
@@ -197,10 +198,25 @@ impl<'source> Lexer<'source> {
197
198
_ => { }
198
199
}
199
200
200
- self . cursor . eat_while ( is_identifier_continuation) ;
201
+ // Keep track of whether the identifier is ASCII-only or not.
202
+ //
203
+ // This is important because Python applies NFKC normalization to
204
+ // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers.
205
+ // We need to therefore do the same in our lexer, but applying NFKC normalization
206
+ // unconditionally is extremely expensive. If we know an identifier is ASCII-only,
207
+ // (by far the most common case), we can skip NFKC normalization of the identifier.
208
+ let mut is_ascii = first. is_ascii ( ) ;
209
+ self . cursor
210
+ . eat_while ( |c| is_identifier_continuation ( c, & mut is_ascii) ) ;
201
211
202
212
let text = self . token_text ( ) ;
203
213
214
+ if !is_ascii {
215
+ return Ok ( Tok :: Name {
216
+ name : text. nfkc ( ) . collect :: < String > ( ) . into_boxed_str ( ) ,
217
+ } ) ;
218
+ }
219
+
204
220
let keyword = match text {
205
221
"False" => Tok :: False ,
206
222
"None" => Tok :: None ,
@@ -1583,14 +1599,19 @@ fn is_unicode_identifier_start(c: char) -> bool {
1583
1599
is_xid_start ( c)
1584
1600
}
1585
1601
1586
- // Checks if the character c is a valid continuation character as described
1587
- // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
1588
- fn is_identifier_continuation ( c : char ) -> bool {
1602
+ /// Checks if the character c is a valid continuation character as described
1603
+ /// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
1604
+ ///
1605
+ /// Additionally, this function also keeps track of whether or not the total
1606
+ /// identifier is ASCII-only or not by mutably altering a reference to a
1607
+ /// boolean value passed in.
1608
+ fn is_identifier_continuation ( c : char , identifier_is_ascii_only : & mut bool ) -> bool {
1589
1609
// Arrange things such that ASCII codepoints never
1590
1610
// result in the slower `is_xid_continue` getting called.
1591
1611
if c. is_ascii ( ) {
1592
1612
matches ! ( c, 'a' ..='z' | 'A' ..='Z' | '_' | '0' ..='9' )
1593
1613
} else {
1614
+ * identifier_is_ascii_only = false ;
1594
1615
is_xid_continue ( c)
1595
1616
}
1596
1617
}
@@ -2042,6 +2063,17 @@ def f(arg=%timeit a = b):
2042
2063
assert_debug_snapshot ! ( lex_source( source) ) ;
2043
2064
}
2044
2065
2066
+ fn get_tokens_only ( source : & str ) -> Vec < Tok > {
2067
+ lex_source ( source) . into_iter ( ) . map ( |( tok, _) | tok) . collect ( )
2068
+ }
2069
+
2070
#[test]
fn test_nfkc_normalization() {
    // U+1D49E (MATHEMATICAL SCRIPT CAPITAL C) folds to ASCII 'C' under NFKC,
    // so both programs must produce identical token streams once the lexer
    // normalizes non-ASCII identifiers.
    let unicode_tokens = get_tokens_only("𝒞 = 500");
    let ascii_tokens = get_tokens_only("C = 500");
    assert_eq!(unicode_tokens, ascii_tokens);
}
2076
+
2045
2077
fn triple_quoted_eol ( eol : & str ) -> Vec < Spanned > {
2046
2078
let source = format ! ( "\" \" \" {eol} test string{eol} \" \" \" " ) ;
2047
2079
lex_source ( & source)
0 commit comments