|
29 | 29 | //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
|
30 | 30 |
|
31 | 31 | use std::iter::FusedIterator;
|
32 |
| -use std::{char, cmp::Ordering, str::FromStr}; |
| 32 | +use std::{borrow::Cow, char, cmp::Ordering, str::FromStr}; |
33 | 33 |
|
34 | 34 | use unicode_ident::{is_xid_continue, is_xid_start};
|
| 35 | +use unicode_normalization::UnicodeNormalization; |
35 | 36 |
|
36 | 37 | use ruff_python_ast::{Int, IpyEscapeKind};
|
37 | 38 | use ruff_text_size::{TextLen, TextRange, TextSize};
|
@@ -197,11 +198,37 @@ impl<'source> Lexer<'source> {
|
197 | 198 | _ => {}
|
198 | 199 | }
|
199 | 200 |
|
200 |
| - self.cursor.eat_while(is_identifier_continuation); |
| 201 | + let mut is_ascii = first.is_ascii(); |
201 | 202 |
|
202 |
| - let text = self.token_text(); |
| 203 | + loop { |
| 204 | + let c = self.cursor.first(); |
| 205 | + // Arrange things such that ASCII codepoints never |
| 206 | + // result in the slower `is_xid_continue` getting called. |
| 207 | + if c.is_ascii() { |
| 208 | + if !matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') { |
| 209 | + break; |
| 210 | + } |
| 211 | + } else { |
| 212 | + if !is_xid_continue(c) { |
| 213 | + break; |
| 214 | + } |
| 215 | + is_ascii = false; |
| 216 | + } |
| 217 | + if self.cursor.is_eof() { |
| 218 | + break; |
| 219 | + } |
| 220 | + self.cursor.bump(); |
| 221 | + } |
203 | 222 |
|
204 |
| - let keyword = match text { |
| 223 | + let text = { |
| 224 | + if is_ascii { |
| 225 | + Cow::Borrowed(self.token_text()) |
| 226 | + } else { |
| 227 | + Cow::Owned(self.token_text().nfkc().collect()) |
| 228 | + } |
| 229 | + }; |
| 230 | + |
| 231 | + let keyword = match &*text { |
205 | 232 | "False" => Tok::False,
|
206 | 233 | "None" => Tok::None,
|
207 | 234 | "True" => Tok::True,
|
@@ -1583,18 +1610,6 @@ fn is_unicode_identifier_start(c: char) -> bool {
|
1583 | 1610 | is_xid_start(c)
|
1584 | 1611 | }
|
1585 | 1612 |
|
1586 |
| -// Checks if the character c is a valid continuation character as described |
1587 |
| -// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers |
1588 |
| -fn is_identifier_continuation(c: char) -> bool { |
1589 |
| - // Arrange things such that ASCII codepoints never |
1590 |
| - // result in the slower `is_xid_continue` getting called. |
1591 |
| - if c.is_ascii() { |
1592 |
| - matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') |
1593 |
| - } else { |
1594 |
| - is_xid_continue(c) |
1595 |
| - } |
1596 |
| -} |
1597 |
| - |
1598 | 1613 | /// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
|
1599 | 1614 | /// characters.
|
1600 | 1615 | ///
|
|
0 commit comments