Skip to content

Commit 0d02da1

Browse files
committed
Fix the bug: NFKC-normalize non-ASCII identifiers in the lexer
1 parent e944c16 commit 0d02da1

File tree

8 files changed

+54
-26
lines changed

8 files changed

+54
-26
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ unic-ucd-category = { version = "0.9" }
108108
unicode-ident = { version = "1.0.12" }
109109
unicode-width = { version = "0.1.11" }
110110
unicode_names2 = { version = "1.2.2" }
111+
unicode-normalization = { version = "0.1.23" }
111112
ureq = { version = "2.9.6" }
112113
url = { version = "2.5.0" }
113114
uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] }
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
"""Test that unicode identifiers are NFKC-normalised"""
2+
3+
𝒞 = 500
4+
print(𝒞)
5+
print(C + 𝒞) # 2 references to the same variable due to NFKC normalization
6+
print(C / 𝒞)
7+
print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
8+
9+
print(𝒟) # F821

crates/ruff_linter/src/rules/pyflakes/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ mod tests {
156156
#[test_case(Rule::UndefinedName, Path::new("F821_26.py"))]
157157
#[test_case(Rule::UndefinedName, Path::new("F821_26.pyi"))]
158158
#[test_case(Rule::UndefinedName, Path::new("F821_27.py"))]
159+
#[test_case(Rule::UndefinedName, Path::new("F821_28.py"))]
159160
#[test_case(Rule::UndefinedExport, Path::new("F822_0.py"))]
160161
#[test_case(Rule::UndefinedExport, Path::new("F822_0.pyi"))]
161162
#[test_case(Rule::UndefinedExport, Path::new("F822_1.py"))]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
source: crates/ruff_linter/src/rules/pyflakes/mod.rs
3+
---
4+
F821_28.py:9:7: F821 Undefined name `𝒟`
5+
|
6+
7 | print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
7+
8 |
8+
9 | print(𝒟) # F821
9+
| ^ F821
10+
|

crates/ruff_python_parser/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ rustc-hash = { workspace = true }
2828
static_assertions = { workspace = true }
2929
unicode-ident = { workspace = true }
3030
unicode_names2 = { workspace = true }
31+
unicode-normalization = { workspace = true }
3132

3233
[dev-dependencies]
3334
insta = { workspace = true }

crates/ruff_python_parser/src/lexer.rs

+31-16
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,10 @@
2929
//! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
3030
3131
use std::iter::FusedIterator;
32-
use std::{char, cmp::Ordering, str::FromStr};
32+
use std::{borrow::Cow, char, cmp::Ordering, str::FromStr};
3333

3434
use unicode_ident::{is_xid_continue, is_xid_start};
35+
use unicode_normalization::UnicodeNormalization;
3536

3637
use ruff_python_ast::{Int, IpyEscapeKind};
3738
use ruff_text_size::{TextLen, TextRange, TextSize};
@@ -197,11 +198,37 @@ impl<'source> Lexer<'source> {
197198
_ => {}
198199
}
199200

200-
self.cursor.eat_while(is_identifier_continuation);
201+
let mut is_ascii = first.is_ascii();
201202

202-
let text = self.token_text();
203+
loop {
204+
let c = self.cursor.first();
205+
// Arrange things such that ASCII codepoints never
206+
// result in the slower `is_xid_continue` getting called.
207+
if c.is_ascii() {
208+
if !matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') {
209+
break;
210+
}
211+
} else {
212+
if !is_xid_continue(c) {
213+
break;
214+
}
215+
is_ascii = false;
216+
}
217+
if self.cursor.is_eof() {
218+
break;
219+
}
220+
self.cursor.bump();
221+
}
203222

204-
let keyword = match text {
223+
let text = {
224+
if is_ascii {
225+
Cow::Borrowed(self.token_text())
226+
} else {
227+
Cow::Owned(self.token_text().nfkc().collect())
228+
}
229+
};
230+
231+
let keyword = match &*text {
205232
"False" => Tok::False,
206233
"None" => Tok::None,
207234
"True" => Tok::True,
@@ -1583,18 +1610,6 @@ fn is_unicode_identifier_start(c: char) -> bool {
15831610
is_xid_start(c)
15841611
}
15851612

1586-
// Checks if the character c is a valid continuation character as described
1587-
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
1588-
fn is_identifier_continuation(c: char) -> bool {
1589-
// Arrange things such that ASCII codepoints never
1590-
// result in the slower `is_xid_continue` getting called.
1591-
if c.is_ascii() {
1592-
matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
1593-
} else {
1594-
is_xid_continue(c)
1595-
}
1596-
}
1597-
15981613
/// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
15991614
/// characters.
16001615
///

crates/ruff_python_parser/src/lexer/cursor.rs

-10
Original file line numberDiff line numberDiff line change
@@ -119,16 +119,6 @@ impl<'a> Cursor<'a> {
119119
}
120120
}
121121

122-
/// Eats symbols while predicate returns true or until the end of file is reached.
123-
#[inline]
124-
pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
125-
// It was tried making optimized version of this for eg. line comments, but
126-
// LLVM can inline all of this and compile it down to fast iteration over bytes.
127-
while predicate(self.first()) && !self.is_eof() {
128-
self.bump();
129-
}
130-
}
131-
132122
/// Skips the next `count` bytes.
133123
///
134124
/// ## Panics

0 commit comments

Comments
 (0)