Normalize non-ASCII identifiers to NFKC in the lexer so equivalent Unicode spellings resolve to the same name

AlexWaygood · AlexWaygood · commit 0d02da182c92 · 2024-03-14T20:30:38.000Z
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -108,6 +108,7 @@ unic-ucd-category = { version = "0.9" }
 unicode-ident = { version = "1.0.12" }
 unicode-width = { version = "0.1.11" }
 unicode_names2 = { version = "1.2.2" }
+unicode-normalization = { version = "0.1.23" }
 ureq = { version = "2.9.6" }
 url = { version = "2.5.0" }
 uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] }
diff --git a/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py b/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py
@@ -0,0 +1,9 @@
+"""Test that unicode identifiers are NFKC-normalised"""
+
+𝒞 = 500
+print(𝒞)
+print(C + 𝒞)  # 2 references to the same variable due to NFKC normalization
+print(C / 𝒞)
+print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+
+print(𝒟)  # F821
diff --git a/crates/ruff_linter/src/rules/pyflakes/mod.rs b/crates/ruff_linter/src/rules/pyflakes/mod.rs
@@ -156,6 +156,7 @@ mod tests {
     #[test_case(Rule::UndefinedName, Path::new("F821_26.py"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_26.pyi"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_27.py"))]
+    #[test_case(Rule::UndefinedName, Path::new("F821_28.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.pyi"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_1.py"))]
diff --git a/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap b/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_linter/src/rules/pyflakes/mod.rs
+---
+F821_28.py:9:7: F821 Undefined name `𝒟`
+  |
+7 | print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+8 | 
+9 | print(𝒟)  # F821
+  |       ^ F821
+  |
diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml
@@ -28,6 +28,7 @@ rustc-hash = { workspace = true }
 static_assertions = { workspace = true }
 unicode-ident = { workspace = true }
 unicode_names2 = { workspace = true }
+unicode-normalization = { workspace = true }
 
 [dev-dependencies]
 insta = { workspace = true }
diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
@@ -29,9 +29,10 @@
 //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
 
 use std::iter::FusedIterator;
-use std::{char, cmp::Ordering, str::FromStr};
+use std::{borrow::Cow, char, cmp::Ordering, str::FromStr};
 
 use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_normalization::UnicodeNormalization;
 
 use ruff_python_ast::{Int, IpyEscapeKind};
 use ruff_text_size::{TextLen, TextRange, TextSize};
@@ -197,11 +198,37 @@ impl<'source> Lexer<'source> {
             _ => {}
         }
 
-        self.cursor.eat_while(is_identifier_continuation);
+        let mut is_ascii = first.is_ascii();
 
-        let text = self.token_text();
+        loop {
+            let c = self.cursor.first();
+            // Arrange things such that ASCII codepoints never
+            // result in the slower `is_xid_continue` getting called.
+            if c.is_ascii() {
+                if !matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') {
+                    break;
+                }
+            } else {
+                if !is_xid_continue(c) {
+                    break;
+                }
+                is_ascii = false;
+            }
+            if self.cursor.is_eof() {
+                break;
+            }
+            self.cursor.bump();
+        }
 
-        let keyword = match text {
+        let text = {
+            if is_ascii {
+                Cow::Borrowed(self.token_text())
+            } else {
+                Cow::Owned(self.token_text().nfkc().collect())
+            }
+        };
+
+        let keyword = match &*text {
             "False" => Tok::False,
             "None" => Tok::None,
             "True" => Tok::True,
@@ -1583,18 +1610,6 @@ fn is_unicode_identifier_start(c: char) -> bool {
     is_xid_start(c)
 }
 
-// Checks if the character c is a valid continuation character as described
-// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-fn is_identifier_continuation(c: char) -> bool {
-    // Arrange things such that ASCII codepoints never
-    // result in the slower `is_xid_continue` getting called.
-    if c.is_ascii() {
-        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
-    } else {
-        is_xid_continue(c)
-    }
-}
-
 /// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
 /// characters.
 ///
diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs
@@ -119,16 +119,6 @@ impl<'a> Cursor<'a> {
         }
     }
 
-    /// Eats symbols while predicate returns true or until the end of file is reached.
-    #[inline]
-    pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
-        // It was tried making optimized version of this for eg. line comments, but
-        // LLVM can inline all of this and compile it down to fast iteration over bytes.
-        while predicate(self.first()) && !self.is_eof() {
-            self.bump();
-        }
-    }
-
     /// Skips the next `count` bytes.
     ///
     /// ## Panics