
Commit 92e6026

Apply NFKC normalization to unicode identifiers in the lexer (#10412)

1 parent bb54071 commit 92e6026
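
Python's lexer compares identifiers after NFKC normalization, so spellings such as 𝒞 (U+1D49E MATHEMATICAL SCRIPT CAPITAL C) and plain C name the same variable; this commit makes ruff's lexer do the same. As a minimal illustrative sketch (not code from the diff), the unicode-normalization crate added here exposes the nfkc() adapter the lexer relies on:

    use unicode_normalization::UnicodeNormalization;

    fn main() {
        // NFKC folds compatibility characters: "𝒞" normalizes to "C",
        // which is why Python treats the two spellings as one identifier.
        let normalized: String = "𝒞".nfkc().collect();
        assert_eq!(normalized, "C");
    }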

File tree

9 files changed (+68 -15 lines)

Cargo.lock

+1

(Generated file; diff not rendered by default.)

Cargo.toml

+1

@@ -108,6 +108,7 @@ unic-ucd-category = { version = "0.9" }
 unicode-ident = { version = "1.0.12" }
 unicode-width = { version = "0.1.11" }
 unicode_names2 = { version = "1.2.2" }
+unicode-normalization = { version = "0.1.23" }
 ureq = { version = "2.9.6" }
 url = { version = "2.5.0" }
 uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] }
New file: F821_28.py (pyflakes test fixture)

+9

@@ -0,0 +1,9 @@
+"""Test that unicode identifiers are NFKC-normalised"""
+
+𝒞 = 500
+print(𝒞)
+print(C + 𝒞) # 2 references to the same variable due to NFKC normalization
+print(C / 𝒞)
+print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+
+print(𝒟) # F821

crates/ruff_linter/src/rules/pyflakes/mod.rs

+1

@@ -156,6 +156,7 @@ mod tests {
     #[test_case(Rule::UndefinedName, Path::new("F821_26.py"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_26.pyi"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_27.py"))]
+    #[test_case(Rule::UndefinedName, Path::new("F821_28.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.pyi"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_1.py"))]
New file: pyflakes snapshot for F821_28.py

+10

@@ -0,0 +1,10 @@
+---
+source: crates/ruff_linter/src/rules/pyflakes/mod.rs
+---
+F821_28.py:9:7: F821 Undefined name `𝒟`
+  |
+7 | print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+8 |
+9 | print(𝒟) # F821
+  |       ^ F821
+  |

crates/ruff_python_formatter/src/expression/expr_name.rs

+6 -11

The debug assertion that a name's `id` matches its source text is removed: after NFKC normalization the two can legitimately differ for non-ASCII identifiers, and the formatter only needs the source range.

@@ -1,4 +1,4 @@
-use ruff_formatter::{write, FormatContext};
+use ruff_formatter::write;
 use ruff_python_ast::AnyNodeRef;
 use ruff_python_ast::ExprName;

@@ -11,16 +11,11 @@ pub struct FormatExprName;

 impl FormatNodeRule<ExprName> for FormatExprName {
     fn fmt_fields(&self, item: &ExprName, f: &mut PyFormatter) -> FormatResult<()> {
-        let ExprName { id, range, ctx: _ } = item;
-
-        debug_assert_eq!(
-            id.as_str(),
-            f.context()
-                .source_code()
-                .slice(*range)
-                .text(f.context().source_code())
-        );
-
+        let ExprName {
+            id: _,
+            range,
+            ctx: _,
+        } = item;
         write!(f, [source_text_slice(*range)])
     }


crates/ruff_python_parser/Cargo.toml

+1

@@ -28,6 +28,7 @@ rustc-hash = { workspace = true }
 static_assertions = { workspace = true }
 unicode-ident = { workspace = true }
 unicode_names2 = { workspace = true }
+unicode-normalization = { workspace = true }

 [dev-dependencies]
 insta = { workspace = true }

crates/ruff_python_parser/src/lexer.rs

+36 -4

@@ -32,6 +32,7 @@ use std::iter::FusedIterator;
 use std::{char, cmp::Ordering, str::FromStr};

 use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_normalization::UnicodeNormalization;

 use ruff_python_ast::{Int, IpyEscapeKind};
 use ruff_text_size::{TextLen, TextRange, TextSize};

@@ -197,10 +198,25 @@ impl<'source> Lexer<'source> {
             _ => {}
         }

-        self.cursor.eat_while(is_identifier_continuation);
+        // Keep track of whether the identifier is ASCII-only or not.
+        //
+        // This is important because Python applies NFKC normalization to
+        // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers.
+        // We therefore need to do the same in our lexer, but applying NFKC normalization
+        // unconditionally is extremely expensive. If we know an identifier is ASCII-only
+        // (by far the most common case), we can skip NFKC normalization of the identifier.
+        let mut is_ascii = first.is_ascii();
+        self.cursor
+            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));

         let text = self.token_text();

+        if !is_ascii {
+            return Ok(Tok::Name {
+                name: text.nfkc().collect::<String>().into_boxed_str(),
+            });
+        }
+
         let keyword = match text {
             "False" => Tok::False,
             "None" => Tok::None,

@@ -1583,14 +1599,19 @@ fn is_unicode_identifier_start(c: char) -> bool {
     is_xid_start(c)
 }

-// Checks if the character c is a valid continuation character as described
-// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-fn is_identifier_continuation(c: char) -> bool {
+/// Checks if the character c is a valid continuation character as described
+/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
+///
+/// Additionally, this function keeps track of whether the identifier as a
+/// whole is ASCII-only, by mutating the boolean flag that the caller
+/// passes in by reference.
+fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
     // Arrange things such that ASCII codepoints never
     // result in the slower `is_xid_continue` getting called.
     if c.is_ascii() {
         matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
     } else {
+        *identifier_is_ascii_only = false;
         is_xid_continue(c)
     }
 }

@@ -2042,6 +2063,17 @@ def f(arg=%timeit a = b):
         assert_debug_snapshot!(lex_source(source));
     }

+    fn get_tokens_only(source: &str) -> Vec<Tok> {
+        lex_source(source).into_iter().map(|(tok, _)| tok).collect()
+    }
+
+    #[test]
+    fn test_nfkc_normalization() {
+        let source1 = "𝒞 = 500";
+        let source2 = "C = 500";
+        assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
+    }
+
     fn triple_quoted_eol(eol: &str) -> Vec<Spanned> {
         let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
         lex_source(&source)
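
To make the optimization in this diff concrete: ASCII text is already in NFKC form, so normalization (and the String allocation it entails) can be skipped whenever an identifier is pure ASCII. A minimal standalone sketch, using a hypothetical helper name that does not appear in the commit:

    use unicode_normalization::UnicodeNormalization;

    // Hypothetical helper mirroring the lexer's fast path: pure-ASCII
    // identifiers are returned as-is, and only non-ASCII identifiers pay
    // for an NFKC pass plus a fresh allocation.
    fn normalize_identifier(text: &str) -> Box<str> {
        if text.is_ascii() {
            text.into()
        } else {
            text.nfkc().collect::<String>().into_boxed_str()
        }
    }

Note that the lexer avoids even the text.is_ascii() rescan in this sketch by threading the is_ascii flag through is_identifier_continuation as characters are consumed.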

crates/ruff_python_parser/src/token.rs

+3

@@ -16,6 +16,9 @@ pub enum Tok {
     /// Token value for a name, commonly known as an identifier.
     Name {
         /// The name value.
+        ///
+        /// Unicode names are NFKC-normalized by the lexer,
+        /// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers).
         name: Box<str>,
     },
     /// Token value for an integer.
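
Downstream consumers observe this guarantee directly on the token stream. A hedged usage sketch, assuming ruff_python_parser's public lexer::lex entry point and Mode enum as they existed at the time of this commit:

    use ruff_python_parser::{lexer::lex, Mode, Tok};

    fn main() {
        // The Name token for "𝒞" carries the NFKC-normalized text "C",
        // not the raw source spelling.
        for result in lex("𝒞 = 500", Mode::Module) {
            if let Ok((Tok::Name { name }, _)) = result {
                assert_eq!(&*name, "C");
            }
        }
    }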
