
Commit 92e6026

Apply NFKC normalization to unicode identifiers in the lexer (#10412)

1 parent bb54071 commit 92e6026
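
Python's lexer compares identifiers after NFKC normalization, so spellings such as 𝒞 (U+1D49E MATHEMATICAL SCRIPT CAPITAL C) and plain C name the same variable; this commit makes ruff's lexer do the same. As a minimal illustrative sketch (not code from the diff), the unicode-normalization crate added here exposes the nfkc() adapter the lexer relies on:

    use unicode_normalization::UnicodeNormalization;

    fn main() {
        // NFKC folds compatibility characters: "𝒞" normalizes to "C",
        // which is why Python treats the two spellings as one identifier.
        let normalized: String = "𝒞".nfkc().collect();
        assert_eq!(normalized, "C");
    }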

File tree

9 files changed (+68 -15 lines)

Cargo.lock

+1

(Generated file; diff not rendered by default.)

Cargo.toml

+1

@@ -108,6 +108,7 @@ unic-ucd-category = { version = "0.9" }
 unicode-ident = { version = "1.0.12" }
 unicode-width = { version = "0.1.11" }
 unicode_names2 = { version = "1.2.2" }
+unicode-normalization = { version = "0.1.23" }
 ureq = { version = "2.9.6" }
 url = { version = "2.5.0" }
 uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] }
New file: F821_28.py (pyflakes test fixture)

+9

@@ -0,0 +1,9 @@
+"""Test that unicode identifiers are NFKC-normalised"""
+
+𝒞 = 500
+print(𝒞)
+print(C + 𝒞) # 2 references to the same variable due to NFKC normalization
+print(C / 𝒞)
+print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+
+print(𝒟) # F821

crates/ruff_linter/src/rules/pyflakes/mod.rs

+1

@@ -156,6 +156,7 @@ mod tests {
     #[test_case(Rule::UndefinedName, Path::new("F821_26.py"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_26.pyi"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_27.py"))]
+    #[test_case(Rule::UndefinedName, Path::new("F821_28.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.pyi"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_1.py"))]
New file: pyflakes snapshot for F821_28.py

+10

@@ -0,0 +1,10 @@
+---
+source: crates/ruff_linter/src/rules/pyflakes/mod.rs
+---
+F821_28.py:9:7: F821 Undefined name `𝒟`
+  |
+7 | print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+8 |
+9 | print(𝒟) # F821
+  |       ^ F821
+  |

crates/ruff_python_formatter/src/expression/expr_name.rs

+6 -11

The debug assertion that a name's `id` matches its source text is removed: after NFKC normalization the two can legitimately differ for non-ASCII identifiers, and the formatter only needs the source range.

@@ -1,4 +1,4 @@
-use ruff_formatter::{write, FormatContext};
+use ruff_formatter::write;
 use ruff_python_ast::AnyNodeRef;
 use ruff_python_ast::ExprName;

@@ -11,16 +11,11 @@ pub struct FormatExprName;

 impl FormatNodeRule<ExprName> for FormatExprName {
     fn fmt_fields(&self, item: &ExprName, f: &mut PyFormatter) -> FormatResult<()> {
-        let ExprName { id, range, ctx: _ } = item;
-
-        debug_assert_eq!(
-            id.as_str(),
-            f.context()
-                .source_code()
-                .slice(*range)
-                .text(f.context().source_code())
-        );
-
+        let ExprName {
+            id: _,
+            range,
+            ctx: _,
+        } = item;
         write!(f, [source_text_slice(*range)])
     }


crates/ruff_python_parser/Cargo.toml

+1

@@ -28,6 +28,7 @@ rustc-hash = { workspace = true }
 static_assertions = { workspace = true }
 unicode-ident = { workspace = true }
 unicode_names2 = { workspace = true }
+unicode-normalization = { workspace = true }

 [dev-dependencies]
 insta = { workspace = true }

crates/ruff_python_parser/src/lexer.rs

+36 -4

@@ -32,6 +32,7 @@ use std::iter::FusedIterator;
 use std::{char, cmp::Ordering, str::FromStr};

 use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_normalization::UnicodeNormalization;

 use ruff_python_ast::{Int, IpyEscapeKind};
 use ruff_text_size::{TextLen, TextRange, TextSize};

@@ -197,10 +198,25 @@ impl<'source> Lexer<'source> {
             _ => {}
         }

-        self.cursor.eat_while(is_identifier_continuation);
+        // Keep track of whether the identifier is ASCII-only or not.
+        //
+        // This is important because Python applies NFKC normalization to
+        // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers.
+        // We therefore need to do the same in our lexer, but applying NFKC normalization
+        // unconditionally is extremely expensive. If we know an identifier is ASCII-only
+        // (by far the most common case), we can skip NFKC normalization of the identifier.
+        let mut is_ascii = first.is_ascii();
+        self.cursor
+            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));

         let text = self.token_text();

+        if !is_ascii {
+            return Ok(Tok::Name {
+                name: text.nfkc().collect::<String>().into_boxed_str(),
+            });
+        }
+
         let keyword = match text {
             "False" => Tok::False,
             "None" => Tok::None,

@@ -1583,14 +1599,19 @@ fn is_unicode_identifier_start(c: char) -> bool {
     is_xid_start(c)
 }

-// Checks if the character c is a valid continuation character as described
-// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-fn is_identifier_continuation(c: char) -> bool {
+/// Checks if the character c is a valid continuation character as described
+/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
+///
+/// Additionally, this function keeps track of whether the identifier as a
+/// whole is ASCII-only, by mutating the boolean flag that the caller
+/// passes in by reference.
+fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
     // Arrange things such that ASCII codepoints never
     // result in the slower `is_xid_continue` getting called.
     if c.is_ascii() {
         matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
     } else {
+        *identifier_is_ascii_only = false;
         is_xid_continue(c)
     }
 }

@@ -2042,6 +2063,17 @@ def f(arg=%timeit a = b):
         assert_debug_snapshot!(lex_source(source));
     }

+    fn get_tokens_only(source: &str) -> Vec<Tok> {
+        lex_source(source).into_iter().map(|(tok, _)| tok).collect()
+    }
+
+    #[test]
+    fn test_nfkc_normalization() {
+        let source1 = "𝒞 = 500";
+        let source2 = "C = 500";
+        assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
+    }
+
     fn triple_quoted_eol(eol: &str) -> Vec<Spanned> {
         let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
         lex_source(&source)
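
To make the optimization in this diff concrete: ASCII text is already in NFKC form, so normalization (and the String allocation it entails) can be skipped whenever an identifier is pure ASCII. A minimal standalone sketch, using a hypothetical helper name that does not appear in the commit:

    use unicode_normalization::UnicodeNormalization;

    // Hypothetical helper mirroring the lexer's fast path: pure-ASCII
    // identifiers are returned as-is, and only non-ASCII identifiers pay
    // for an NFKC pass plus a fresh allocation.
    fn normalize_identifier(text: &str) -> Box<str> {
        if text.is_ascii() {
            text.into()
        } else {
            text.nfkc().collect::<String>().into_boxed_str()
        }
    }

Note that the lexer avoids even the text.is_ascii() rescan in this sketch by threading the is_ascii flag through is_identifier_continuation as characters are consumed.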

crates/ruff_python_parser/src/token.rs

+3

@@ -16,6 +16,9 @@ pub enum Tok {
     /// Token value for a name, commonly known as an identifier.
     Name {
         /// The name value.
+        ///
+        /// Unicode names are NFKC-normalized by the lexer,
+        /// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers).
         name: Box<str>,
     },
     /// Token value for an integer.
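
Downstream consumers observe this guarantee directly on the token stream. A hedged usage sketch, assuming ruff_python_parser's public lexer::lex entry point and Mode enum as they existed at the time of this commit:

    use ruff_python_parser::{lexer::lex, Mode, Tok};

    fn main() {
        // The Name token for "𝒞" carries the NFKC-normalized text "C",
        // not the raw source spelling.
        for result in lex("𝒞 = 500", Mode::Module) {
            if let Ok((Tok::Name { name }, _)) = result {
                assert_eq!(&*name, "C");
            }
        }
    }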
