Skip to content

Commit 1401dbd

Browse files
committed
Normalise Hex and unicode escape sequences in string
1 parent 2951339 commit 1401dbd

File tree

11 files changed

+248
-123
lines changed

11 files changed

+248
-123
lines changed

crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/bytes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,5 @@
118118
b'c'
119119
)
120120
}
121+
122+
b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"

crates/ruff_python_formatter/resources/test/fixtures/ruff/expression/string.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,8 @@
133133

134134
# https://github.com/astral-sh/ruff/issues/7460
135135
trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
136+
137+
a = f"""\x1F"""
138+
a = """\x1F"""
139+
a = """\\x1F"""
140+
a = """\\\x1F"""

crates/ruff_python_formatter/src/lib.rs

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -211,13 +211,8 @@ if True:
211211
#[test]
212212
fn quick_test() {
213213
let source = r#"
214-
def main() -> None:
215-
if True:
216-
some_very_long_variable_name_abcdefghijk = Foo()
217-
some_very_long_variable_name_abcdefghijk = some_very_long_variable_name_abcdefghijk[
218-
some_very_long_variable_name_abcdefghijk.some_very_long_attribute_name
219-
== "This is a very long string abcdefghijk"
220-
]
214+
x = "\N{ox}\N{OX}"
215+
221216
222217
"#;
223218
let source_type = PySourceType::Python;

crates/ruff_python_formatter/src/other/bytes_literal.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use ruff_python_ast::BytesLiteral;
22
use ruff_text_size::Ranged;
33

44
use crate::prelude::*;
5+
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
56
use crate::string::{Quoting, StringPart};
67

78
#[derive(Default)]
@@ -17,6 +18,7 @@ impl FormatNodeRule<BytesLiteral> for FormatBytesLiteral {
1718
&locator,
1819
f.options().quote_style(),
1920
f.context().docstring(),
21+
is_hex_codes_in_unicode_sequences_enabled(f.context()),
2022
)
2123
.fmt(f)
2224
}

crates/ruff_python_formatter/src/other/f_string.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use ruff_python_ast::FString;
22
use ruff_text_size::Ranged;
33

44
use crate::prelude::*;
5+
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
56
use crate::string::{Quoting, StringPart};
67

78
/// Formats an f-string which is part of a larger f-string expression.
@@ -31,6 +32,7 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
3132
&locator,
3233
f.options().quote_style(),
3334
f.context().docstring(),
35+
is_hex_codes_in_unicode_sequences_enabled(f.context()),
3436
)
3537
.fmt(f);
3638

crates/ruff_python_formatter/src/other/string_literal.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use ruff_python_ast::StringLiteral;
22
use ruff_text_size::Ranged;
33

44
use crate::prelude::*;
5+
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
56
use crate::string::{docstring, Quoting, StringPart};
67
use crate::QuoteStyle;
78

@@ -61,6 +62,7 @@ impl Format<PyFormatContext<'_>> for FormatStringLiteral<'_> {
6162
&locator,
6263
quote_style,
6364
f.context().docstring(),
65+
is_hex_codes_in_unicode_sequences_enabled(f.context()),
6466
);
6567

6668
if self.layout.is_docstring() {

crates/ruff_python_formatter/src/preview.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,8 @@ pub(crate) const fn is_module_docstring_newlines_enabled(context: &PyFormatConte
5757
pub(crate) const fn is_dummy_implementations_enabled(context: &PyFormatContext) -> bool {
5858
context.is_preview()
5959
}
60+
61+
/// Returns `true` if the [`hex_codes_in_unicode_sequences`](https://github.com/psf/black/pull/2916) preview style is enabled.
///
/// The style has no dedicated option: it is enabled exactly when `context.is_preview()` is `true`.
62+
pub(crate) const fn is_hex_codes_in_unicode_sequences_enabled(context: &PyFormatContext) -> bool {
63+
context.is_preview()
64+
}

crates/ruff_python_formatter/src/string/mod.rs

Lines changed: 185 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ impl StringPart {
253253
locator: &'a Locator,
254254
configured_style: QuoteStyle,
255255
parent_docstring_quote_char: Option<QuoteChar>,
256+
normalize_hex: bool,
256257
) -> NormalizedString<'a> {
257258
// Per PEP 8, always prefer double quotes for triple-quoted strings.
258259
let preferred_style = if self.quotes.triple {
@@ -310,7 +311,7 @@ impl StringPart {
310311
configured_style
311312
};
312313

313-
let raw_content = locator.slice(self.content_range);
314+
let raw_content = &locator.slice(self.content_range);
314315

315316
let quotes = match quoting {
316317
Quoting::Preserve => self.quotes,
@@ -327,7 +328,7 @@ impl StringPart {
327328
}
328329
};
329330

330-
let normalized = normalize_string(locator.slice(self.content_range), quotes, self.prefix);
331+
let normalized = normalize_string(raw_content, quotes, self.prefix, normalize_hex);
331332

332333
NormalizedString {
333334
prefix: self.prefix,
@@ -423,6 +424,10 @@ impl StringPrefix {
423424
pub(super) const fn is_fstring(self) -> bool {
424425
self.contains(StringPrefix::F_STRING)
425426
}
427+
428+
/// Returns `true` if this prefix has the [`StringPrefix::BYTE`] flag set.
///
/// `normalize_string` passes the negation of this as `allow_unicode` to `UnicodeEscape::new`,
/// so `\u`, `\U` and `\N{..}` sequences are left untouched when the flag is set.
pub(super) const fn is_byte(self) -> bool {
429+
self.contains(StringPrefix::BYTE)
430+
}
426431
}
427432

428433
impl Format<PyFormatContext<'_>> for StringPrefix {
@@ -722,7 +727,12 @@ impl TryFrom<char> for QuoteChar {
722727
/// with the provided [`StringQuotes`] style.
723728
///
724729
/// Returns the normalized string.
725-
fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) -> Cow<str> {
730+
fn normalize_string(
731+
input: &str,
732+
quotes: StringQuotes,
733+
prefix: StringPrefix,
734+
normalize_hex: bool,
735+
) -> Cow<str> {
726736
// The normalized string if `input` is not yet normalized.
727737
// `output` must remain empty if `input` is already normalized.
728738
let mut output = String::new();
@@ -734,15 +744,15 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
734744
let preferred_quote = quote.as_char();
735745
let opposite_quote = quote.invert().as_char();
736746

737-
let mut chars = input.char_indices().peekable();
747+
let mut chars = input.char_indices();
738748

739749
let is_raw = prefix.is_raw_string();
740750
let is_fstring = prefix.is_fstring();
741751
let mut formatted_value_nesting = 0u32;
742752

743753
while let Some((index, c)) = chars.next() {
744754
if is_fstring && matches!(c, '{' | '}') {
745-
if chars.peek().copied().is_some_and(|(_, next)| next == c) {
755+
if chars.as_str().starts_with(c) {
746756
// Skip over the second character of the double braces
747757
chars.next();
748758
} else if c == '{' {
@@ -757,7 +767,7 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
757767
output.push_str(&input[last_index..index]);
758768

759769
// Skip over the '\r' character, keep the `\n`
760-
if chars.peek().copied().is_some_and(|(_, next)| next == '\n') {
770+
if chars.as_str().starts_with('\n') {
761771
chars.next();
762772
}
763773
// Replace the `\r` with a `\n`
@@ -766,24 +776,47 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
766776
}
767777

768778
last_index = index + '\r'.len_utf8();
769-
} else if !quotes.triple && !is_raw {
779+
} else if !is_raw {
770780
if c == '\\' {
771-
if let Some((_, next)) = chars.peek().copied() {
772-
#[allow(clippy::if_same_then_else)]
773-
if next == opposite_quote && formatted_value_nesting == 0 {
774-
// Remove the escape by ending before the backslash and starting again with the quote
775-
chars.next();
776-
output.push_str(&input[last_index..index]);
777-
last_index = index + '\\'.len_utf8();
778-
} else if next == preferred_quote {
779-
// Quote is already escaped, skip over it.
780-
chars.next();
781-
} else if next == '\\' {
781+
if let Some((_, next)) = chars.clone().next() {
782+
if !quotes.triple {
783+
#[allow(clippy::if_same_then_else)]
784+
if next == opposite_quote && formatted_value_nesting == 0 {
785+
// Remove the escape by ending before the backslash and starting again with the quote
786+
chars.next();
787+
output.push_str(&input[last_index..index]);
788+
last_index = index + '\\'.len_utf8();
789+
continue;
790+
} else if next == preferred_quote {
791+
// Quote is already escaped, skip over it.
792+
chars.next();
793+
continue;
794+
}
795+
}
796+
if next == '\\' {
782797
// Skip over escaped backslashes
783798
chars.next();
799+
} else if normalize_hex {
800+
if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte())
801+
.and_then(|escape| escape.normalize(&chars.as_str()[next.len_utf8()..]))
802+
{
803+
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
804+
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
805+
let escape_start_offset = index + escape_start_len;
806+
if let Cow::Owned(normalised) = &normalised {
807+
output.push_str(&input[last_index..escape_start_offset]);
808+
output.push_str(normalised);
809+
last_index = escape_start_offset + normalised.len();
810+
};
811+
812+
// Skip over the escape sequence characters
813+
for _ in 0..escape_start_len + normalised.len() {
814+
chars.next();
815+
}
816+
}
784817
}
785818
}
786-
} else if c == preferred_quote && formatted_value_nesting == 0 {
819+
} else if !quotes.triple && c == preferred_quote && formatted_value_nesting == 0 {
787820
// Escape the quote
788821
output.push_str(&input[last_index..index]);
789822
output.push('\\');
@@ -802,3 +835,136 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
802835

803836
normalized
804837
}
838+
839+
/// The kind of escape sequence that follows a backslash and whose spelling can be normalised.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum UnicodeEscape {
    /// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
    Hex(usize),

    /// An escaped unicode name (`\N{name}`)
    CharacterName,
}

impl UnicodeEscape {
    /// Classifies the escape introduced by `first`, the character directly after the backslash.
    ///
    /// `x` is always recognised. `u`, `U` and `N` are only recognised when `allow_unicode`
    /// is `true`. Returns `None` for every other character.
    fn new(first: char, allow_unicode: bool) -> Option<UnicodeEscape> {
        match first {
            'x' => Some(UnicodeEscape::Hex(2)),
            'u' if allow_unicode => Some(UnicodeEscape::Hex(4)),
            'U' if allow_unicode => Some(UnicodeEscape::Hex(8)),
            'N' if allow_unicode => Some(UnicodeEscape::CharacterName),
            _ => None,
        }
    }

    /// Normalises `\u..`, `\U..`, `\x..` and `\N{..}` escape sequences to:
    ///
    /// * `\u`, `\U` and `\x`: To use lower case for the characters `a-f`.
    /// * `\N`: To use uppercase letters
    ///
    /// `input` is the text that follows the backslash and the escape character, i.e. it
    /// starts at the first hex digit or at the `{` of a character name.
    ///
    /// Returns `None` if `input` doesn't start with a well-formed escape body: too few or
    /// non-hex digits, a missing `{` or `}`, a name shorter than two characters, or a name
    /// containing anything other than ASCII letters, ASCII digits, spaces and hyphens.
    ///
    /// Otherwise returns the escape body (for names including the surrounding braces):
    /// [`Cow::Borrowed`] if it is already normalised, [`Cow::Owned`] if letters had to be
    /// re-cased. Both have the same byte length as the consumed part of `input`.
    fn normalize(self, input: &str) -> Option<Cow<str>> {
        match self {
            UnicodeEscape::Hex(len) => {
                // `get` returns `None` if fewer than `len` bytes are left or if `len` isn't
                // a char boundary. The latter implies a non-ASCII, and thus invalid, digit.
                let digits = input.get(..len)?;

                if !digits.bytes().all(|byte| byte.is_ascii_hexdigit()) {
                    return None;
                }

                Some(if digits.bytes().any(|byte| byte.is_ascii_uppercase()) {
                    Cow::Owned(digits.to_ascii_lowercase())
                } else {
                    Cow::Borrowed(digits)
                })
            }
            UnicodeEscape::CharacterName => {
                let rest = input.strip_prefix('{')?;

                // Length of the leading run of characters that may appear in a name.
                // `None` means the run reaches the end of the input: unterminated escape.
                let name_len = rest.bytes().position(|byte| {
                    !(byte.is_ascii_alphanumeric() || matches!(byte, b' ' | b'-'))
                })?;

                // The run must be ended by the closing brace and not by some other
                // character, and the name must be at least two characters long.
                if name_len < 2 || rest.as_bytes().get(name_len) != Some(&b'}') {
                    return None;
                }

                // Everything up to here is ASCII, so the byte offset is a char boundary.
                let escape = &input[..'{'.len_utf8() + name_len + '}'.len_utf8()];

                Some(if escape.bytes().any(|byte| byte.is_ascii_lowercase()) {
                    Cow::Owned(escape.to_ascii_uppercase())
                } else {
                    Cow::Borrowed(escape)
                })
            }
        }
    }
}
955+
956+
#[cfg(test)]
mod tests {
    use crate::string::UnicodeEscape;
    use std::borrow::Cow;

    /// Upper case hex digits of a `\U` escape are lowered, which requires an owned string.
    #[test]
    fn normalize_32_escape() {
        let escape_sequence = UnicodeEscape::new('U', true).unwrap();

        assert_eq!(
            Some(Cow::Owned("0001f60e".to_string())),
            escape_sequence.normalize("0001F60E")
        );
    }

    /// An already normalised escape is returned as a borrowed slice that ends with the escape.
    #[test]
    fn normalize_already_normalized_hex_borrows() {
        let escape_sequence = UnicodeEscape::new('u', true).unwrap();

        assert!(matches!(
            escape_sequence.normalize("abcdEF"),
            Some(Cow::Borrowed("abcd"))
        ));
    }

    /// Sequences with too few or non-hex digits are not escape sequences and are left alone.
    #[test]
    fn normalize_rejects_malformed_hex() {
        let escape_sequence = UnicodeEscape::new('u', true).unwrap();

        assert_eq!(None, escape_sequence.normalize("123"));
        assert_eq!(None, escape_sequence.normalize("12G4"));
    }

    /// `\x` is the only escape that is recognised when unicode escapes aren't allowed.
    #[test]
    fn unicode_escapes_require_allow_unicode() {
        assert!(UnicodeEscape::new('u', false).is_none());
        assert!(UnicodeEscape::new('U', false).is_none());
        assert!(UnicodeEscape::new('N', false).is_none());

        let escape_sequence = UnicodeEscape::new('x', false).unwrap();
        assert_eq!(
            Some(Cow::Owned("1f".to_string())),
            escape_sequence.normalize("1F")
        );
    }

    /// Character names are upper cased; the returned text includes the braces.
    #[test]
    fn normalize_character_name() {
        let escape_sequence = UnicodeEscape::new('N', true).unwrap();

        assert_eq!(
            Some(Cow::Owned("{OX}".to_string())),
            escape_sequence.normalize("{ox}")
        );
        assert!(matches!(
            escape_sequence.normalize("{OX}"),
            Some(Cow::Borrowed("{OX}"))
        ));
        assert_eq!(None, escape_sequence.normalize("{a}"));
        assert_eq!(None, escape_sequence.normalize("{ab"));
    }
}

0 commit comments

Comments
 (0)