@@ -253,6 +253,7 @@ impl StringPart {
253
253
locator : & ' a Locator ,
254
254
configured_style : QuoteStyle ,
255
255
parent_docstring_quote_char : Option < QuoteChar > ,
256
+ normalize_hex : bool ,
256
257
) -> NormalizedString < ' a > {
257
258
// Per PEP 8, always prefer double quotes for triple-quoted strings.
258
259
let preferred_style = if self . quotes . triple {
@@ -310,7 +311,7 @@ impl StringPart {
310
311
configured_style
311
312
} ;
312
313
313
- let raw_content = locator. slice ( self . content_range ) ;
314
+ let raw_content = & locator. slice ( self . content_range ) ;
314
315
315
316
let quotes = match quoting {
316
317
Quoting :: Preserve => self . quotes ,
@@ -327,7 +328,7 @@ impl StringPart {
327
328
}
328
329
} ;
329
330
330
- let normalized = normalize_string ( locator . slice ( self . content_range ) , quotes, self . prefix ) ;
331
+ let normalized = normalize_string ( raw_content , quotes, self . prefix , normalize_hex ) ;
331
332
332
333
NormalizedString {
333
334
prefix : self . prefix ,
@@ -423,6 +424,10 @@ impl StringPrefix {
423
424
pub ( super ) const fn is_fstring ( self ) -> bool {
424
425
self . contains ( StringPrefix :: F_STRING )
425
426
}
427
+
428
+ pub ( super ) const fn is_byte ( self ) -> bool {
429
+ self . contains ( StringPrefix :: BYTE )
430
+ }
426
431
}
427
432
428
433
impl Format < PyFormatContext < ' _ > > for StringPrefix {
@@ -722,7 +727,12 @@ impl TryFrom<char> for QuoteChar {
722
727
/// with the provided [`StringQuotes`] style.
723
728
///
724
729
/// Returns the normalized string.
725
- fn normalize_string ( input : & str , quotes : StringQuotes , prefix : StringPrefix ) -> Cow < str > {
730
+ fn normalize_string (
731
+ input : & str ,
732
+ quotes : StringQuotes ,
733
+ prefix : StringPrefix ,
734
+ normalize_hex : bool ,
735
+ ) -> Cow < str > {
726
736
// The normalized string if `input` is not yet normalized.
727
737
// `output` must remain empty if `input` is already normalized.
728
738
let mut output = String :: new ( ) ;
@@ -734,15 +744,15 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
734
744
let preferred_quote = quote. as_char ( ) ;
735
745
let opposite_quote = quote. invert ( ) . as_char ( ) ;
736
746
737
- let mut chars = input. char_indices ( ) . peekable ( ) ;
747
+ let mut chars = input. char_indices ( ) ;
738
748
739
749
let is_raw = prefix. is_raw_string ( ) ;
740
750
let is_fstring = prefix. is_fstring ( ) ;
741
751
let mut formatted_value_nesting = 0u32 ;
742
752
743
753
while let Some ( ( index, c) ) = chars. next ( ) {
744
754
if is_fstring && matches ! ( c, '{' | '}' ) {
745
- if chars. peek ( ) . copied ( ) . is_some_and ( | ( _ , next ) | next == c) {
755
+ if chars. as_str ( ) . starts_with ( c) {
746
756
// Skip over the second character of the double braces
747
757
chars. next ( ) ;
748
758
} else if c == '{' {
@@ -757,7 +767,7 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
757
767
output. push_str ( & input[ last_index..index] ) ;
758
768
759
769
// Skip over the '\r' character, keep the `\n`
760
- if chars. peek ( ) . copied ( ) . is_some_and ( | ( _ , next ) | next == '\n' ) {
770
+ if chars. as_str ( ) . starts_with ( '\n' ) {
761
771
chars. next ( ) ;
762
772
}
763
773
// Replace the `\r` with a `\n`
@@ -766,24 +776,47 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
766
776
}
767
777
768
778
last_index = index + '\r' . len_utf8 ( ) ;
769
- } else if !quotes . triple && ! is_raw {
779
+ } else if !is_raw {
770
780
if c == '\\' {
771
- if let Some ( ( _, next) ) = chars. peek ( ) . copied ( ) {
772
- #[ allow( clippy:: if_same_then_else) ]
773
- if next == opposite_quote && formatted_value_nesting == 0 {
774
- // Remove the escape by ending before the backslash and starting again with the quote
775
- chars. next ( ) ;
776
- output. push_str ( & input[ last_index..index] ) ;
777
- last_index = index + '\\' . len_utf8 ( ) ;
778
- } else if next == preferred_quote {
779
- // Quote is already escaped, skip over it.
780
- chars. next ( ) ;
781
- } else if next == '\\' {
781
+ if let Some ( ( _, next) ) = chars. clone ( ) . next ( ) {
782
+ if !quotes. triple {
783
+ #[ allow( clippy:: if_same_then_else) ]
784
+ if next == opposite_quote && formatted_value_nesting == 0 {
785
+ // Remove the escape by ending before the backslash and starting again with the quote
786
+ chars. next ( ) ;
787
+ output. push_str ( & input[ last_index..index] ) ;
788
+ last_index = index + '\\' . len_utf8 ( ) ;
789
+ continue ;
790
+ } else if next == preferred_quote {
791
+ // Quote is already escaped, skip over it.
792
+ chars. next ( ) ;
793
+ continue ;
794
+ }
795
+ }
796
+ if next == '\\' {
782
797
// Skip over escaped backslashes
783
798
chars. next ( ) ;
799
+ } else if normalize_hex {
800
+ if let Some ( normalised) = UnicodeEscape :: new ( next, !prefix. is_byte ( ) )
801
+ . and_then ( |escape| escape. normalize ( & chars. as_str ( ) [ next. len_utf8 ( ) ..] ) )
802
+ {
803
+ // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
804
+ let escape_start_len = '\\' . len_utf8 ( ) + next. len_utf8 ( ) ;
805
+ let escape_start_offset = index + escape_start_len;
806
+ if let Cow :: Owned ( normalised) = & normalised {
807
+ output. push_str ( & input[ last_index..escape_start_offset] ) ;
808
+ output. push_str ( normalised) ;
809
+ last_index = escape_start_offset + normalised. len ( ) ;
810
+ } ;
811
+
812
+ // Skip over the escape sequence characters
813
+ for _ in 0 ..escape_start_len + normalised. len ( ) {
814
+ chars. next ( ) ;
815
+ }
816
+ }
784
817
}
785
818
}
786
- } else if c == preferred_quote && formatted_value_nesting == 0 {
819
+ } else if !quotes . triple && c == preferred_quote && formatted_value_nesting == 0 {
787
820
// Escape the quote
788
821
output. push_str ( & input[ last_index..index] ) ;
789
822
output. push ( '\\' ) ;
@@ -802,3 +835,136 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
802
835
803
836
normalized
804
837
}
838
+
839
/// The kind of escape sequence selected by the character that follows a backslash.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum UnicodeEscape {
    /// A hex escape sequence; the payload is the number of hex digits that follow:
    /// 2 (`\x`), 4 (`\u`) or 8 (`\U`).
    Hex(usize),

    /// An escaped unicode character name (`\N{name}`).
    CharacterName,
}

impl UnicodeEscape {
    /// Classifies the escape from `first`, the character directly after the backslash.
    ///
    /// `x` is always recognised; `u`, `U` and `N` are only recognised when
    /// `allow_unicode` is `true`. Any other character yields `None`.
    fn new(first: char, allow_unicode: bool) -> Option<UnicodeEscape> {
        match (first, allow_unicode) {
            ('x', _) => Some(UnicodeEscape::Hex(2)),
            ('u', true) => Some(UnicodeEscape::Hex(4)),
            ('U', true) => Some(UnicodeEscape::Hex(8)),
            ('N', true) => Some(UnicodeEscape::CharacterName),
            _ => None,
        }
    }

    /// Normalises the body of a `\x..`, `\u..`, `\U..` or `\N{..}` escape sequence.
    ///
    /// `input` must start right after the escape character (`x`, `u`, `U` or `N`):
    ///
    /// * `\x`, `\u` and `\U`: the hex digits `A-F` are lower-cased.
    /// * `\N`: the letters of the character name are upper-cased.
    ///
    /// Returns `None` if `input` does not start with a well-formed escape body.
    /// Otherwise returns the escape body (including the braces for `\N`): borrowed from
    /// `input` if it is already normalised, owned if its casing had to be changed.
    fn normalize(self, input: &str) -> Option<Cow<str>> {
        match self {
            UnicodeEscape::Hex(digit_count) => {
                // `get` fails when fewer than `digit_count` bytes are left, or when the cut
                // would split a multi-byte character; neither can be a valid hex escape.
                let digits = input.get(..digit_count)?;

                if !digits.bytes().all(|byte| byte.is_ascii_hexdigit()) {
                    // Not a valid escape sequence.
                    return None;
                }

                // Only `A-F` can be upper case here, everything else is kept as is.
                let needs_lowering = digits.bytes().any(|byte| byte.is_ascii_uppercase());

                Some(if needs_lowering {
                    Cow::Owned(digits.to_ascii_lowercase())
                } else {
                    Cow::Borrowed(digits)
                })
            }
            UnicodeEscape::CharacterName => {
                let after_brace = input.strip_prefix('{')?;

                // Position of the first character that can't be part of a name. No such
                // character means the escape sequence is unterminated.
                let name_end = after_brace.find(|c: char| {
                    !matches!(c, '0'..='9' | 'A'..='Z' | 'a'..='z' | ' ' | '-')
                })?;
                let (name, tail) = after_brace.split_at(name_end);

                // The name must be closed by `}` and be at least two characters long.
                if !tail.starts_with('}') || name.len() < 2 {
                    return None;
                }

                // The name only contains ASCII, so its byte length is its character count.
                let escape = &input[..'{'.len_utf8() + name.len() + '}'.len_utf8()];
                let needs_raising = name.bytes().any(|byte| byte.is_ascii_lowercase());

                Some(if needs_raising {
                    Cow::Owned(escape.to_ascii_uppercase())
                } else {
                    Cow::Borrowed(escape)
                })
            }
        }
    }
}
955
+
956
#[cfg(test)]
mod tests {
    use crate::string::UnicodeEscape;
    use std::borrow::Cow;

    /// An eight digit (`\U`) escape with upper-case hex digits is rewritten to lower case.
    #[test]
    fn normalize_32_escape() {
        let expected: Option<Cow<str>> = Some(Cow::Owned(String::from("0001f60e")));
        let actual = UnicodeEscape::new('U', true)
            .unwrap()
            .normalize("0001F60E");

        assert_eq!(expected, actual);
    }
}
0 commit comments