@@ -253,6 +253,7 @@ impl StringPart {
253
253
locator : & ' a Locator ,
254
254
configured_style : QuoteStyle ,
255
255
parent_docstring_quote_char : Option < QuoteChar > ,
256
+ normalize_hex : bool ,
256
257
) -> NormalizedString < ' a > {
257
258
// Per PEP 8, always prefer double quotes for triple-quoted strings.
258
259
let preferred_style = if self . quotes . triple {
@@ -310,7 +311,7 @@ impl StringPart {
310
311
configured_style
311
312
} ;
312
313
313
- let raw_content = locator. slice ( self . content_range ) ;
314
+ let raw_content = & locator. slice ( self . content_range ) ;
314
315
315
316
let quotes = match quoting {
316
317
Quoting :: Preserve => self . quotes ,
@@ -327,7 +328,7 @@ impl StringPart {
327
328
}
328
329
} ;
329
330
330
- let normalized = normalize_string ( locator . slice ( self . content_range ) , quotes, self . prefix ) ;
331
+ let normalized = normalize_string ( raw_content , quotes, self . prefix , normalize_hex ) ;
331
332
332
333
NormalizedString {
333
334
prefix : self . prefix ,
@@ -423,6 +424,10 @@ impl StringPrefix {
423
424
pub ( super ) const fn is_fstring ( self ) -> bool {
424
425
self . contains ( StringPrefix :: F_STRING )
425
426
}
427
+
428
+ pub ( super ) const fn is_byte ( self ) -> bool {
429
+ self . contains ( StringPrefix :: BYTE )
430
+ }
426
431
}
427
432
428
433
impl Format < PyFormatContext < ' _ > > for StringPrefix {
@@ -722,7 +727,12 @@ impl TryFrom<char> for QuoteChar {
722
727
/// with the provided [`StringQuotes`] style.
723
728
///
724
729
/// Returns the normalized string.
725
- fn normalize_string ( input : & str , quotes : StringQuotes , prefix : StringPrefix ) -> Cow < str > {
730
+ fn normalize_string (
731
+ input : & str ,
732
+ quotes : StringQuotes ,
733
+ prefix : StringPrefix ,
734
+ normalize_hex : bool ,
735
+ ) -> Cow < str > {
726
736
// The normalized string if `input` is not yet normalized.
727
737
// `output` must remain empty if `input` is already normalized.
728
738
let mut output = String :: new ( ) ;
@@ -766,24 +776,50 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
766
776
}
767
777
768
778
last_index = index + '\r' . len_utf8 ( ) ;
769
- } else if !quotes . triple && ! is_raw {
779
+ } else if !is_raw {
770
780
if c == '\\' {
771
- if let Some ( ( _, next) ) = chars. peek ( ) . copied ( ) {
772
- #[ allow( clippy:: if_same_then_else) ]
773
- if next == opposite_quote && formatted_value_nesting == 0 {
774
- // Remove the escape by ending before the backslash and starting again with the quote
775
- chars. next ( ) ;
776
- output. push_str ( & input[ last_index..index] ) ;
777
- last_index = index + '\\' . len_utf8 ( ) ;
778
- } else if next == preferred_quote {
779
- // Quote is already escaped, skip over it.
780
- chars. next ( ) ;
781
- } else if next == '\\' {
781
+ if let Some ( ( _, next) ) = chars. clone ( ) . next ( ) {
782
+ if next == '\\' {
782
783
// Skip over escaped backslashes
783
784
chars. next ( ) ;
785
+ } else if normalize_hex {
786
+ if let Some ( normalised) = UnicodeEscape :: new ( next, !prefix. is_byte ( ) )
787
+ . and_then ( |escape| {
788
+ escape. normalize ( & input[ index + c. len_utf8 ( ) + next. len_utf8 ( ) ..] )
789
+ } )
790
+ {
791
+ // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
792
+ let escape_start_len = '\\' . len_utf8 ( ) + next. len_utf8 ( ) ;
793
+ let escape_start_offset = index + escape_start_len;
794
+ if let Cow :: Owned ( normalised) = & normalised {
795
+ output. push_str ( & input[ last_index..escape_start_offset] ) ;
796
+ output. push_str ( normalised) ;
797
+ last_index = escape_start_offset + normalised. len ( ) ;
798
+ } ;
799
+
800
+ // Move the `chars` iterator past the escape sequence.
801
+ // Simply reassigning `chars` doesn't work because the indices would
802
+ // then be off.
803
+ for _ in 0 ..next. len_utf8 ( ) + normalised. len ( ) {
804
+ chars. next ( ) ;
805
+ }
806
+ }
807
+ }
808
+
809
+ if !quotes. triple {
810
+ #[ allow( clippy:: if_same_then_else) ]
811
+ if next == opposite_quote && formatted_value_nesting == 0 {
812
+ // Remove the escape by ending before the backslash and starting again with the quote
813
+ chars. next ( ) ;
814
+ output. push_str ( & input[ last_index..index] ) ;
815
+ last_index = index + '\\' . len_utf8 ( ) ;
816
+ } else if next == preferred_quote {
817
+ // Quote is already escaped, skip over it.
818
+ chars. next ( ) ;
819
+ }
784
820
}
785
821
}
786
- } else if c == preferred_quote && formatted_value_nesting == 0 {
822
+ } else if !quotes . triple && c == preferred_quote && formatted_value_nesting == 0 {
787
823
// Escape the quote
788
824
output. push_str ( & input[ last_index..index] ) ;
789
825
output. push ( '\\' ) ;
@@ -802,3 +838,153 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
802
838
803
839
normalized
804
840
}
841
+
842
/// The kind of escape sequence introduced by the character that follows a backslash.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum UnicodeEscape {
    /// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
    /// The payload is the number of hex digits the sequence requires.
    Hex(usize),

    /// An escaped unicode name (`\N{name}`).
    CharacterName,
}

impl UnicodeEscape {
    /// Classifies an escape sequence by `first`, the character right after the backslash.
    ///
    /// `x` is always recognised. `u`, `U` and `N` are only recognised when
    /// `allow_unicode` is `true`. Every other character yields `None`.
    fn new(first: char, allow_unicode: bool) -> Option<UnicodeEscape> {
        match (first, allow_unicode) {
            ('x', _) => Some(UnicodeEscape::Hex(2)),
            ('u', true) => Some(UnicodeEscape::Hex(4)),
            ('U', true) => Some(UnicodeEscape::Hex(8)),
            ('N', true) => Some(UnicodeEscape::CharacterName),
            _ => None,
        }
    }

    /// Normalises the casing of the escape sequence at the start of `input`.
    ///
    /// `input` is the text *after* the backslash and the kind character, e.g.
    /// `0001F60E...` for `\U0001F60E...` or `{bullet}...` for `\N{bullet}...`.
    ///
    /// * `\x`, `\u` and `\U`: the hex digits `A-F` are lower-cased.
    /// * `\N`: the letters of the name are upper-cased.
    ///
    /// Returns `None` if `input` does not start with a well-formed sequence of this kind.
    /// Otherwise returns the sequence (without anything that trails it): borrowed from
    /// `input` when it already has the normalised casing, owned when it had to change.
    fn normalize(self, input: &str) -> Option<Cow<str>> {
        match self {
            UnicodeEscape::Hex(digits) => Self::normalize_hex(input, digits),
            UnicodeEscape::CharacterName => Self::normalize_character_name(input),
        }
    }

    /// Handles the fixed-width hex escapes: the first `digits` characters must all be
    /// ASCII hex digits.
    fn normalize_hex(input: &str, digits: usize) -> Option<Cow<'_, str>> {
        // `get` fails when fewer than `digits` bytes remain or when the cut would land
        // inside a multi-byte character; neither can be a valid hex escape.
        let payload = input.get(..digits)?;

        if !payload.bytes().all(|byte| byte.is_ascii_hexdigit()) {
            return None;
        }

        // After the check above, an upper-case byte can only be one of `A-F`.
        let needs_lowering = payload.bytes().any(|byte| byte.is_ascii_uppercase());

        Some(if needs_lowering {
            Cow::Owned(payload.to_ascii_lowercase())
        } else {
            Cow::Borrowed(payload)
        })
    }

    /// Handles `{name}`: a brace-delimited name made of ASCII letters, digits, spaces
    /// and hyphens.
    fn normalize_character_name(input: &str) -> Option<Cow<'_, str>> {
        let after_brace = input.strip_prefix('{')?;

        // Byte offset of the first character that cannot be part of a name.
        // No such character means the sequence is unterminated.
        let name_len = after_brace
            .find(|c: char| !(c.is_ascii_alphanumeric() || c == ' ' || c == '-'))?;

        // Whatever ends the name must be the closing brace.
        if !after_brace[name_len..].starts_with('}') {
            return None;
        }

        // Name must be at least two characters long.
        if name_len < 2 {
            return None;
        }

        let escape = &input[..'{'.len_utf8() + name_len + '}'.len_utf8()];
        let needs_raising = escape.bytes().any(|byte| byte.is_ascii_lowercase());

        Some(if needs_raising {
            // The braces are not letters, so upper-casing leaves them untouched.
            Cow::Owned(escape.to_ascii_uppercase())
        } else {
            Cow::Borrowed(escape)
        })
    }
}
958
+
959
#[cfg(test)]
mod tests {
    use crate::string::{normalize_string, QuoteChar, StringPrefix, StringQuotes, UnicodeEscape};
    use std::borrow::Cow;

    /// Upper-case hex digits of a `\U` escape are lower-cased.
    #[test]
    fn normalize_32_escape() {
        let escape_sequence = UnicodeEscape::new('U', true).unwrap();

        assert_eq!(
            Some(Cow::Owned("0001f60e".to_string())),
            escape_sequence.normalize("0001F60E")
        );
    }

    /// Every `\x` escape of a byte string is normalised, not only the first one.
    #[test]
    fn normalize_hex_in_byte_string() {
        let input = r"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A";

        let normalized = normalize_string(
            input,
            StringQuotes {
                triple: false,
                quote_char: QuoteChar::Double,
            },
            StringPrefix::BYTE,
            true,
        );

        assert_eq!(r"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", &normalized);
    }

    /// Lower-case letters in a `\N{..}` name are upper-cased and text after the closing
    /// brace is not part of the result.
    #[test]
    fn normalize_character_name() {
        let escape_sequence = UnicodeEscape::new('N', true).unwrap();

        assert_eq!(
            Some(Cow::Owned("{BULLET}".to_string())),
            escape_sequence.normalize("{bullet} trailing")
        );
    }

    /// A sequence that already has the normalised casing is returned without allocating.
    #[test]
    fn already_normalized_is_borrowed() {
        let hex = UnicodeEscape::new('x', false).unwrap();
        assert!(matches!(hex.normalize("0a"), Some(Cow::Borrowed("0a"))));

        let name = UnicodeEscape::new('N', true).unwrap();
        assert!(matches!(
            name.normalize("{BULLET}"),
            Some(Cow::Borrowed("{BULLET}"))
        ));
    }

    /// Malformed sequences are reported as `None` so that they are left untouched.
    #[test]
    fn invalid_escapes_are_rejected() {
        let hex = UnicodeEscape::new('u', true).unwrap();
        // Too short.
        assert_eq!(None, hex.normalize("12"));
        // Not a hex digit.
        assert_eq!(None, hex.normalize("12G4"));

        let name = UnicodeEscape::new('N', true).unwrap();
        // Missing opening brace.
        assert_eq!(None, name.normalize("BULLET}"));
        // Missing closing brace.
        assert_eq!(None, name.normalize("{BULLET"));
        // Name shorter than two characters.
        assert_eq!(None, name.normalize("{a}"));
        // Character that is not accepted in a name.
        assert_eq!(None, name.normalize("{BUL_LET}"));
    }

    /// Without unicode support only `\x` is recognised.
    #[test]
    fn unicode_escapes_require_allow_unicode() {
        assert_eq!(Some(UnicodeEscape::Hex(2)), UnicodeEscape::new('x', false));
        assert_eq!(None, UnicodeEscape::new('u', false));
        assert_eq!(None, UnicodeEscape::new('U', false));
        assert_eq!(None, UnicodeEscape::new('N', false));
    }
}
0 commit comments