@@ -25,6 +25,12 @@ public class StringUtil {
25
25
// contains all possible line breaks, not omitting any break such as "\\n"
26
26
private static final Pattern LINE_BREAKS = Pattern .compile ("\\ r\\ n|\\ r|\\ n" );
27
27
private static final Pattern BRACED_TITLE_CAPITAL_PATTERN = Pattern .compile ("\\ {[A-Z]+\\ }" );
28
+ private static final Pattern NORMALIZE_PATTERN = Pattern .compile (
29
+ "\\ s+|" + // multiple whitespace
30
+ "\\ s*-+\\ s*|" + // hyphens with surrounding spaces
31
+ "\\ s*,\\ s*|" + // commas with surrounding spaces
32
+ "\\ s*;\\ s*|" + // semicolons with surrounding spaces
33
+ "\\ s*:\\ s*" ); // colons with surrounding spaces
28
34
private static final UnicodeToReadableCharMap UNICODE_CHAR_MAP = new UnicodeToReadableCharMap ();
29
35
30
36
public static String booleanToBinaryString (boolean expression ) {
@@ -755,4 +761,43 @@ public static String removeStringAtTheEnd(String string, String stringToBeRemove
755
761
public static boolean endsWithIgnoreCase (String string , String suffix ) {
756
762
return StringUtils .endsWithIgnoreCase (string , suffix );
757
763
}
764
+
765
+ /**
766
+ * Normalizes a string by standardizing whitespace and punctuation. This includes:
767
+ * - Trimming outer whitespace
768
+ * - Converting multiple whitespace characters to a single space
769
+ * - Converting line breaks to spaces
770
+ * - Standardizing formatting around punctuation (hyphens, commas, semicolons, colons)
771
+ *
772
+ * @param value The string to normalize
773
+ * @return The normalized string, or empty string if input is null
774
+ */
775
+ public static String normalize (String value ) {
776
+ if (value == null ) {
777
+ return "" ;
778
+ }
779
+
780
+ String withoutLineBreaks = LINE_BREAKS .matcher (value ).replaceAll (" " );
781
+
782
+ String trimmed = withoutLineBreaks .trim ();
783
+ return NORMALIZE_PATTERN .matcher (trimmed ).replaceAll (match -> {
784
+ String matchStr = match .group ();
785
+ if (matchStr .matches ("\\ s+" )) {
786
+ return " " ;
787
+ }
788
+ if (matchStr .matches ("\\ s*-+\\ s*" )) {
789
+ return "-" ;
790
+ }
791
+ if (matchStr .matches ("\\ s*,\\ s*" )) {
792
+ return ", " ;
793
+ }
794
+ if (matchStr .matches ("\\ s*;\\ s*" )) {
795
+ return "; " ;
796
+ }
797
+ if (matchStr .matches ("\\ s*:\\ s*" )) {
798
+ return ": " ;
799
+ }
800
+ return matchStr ;
801
+ });
802
+ }
758
803
}
0 commit comments