Skip to content

Commit ac8c59d

Browse files
committed
Refactor: Move string normalization from StringNormalizer to StringUtil
- Move normalize() method to StringUtil - Use Pattern.compile for NORMALIZE_PATTERN - Update shouldUpdateField() method in MergingIdBasedFetcher to use StringUtil.normalize()
1 parent e8645a2 commit ac8c59d

File tree

2 files changed

+49
-19
lines changed

2 files changed

+49
-19
lines changed

src/main/java/org/jabref/logic/importer/fetcher/MergingIdBasedFetcher.java

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import org.jabref.model.entry.BibEntry;
1313
import org.jabref.model.entry.field.Field;
1414
import org.jabref.model.entry.field.StandardField;
15+
import org.jabref.model.strings.StringUtil;
1516

1617
import org.mariadb.jdbc.internal.logging.Logger;
1718
import org.mariadb.jdbc.internal.logging.LoggerFactory;
@@ -116,13 +117,12 @@ private Set<Field> updateFieldsFromSource(BibEntry sourceEntry,
116117
.collect(Collectors.toSet());
117118
}
118119

119-
// Decides whether the given field should be updated: returns true when the
// normalized value of the field in sourceEntry differs from the normalized
// value in targetEntry. A field that is absent from an entry is compared as
// the empty string.
private boolean shouldUpdateField(Field field, BibEntry sourceEntry,
120-
BibEntry targetEntry) {
120+
private boolean shouldUpdateField(Field field, BibEntry sourceEntry, BibEntry targetEntry) {
121121
String sourceValue = sourceEntry.getField(field)
122-
.map(StringNormalizer::normalize)
122+
.map(StringUtil::normalize)
123123
.orElse("");
124124
String targetValue = targetEntry.getField(field)
125-
.map(StringNormalizer::normalize)
125+
.map(StringUtil::normalize)
126126
.orElse("");
127127
// Both sides are normalized first, so values that differ only in spacing
// around whitespace/punctuation compare as equal and do not trigger an update.
return !sourceValue.equals(targetValue);
128128
}
@@ -136,18 +136,3 @@ private void updateField(Field field, BibEntry sourceEntry,
136136
}
137137
}
138138

139-
/**
 * Normalizes strings so that values differing only in whitespace or in the
 * spacing around hyphens, commas, semicolons and colons compare as equal.
 */
class StringNormalizer {
    // Compiled once; String.replaceAll would recompile each regex on every call.
    private static final java.util.regex.Pattern WHITESPACE_RUN =
            java.util.regex.Pattern.compile("\\s+");
    private static final java.util.regex.Pattern AROUND_HYPHENS =
            java.util.regex.Pattern.compile("\\s*-+\\s*");
    private static final java.util.regex.Pattern AROUND_COMMA =
            java.util.regex.Pattern.compile("\\s*,\\s*");
    private static final java.util.regex.Pattern AROUND_SEMICOLON =
            java.util.regex.Pattern.compile("\\s*;\\s*");
    private static final java.util.regex.Pattern AROUND_COLON =
            java.util.regex.Pattern.compile("\\s*:\\s*");

    /**
     * Normalizes whitespace and punctuation spacing of the given string.
     * <p>
     * Steps, applied in this order: trim; collapse every whitespace run (including
     * line breaks) to a single space; collapse a run of hyphens with surrounding
     * whitespace to a single {@code "-"}; rewrite commas, semicolons and colons with
     * surrounding whitespace to the punctuation mark followed by exactly one space.
     *
     * @param value the string to normalize, may be {@code null}
     * @return the normalized string, or the empty string if {@code value} is {@code null}
     */
    public static String normalize(String value) {
        if (value == null) {
            return "";
        }
        // "\\s" already covers '\r' and '\n', so no separate line-break pass is needed
        // once whitespace runs have been collapsed.
        String result = WHITESPACE_RUN.matcher(value.trim()).replaceAll(" ");
        // The remaining passes are order-sensitive (a later pass sees the spaces an
        // earlier pass inserted), so they stay sequential in the original order.
        result = AROUND_HYPHENS.matcher(result).replaceAll("-");
        result = AROUND_COMMA.matcher(result).replaceAll(", ");
        result = AROUND_SEMICOLON.matcher(result).replaceAll("; ");
        return AROUND_COLON.matcher(result).replaceAll(": ");
    }
}
153-

src/main/java/org/jabref/model/strings/StringUtil.java

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@ public class StringUtil {
2525
// contains all possible line breaks, not omitting any break such as "\\n"
2626
private static final Pattern LINE_BREAKS = Pattern.compile("\\r\\n|\\r|\\n");
2727
private static final Pattern BRACED_TITLE_CAPITAL_PATTERN = Pattern.compile("\\{[A-Z]+\\}");
28+
// Matches one "separator" to be canonicalized by normalize(). Alternatives are tried
// left to right at each position, so the punctuation forms (which may start with
// whitespace) must be listed before the bare-whitespace form; otherwise the whitespace
// in front of a punctuation mark is matched on its own and "a - b" becomes "a -b".
private static final Pattern NORMALIZE_PATTERN = Pattern.compile(
        "\\s*-+\\s*|" + // hyphens with surrounding spaces
        "\\s*,\\s*|" +  // commas with surrounding spaces
        "\\s*;\\s*|" +  // semicolons with surrounding spaces
        "\\s*:\\s*|" +  // colons with surrounding spaces
        "\\s+");        // any remaining whitespace run
2834
private static final UnicodeToReadableCharMap UNICODE_CHAR_MAP = new UnicodeToReadableCharMap();
2935

3036
public static String booleanToBinaryString(boolean expression) {
@@ -755,4 +761,43 @@ public static String removeStringAtTheEnd(String string, String stringToBeRemove
755761
public static boolean endsWithIgnoreCase(String string, String suffix) {
756762
// Thin wrapper: the check is delegated unchanged to StringUtils.endsWithIgnoreCase.
return StringUtils.endsWithIgnoreCase(string, suffix);
757763
}
764+
765+
/**
766+
* Normalizes a string by standardizing whitespace and punctuation. This includes:
767+
* - Trimming outer whitespace
768+
* - Converting multiple whitespace characters to a single space
769+
* - Converting line breaks to spaces
770+
* - Standardizing formatting around punctuation (hyphens, commas, semicolons, colons)
771+
*
772+
* @param value The string to normalize
773+
* @return The normalized string, or empty string if input is null
774+
*/
775+
public static String normalize(String value) {
776+
if (value == null) {
777+
return "";
778+
}
779+
780+
String withoutLineBreaks = LINE_BREAKS.matcher(value).replaceAll(" ");
781+
782+
String trimmed = withoutLineBreaks.trim();
783+
return NORMALIZE_PATTERN.matcher(trimmed).replaceAll(match -> {
784+
String matchStr = match.group();
785+
if (matchStr.matches("\\s+")) {
786+
return " ";
787+
}
788+
if (matchStr.matches("\\s*-+\\s*")) {
789+
return "-";
790+
}
791+
if (matchStr.matches("\\s*,\\s*")) {
792+
return ", ";
793+
}
794+
if (matchStr.matches("\\s*;\\s*")) {
795+
return "; ";
796+
}
797+
if (matchStr.matches("\\s*:\\s*")) {
798+
return ": ";
799+
}
800+
return matchStr;
801+
});
802+
}
758803
}

0 commit comments

Comments
 (0)