diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java index 6b9a7fcc5562..9994eb3c6282 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java @@ -608,7 +608,24 @@ public static Automaton makeStringUnion(Iterable utf8Strings) { if (utf8Strings.iterator().hasNext() == false) { return makeEmpty(); } else { - return StringsToAutomaton.build(utf8Strings, false); + return StringsToAutomaton.build(utf8Strings, false, false, false); + } + } + + /** + * Returns a new (deterministic and minimal) automaton that accepts the union of the given + * collection of {@link BytesRef}s representing UTF-8 encoded strings, matching each character in either its lower-case or upper-case form. + * + * @param utf8Strings The input strings, UTF-8 encoded and already lower-case. The collection must be in sorted order. + * @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint + * based (full unicode codepoints on transitions). + */ + public static Automaton makeCaseInsensitiveStringUnion( + Iterable utf8Strings, boolean turkic) { + if (utf8Strings.iterator().hasNext() == false) { + return makeEmpty(); + } else { + return StringsToAutomaton.build(utf8Strings, false, true, turkic); } } @@ -625,7 +642,7 @@ public static Automaton makeBinaryStringUnion(Iterable utf8Strings) { if (utf8Strings.iterator().hasNext() == false) { return makeEmpty(); } else { - return StringsToAutomaton.build(utf8Strings, true); + return StringsToAutomaton.build(utf8Strings, true, false, false); } } @@ -638,7 +655,7 @@ public static Automaton makeBinaryStringUnion(Iterable utf8Strings) { * based (full unicode codepoints on transitions). 
*/ public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOException { - return StringsToAutomaton.build(utf8Strings, false); + return StringsToAutomaton.build(utf8Strings, false, false, false); } /** @@ -651,6 +668,6 @@ public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOE * based (UTF-8 encoded byte transition labels). */ public static Automaton makeBinaryStringUnion(BytesRefIterator utf8Strings) throws IOException { - return StringsToAutomaton.build(utf8Strings, true); + return StringsToAutomaton.build(utf8Strings, true, false, false); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java index dba1e6438e23..988ae6c3f681 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java @@ -743,4 +743,42 @@ static int[] lookupAlternates(int codepoint) { return alts; } + + /** + * Folds the case of the given character according to {@link Character#toLowerCase(int)}, but with + * exceptions if the turkic flag is set. + * + * @param codepoint the code point for the character to fold + * @param turkic if true, then apply tr/az folding rules + * @return the folded character + */ + static int foldCase(int codepoint, boolean turkic) { + if (turkic) { + if (codepoint == 0x00130) { // İ [LATIN CAPITAL LETTER I WITH DOT ABOVE] + return 0x00069; // i [LATIN SMALL LETTER I] + } else if (codepoint == 0x000049) { // I [LATIN CAPITAL LETTER I] + return 0x00131; // ı [LATIN SMALL LETTER DOTLESS I] + } + } + return Character.toLowerCase(codepoint); + } + + /** + * Attempts to convert the given character to upper case, according to {@link + * Character#toUpperCase(int)}, but with exceptions if the turkic flag is set. 
+ * + * @param codepoint the code point for the character to convert to upper case + * @param turkic if true, then apply tr/az folding rules + * @return the upper case character + */ + static int upperCase(int codepoint, boolean turkic) { + if (turkic) { + if (codepoint == 0x00069) { // i [LATIN SMALL LETTER I] + return 0x00130; // İ [LATIN CAPITAL LETTER I WITH DOT ABOVE] + } else if (codepoint == 0x00131) { // ı [LATIN SMALL LETTER DOTLESS I] + return 0x000049; // I [LATIN CAPITAL LETTER I] + } + } + return Character.toUpperCase(codepoint); + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java index 342efe059786..7fe75d6925e5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java @@ -758,7 +758,7 @@ private Automaton toAutomaton( * @param codepoint the Character code point to encode as an Automaton * @return the original codepoint and the set of alternates */ - private int[] toCaseInsensitiveChar(int codepoint) { + static int[] toCaseInsensitiveChar(int codepoint) { int[] altCodepoints = CaseFolding.lookupAlternates(codepoint); if (altCodepoints != null) { int[] concat = new int[altCodepoints.length + 1]; diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java index 58a081fa6a21..f3474592f3b0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java @@ -40,10 +40,13 @@ * @see Automata#makeBinaryStringUnion(BytesRefIterator) */ final class StringsToAutomaton { + private final boolean caseInsensitive; + private final boolean turkic; /** The default constructor is private. Use static methods directly. 
*/ - private StringsToAutomaton() { - super(); + private StringsToAutomaton(boolean caseInsensitive, boolean turkic) { + this.caseInsensitive = caseInsensitive; + this.turkic = turkic; } /** DFSA state with char labels on transitions. */ @@ -195,7 +198,11 @@ private boolean setPrevious(BytesRef current) { /** Internal recursive traversal for conversion. */ private static int convert( - Automaton.Builder a, State s, IdentityHashMap visited) { + Automaton.Builder a, + State s, + IdentityHashMap visited, + boolean caseInsensitive, + boolean turkic) { Integer converted = visited.get(s); if (converted != null) { @@ -209,7 +216,15 @@ private static int convert( int i = 0; int[] labels = s.labels; for (StringsToAutomaton.State target : s.states) { - a.addTransition(converted, convert(a, target, visited), labels[i++]); + int label = labels[i++]; + int dest = convert(a, target, visited, caseInsensitive, turkic); + a.addTransition(converted, dest, label); + if (caseInsensitive) { + int altCase = CaseFolding.upperCase(label, turkic); + if (altCase != label) { + a.addTransition(converted, dest, altCase); + } + } } return converted; @@ -227,7 +242,7 @@ private Automaton completeAndConvert() { // Convert: Automaton.Builder a = new Automaton.Builder(); - convert(a, root, new IdentityHashMap<>()); + convert(a, root, new IdentityHashMap<>(), caseInsensitive, turkic); return a.finish(); } @@ -237,8 +252,12 @@ private Automaton completeAndConvert() { * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code * asBinary}. 
*/ - static Automaton build(Iterable input, boolean asBinary) { - final StringsToAutomaton builder = new StringsToAutomaton(); + static Automaton build( + Iterable input, boolean asBinary, boolean caseInsensitive, boolean turkic) { + if (asBinary && caseInsensitive) { + throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton"); + } + final StringsToAutomaton builder = new StringsToAutomaton(caseInsensitive, turkic); for (BytesRef b : input) { builder.add(b, asBinary); @@ -253,8 +272,13 @@ static Automaton build(Iterable input, boolean asBinary) { * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code * asBinary}. */ - static Automaton build(BytesRefIterator input, boolean asBinary) throws IOException { - final StringsToAutomaton builder = new StringsToAutomaton(); + static Automaton build( + BytesRefIterator input, boolean asBinary, boolean caseInsensitive, boolean turkic) + throws IOException { + if (asBinary && caseInsensitive) { + throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton"); + } + final StringsToAutomaton builder = new StringsToAutomaton(caseInsensitive, turkic); for (BytesRef b = input.next(); b != null; b = input.next()) { builder.add(b, asBinary); @@ -293,6 +317,10 @@ private void add(BytesRef current, boolean asBinary) { } else { while (pos < max) { codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint); + if (caseInsensitive + && codePoint.codePoint != CaseFolding.foldCase(codePoint.codePoint, turkic)) { + throw new IllegalArgumentException("Case-insensitive input must be lower-case"); + } next = state.lastChild(codePoint.codePoint); if (next == null) { break; diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index efaa451258bb..82fe68087f2f 100644 --- 
a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -26,6 +26,8 @@ import java.util.Iterator; import java.util.List; import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; @@ -47,6 +49,26 @@ public void testBasic() throws Exception { checkMinimized(a); } + public void testCaseInsensitive() throws Exception { + List terms = basicTerms(); + Collections.sort(terms); + + Automaton a = buildCaseInsensitive(terms, false); + checkAutomaton(terms, a, false, true); + checkMinimized(a); + } + + public void testCornerCase() throws Exception { + List terms = + Stream.of("aib", "aıc") + .map(LuceneTestCase::newBytesRef) + .sorted() + .collect(Collectors.toCollection(ArrayList::new)); + Automaton a = buildCaseInsensitive(terms, true); + System.out.println(a.toDot()); + assertTrue(a.isDeterministic()); + } + public void testBasicBinary() throws Exception { List terms = basicTerms(); Collections.sort(terms); @@ -84,6 +106,46 @@ public void testRandomMinimized() throws Exception { } } + public void testRandomMinimizedCaseInsensitive() throws Exception { + int iters = RandomizedTest.isNightly() ? 
20 : 5; + for (int i = 0; i < iters; i++) { + int size = random().nextInt(2, 50); + Set terms = new HashSet<>(); + List automatonList = new ArrayList<>(size); + boolean turkic = random().nextBoolean(); + for (int j = 0; j < size; j++) { + String s = TestUtil.randomRealisticUnicodeString(random(), 8); + int[] lowercased = s.codePoints().map(c -> CaseFolding.foldCase(c, turkic)).toArray(); + s = new String(lowercased, 0, lowercased.length); + terms.add(newBytesRef(s)); + List charAutomata = + s.codePoints() + .mapToObj( + c -> { + Automaton a = Automata.makeChar(c); + int altCase = CaseFolding.upperCase(c, turkic); + if (altCase != c) { + return Operations.union(List.of(a, Automata.makeChar(altCase))); + } + return a; + }) + .collect(Collectors.toList()); + if (charAutomata.isEmpty()) { + automatonList.add(Automata.makeEmptyString()); + } else { + automatonList.add(Operations.concatenate(charAutomata)); + } + } + List sortedTerms = terms.stream().sorted().toList(); + + Automaton expected = + MinimizationOperations.minimize( + Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + Automaton actual = buildCaseInsensitive(sortedTerms, turkic); + assertSameAutomaton(expected, actual); + } + } + public void testRandomUnicodeOnly() throws Exception { testRandom(false); } @@ -131,6 +193,11 @@ private void testRandom(boolean allowBinary) throws Exception { } private void checkAutomaton(List expected, Automaton a, boolean isBinary) { + checkAutomaton(expected, a, isBinary, false); + } + + private void checkAutomaton( + List expected, Automaton a, boolean isBinary, boolean caseInsensitive) { CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary); ByteRunAutomaton runAutomaton = c.runAutomaton; @@ -141,12 +208,14 @@ private void checkAutomaton(List expected, Automaton a, boolean isBina readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length)); } - // Make sure every term produced by the automaton is expected 
- BytesRefBuilder scratch = new BytesRefBuilder(); - FiniteStringsIterator it = new FiniteStringsIterator(c.automaton); - for (IntsRef r = it.next(); r != null; r = it.next()) { - BytesRef t = Util.toBytesRef(r, scratch); - assertTrue(expected.contains(t)); + if (caseInsensitive == false) { + // Make sure every term produced by the automaton is expected + BytesRefBuilder scratch = new BytesRefBuilder(); + FiniteStringsIterator it = new FiniteStringsIterator(c.automaton); + for (IntsRef r = it.next(); r != null; r = it.next()) { + BytesRef t = Util.toBytesRef(r, scratch); + assertTrue(expected.contains(t)); + } } } @@ -174,9 +243,18 @@ private List basicTerms() { private Automaton build(Collection terms, boolean asBinary) throws IOException { if (random().nextBoolean()) { - return StringsToAutomaton.build(terms, asBinary); + return StringsToAutomaton.build(terms, asBinary, false, false); + } else { + return StringsToAutomaton.build(new TermIterator(terms), asBinary, false, false); + } + } + + private Automaton buildCaseInsensitive(Collection terms, boolean turkic) + throws IOException { + if (random().nextBoolean()) { + return StringsToAutomaton.build(terms, false, true, turkic); } else { - return StringsToAutomaton.build(new TermIterator(terms), asBinary); + return StringsToAutomaton.build(new TermIterator(terms), false, true, turkic); } }