From 036668ac373a4e57ba7e0c52fec4ef085d8524b3 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Wed, 12 Mar 2025 18:06:51 -0700 Subject: [PATCH 1/2] [DRAFT] Case-insensitive matching over union of strings --- .../lucene/util/automaton/Automata.java | 24 ++++- .../apache/lucene/util/automaton/RegExp.java | 2 +- .../util/automaton/StringsToAutomaton.java | 44 +++++++-- .../automaton/TestStringsToAutomaton.java | 97 ++++++++++++++++--- 4 files changed, 139 insertions(+), 28 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java index 6b9a7fcc5562..fe9a350dec1f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java @@ -608,7 +608,23 @@ public static Automaton makeStringUnion(Iterable utf8Strings) { if (utf8Strings.iterator().hasNext() == false) { return makeEmpty(); } else { - return StringsToAutomaton.build(utf8Strings, false); + return StringsToAutomaton.build(utf8Strings, false, false); + } + } + + /** + * Returns a new (deterministic and minimal) automaton that accepts the union of the given + * collection of {@link BytesRef}s representing UTF-8 encoded strings. + * + * @param utf8Strings The input strings, UTF-8 encoded. The collection must be in sorted order. + * @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint + * based (full unicode codepoints on transitions). 
+ */ + public static Automaton makeCaseInsensitiveStringUnion(Iterable utf8Strings) { + if (utf8Strings.iterator().hasNext() == false) { + return makeEmpty(); + } else { + return StringsToAutomaton.build(utf8Strings, false, true); } } @@ -625,7 +641,7 @@ public static Automaton makeBinaryStringUnion(Iterable utf8Strings) { if (utf8Strings.iterator().hasNext() == false) { return makeEmpty(); } else { - return StringsToAutomaton.build(utf8Strings, true); + return StringsToAutomaton.build(utf8Strings, true, false); } } @@ -638,7 +654,7 @@ public static Automaton makeBinaryStringUnion(Iterable utf8Strings) { * based (full unicode codepoints on transitions). */ public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOException { - return StringsToAutomaton.build(utf8Strings, false); + return StringsToAutomaton.build(utf8Strings, false, false); } /** @@ -651,6 +667,6 @@ public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOE * based (UTF-8 encoded byte transition labels). 
*/ public static Automaton makeBinaryStringUnion(BytesRefIterator utf8Strings) throws IOException { - return StringsToAutomaton.build(utf8Strings, true); + return StringsToAutomaton.build(utf8Strings, true, false); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java index 342efe059786..7fe75d6925e5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java @@ -758,7 +758,7 @@ private Automaton toAutomaton( * @param codepoint the Character code point to encode as an Automaton * @return the original codepoint and the set of alternates */ - private int[] toCaseInsensitiveChar(int codepoint) { + static int[] toCaseInsensitiveChar(int codepoint) { int[] altCodepoints = CaseFolding.lookupAlternates(codepoint); if (altCodepoints != null) { int[] concat = new int[altCodepoints.length + 1]; diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java index 58a081fa6a21..b66ad2793e52 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java @@ -195,7 +195,10 @@ private boolean setPrevious(BytesRef current) { /** Internal recursive traversal for conversion. 
*/ private static int convert( - Automaton.Builder a, State s, IdentityHashMap visited) { + Automaton.Builder a, + State s, + IdentityHashMap visited, + boolean caseInsensitive) { Integer converted = visited.get(s); if (converted != null) { @@ -209,7 +212,25 @@ private static int convert( int i = 0; int[] labels = s.labels; for (StringsToAutomaton.State target : s.states) { - a.addTransition(converted, convert(a, target, visited), labels[i++]); + int label = labels[i++]; + int dest = convert(a, target, visited, caseInsensitive); + a.addTransition(converted, dest, label); + if (caseInsensitive) { + int[] alternatives = CaseFolding.lookupAlternates(label); + if (alternatives != null) { + for (int alt : alternatives) { + a.addTransition(converted, dest, alt); + } + } else { + int altCase = + Character.isLowerCase(label) + ? Character.toUpperCase(label) + : Character.toLowerCase(label); + if (altCase != label) { + a.addTransition(converted, dest, altCase); + } + } + } } return converted; @@ -219,7 +240,7 @@ private static int convert( * Called after adding all terms. Performs final minimization and converts to a standard {@link * Automaton} instance. */ - private Automaton completeAndConvert() { + private Automaton completeAndConvert(boolean caseInsensitive) { // Final minimization: if (this.stateRegistry == null) throw new IllegalStateException(); if (root.hasChildren()) replaceOrRegister(root); @@ -227,7 +248,7 @@ private Automaton completeAndConvert() { // Convert: Automaton.Builder a = new Automaton.Builder(); - convert(a, root, new IdentityHashMap<>()); + convert(a, root, new IdentityHashMap<>(), caseInsensitive); return a.finish(); } @@ -237,14 +258,17 @@ private Automaton completeAndConvert() { * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code * asBinary}. 
*/ - static Automaton build(Iterable input, boolean asBinary) { + static Automaton build(Iterable input, boolean asBinary, boolean caseInsensitive) { + if (asBinary && caseInsensitive) { + throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton"); + } final StringsToAutomaton builder = new StringsToAutomaton(); for (BytesRef b : input) { builder.add(b, asBinary); } - return builder.completeAndConvert(); + return builder.completeAndConvert(caseInsensitive); } /** @@ -253,14 +277,18 @@ static Automaton build(Iterable input, boolean asBinary) { * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code * asBinary}. */ - static Automaton build(BytesRefIterator input, boolean asBinary) throws IOException { + static Automaton build(BytesRefIterator input, boolean asBinary, boolean caseInsensitive) + throws IOException { + if (asBinary && caseInsensitive) { + throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton"); + } final StringsToAutomaton builder = new StringsToAutomaton(); for (BytesRef b = input.next(); b != null; b = input.next()) { builder.add(b, asBinary); } - return builder.completeAndConvert(); + return builder.completeAndConvert(caseInsensitive); } private void add(BytesRef current, boolean asBinary) { diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index efaa451258bb..5316f594a910 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -26,6 +26,7 @@ import java.util.Iterator; import java.util.List; import java.util.Set; +import java.util.stream.Collectors; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import 
org.apache.lucene.tests.util.automaton.AutomatonTestUtil; @@ -42,16 +43,25 @@ public void testBasic() throws Exception { List terms = basicTerms(); Collections.sort(terms); - Automaton a = build(terms, false); + Automaton a = build(terms, false, false); checkAutomaton(terms, a, false); checkMinimized(a); } + public void testCaseInsensitive() throws Exception { + List terms = basicTerms(); + Collections.sort(terms); + + Automaton a = build(terms, false, true); + checkAutomaton(terms, a, false, true); + checkMinimized(a); + } + public void testBasicBinary() throws Exception { List terms = basicTerms(); Collections.sort(terms); - Automaton a = build(terms, true); + Automaton a = build(terms, true, false); checkAutomaton(terms, a, true); checkMinimized(a); } @@ -79,7 +89,56 @@ public void testRandomMinimized() throws Exception { Automaton expected = MinimizationOperations.minimize( Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); - Automaton actual = build(sortedTerms, buildBinary); + Automaton actual = build(sortedTerms, buildBinary, false); + assertSameAutomaton(expected, actual); + } + } + + public void testRandomMinimizedCaseInsensitive() throws Exception { + int iters = RandomizedTest.isNightly() ? 20 : 5; + for (int i = 0; i < iters; i++) { + int size = random().nextInt(2, 50); + Set terms = new HashSet<>(); + List automatonList = new ArrayList<>(size); + for (int j = 0; j < size; j++) { + String s = TestUtil.randomRealisticUnicodeString(random(), 8); + terms.add(newBytesRef(s)); + List charAutomata = + s.codePoints() + .mapToObj( + c -> { + List caseAutomata = new ArrayList<>(); + caseAutomata.add(Automata.makeChar(c)); + int[] alternates = CaseFolding.lookupAlternates(c); + if (alternates != null) { + for (int alt : alternates) { + caseAutomata.add(Automata.makeChar(alt)); + } + } else { + int altCase = + Character.isLowerCase(c) + ? 
Character.toUpperCase(c) + : Character.toLowerCase(c); + + if (altCase != c) { + caseAutomata.add(Automata.makeChar(altCase)); + } + } + return Operations.union(caseAutomata); + }) + .collect(Collectors.toList()); + if (charAutomata.isEmpty()) { + automatonList.add(Automata.makeEmptyString()); + } else { + automatonList.add(Operations.concatenate(charAutomata)); + } + } + List sortedTerms = terms.stream().sorted().toList(); + + Automaton expected = + MinimizationOperations.minimize( + Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + Automaton actual = build(sortedTerms, false, true); assertSameAutomaton(expected, actual); } } @@ -98,7 +157,7 @@ public void testLargeTerms() throws Exception { IllegalArgumentException e = expectThrows( IllegalArgumentException.class, - () -> build(Collections.singleton(new BytesRef(b10k)), false)); + () -> build(Collections.singleton(new BytesRef(b10k)), false, false)); assertTrue( e.getMessage() .startsWith( @@ -107,7 +166,7 @@ public void testLargeTerms() throws Exception { + " UTF-8 bytes")); byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000); - build(Collections.singleton(new BytesRef(b1k)), false); // no exception + build(Collections.singleton(new BytesRef(b1k)), false, false); // no exception } private void testRandom(boolean allowBinary) throws Exception { @@ -125,12 +184,17 @@ private void testRandom(boolean allowBinary) throws Exception { } List sorted = terms.stream().sorted().toList(); - Automaton a = build(sorted, allowBinary); + Automaton a = build(sorted, allowBinary, false); checkAutomaton(sorted, a, allowBinary); } } private void checkAutomaton(List expected, Automaton a, boolean isBinary) { + checkAutomaton(expected, a, isBinary, false); + } + + private void checkAutomaton( + List expected, Automaton a, boolean isBinary, boolean caseInsensitive) { CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary); ByteRunAutomaton runAutomaton = c.runAutomaton; @@ -141,12 +205,14 @@ 
private void checkAutomaton(List expected, Automaton a, boolean isBina readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length)); } - // Make sure every term produced by the automaton is expected - BytesRefBuilder scratch = new BytesRefBuilder(); - FiniteStringsIterator it = new FiniteStringsIterator(c.automaton); - for (IntsRef r = it.next(); r != null; r = it.next()) { - BytesRef t = Util.toBytesRef(r, scratch); - assertTrue(expected.contains(t)); + if (caseInsensitive == false) { + // Make sure every term produced by the automaton is expected + BytesRefBuilder scratch = new BytesRefBuilder(); + FiniteStringsIterator it = new FiniteStringsIterator(c.automaton); + for (IntsRef r = it.next(); r != null; r = it.next()) { + BytesRef t = Util.toBytesRef(r, scratch); + assertTrue(expected.contains(t)); + } } } @@ -172,11 +238,12 @@ private List basicTerms() { return terms; } - private Automaton build(Collection terms, boolean asBinary) throws IOException { + private Automaton build(Collection terms, boolean asBinary, boolean caseInsensitive) + throws IOException { if (random().nextBoolean()) { - return StringsToAutomaton.build(terms, asBinary); + return StringsToAutomaton.build(terms, asBinary, caseInsensitive); } else { - return StringsToAutomaton.build(new TermIterator(terms), asBinary); + return StringsToAutomaton.build(new TermIterator(terms), asBinary, caseInsensitive); } } From b39474f09aa8e5c933c142ad4438227f6e735f65 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Mon, 17 Mar 2025 17:57:19 -0700 Subject: [PATCH 2/2] Only accept lowercase input, generate transitions for uppercase --- .../lucene/util/automaton/Automata.java | 13 ++-- .../lucene/util/automaton/CaseFolding.java | 38 +++++++++++ .../util/automaton/StringsToAutomaton.java | 50 +++++++------- .../automaton/TestStringsToAutomaton.java | 67 +++++++++++-------- 4 files changed, 109 insertions(+), 59 deletions(-) diff --git 
a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java index fe9a350dec1f..9994eb3c6282 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java @@ -608,7 +608,7 @@ public static Automaton makeStringUnion(Iterable utf8Strings) { if (utf8Strings.iterator().hasNext() == false) { return makeEmpty(); } else { - return StringsToAutomaton.build(utf8Strings, false, false); + return StringsToAutomaton.build(utf8Strings, false, false, false); } } @@ -620,11 +620,12 @@ public static Automaton makeStringUnion(Iterable utf8Strings) { * @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint * based (full unicode codepoints on transitions). */ - public static Automaton makeCaseInsensitiveStringUnion(Iterable utf8Strings) { + public static Automaton makeCaseInsensitiveStringUnion( + Iterable utf8Strings, boolean turkic) { if (utf8Strings.iterator().hasNext() == false) { return makeEmpty(); } else { - return StringsToAutomaton.build(utf8Strings, false, true); + return StringsToAutomaton.build(utf8Strings, false, true, turkic); } } @@ -641,7 +642,7 @@ public static Automaton makeBinaryStringUnion(Iterable utf8Strings) { if (utf8Strings.iterator().hasNext() == false) { return makeEmpty(); } else { - return StringsToAutomaton.build(utf8Strings, true, false); + return StringsToAutomaton.build(utf8Strings, true, false, false); } } @@ -654,7 +655,7 @@ public static Automaton makeBinaryStringUnion(Iterable utf8Strings) { * based (full unicode codepoints on transitions). 
*/ public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOException { - return StringsToAutomaton.build(utf8Strings, false, false); + return StringsToAutomaton.build(utf8Strings, false, false, false); } /** @@ -667,6 +668,6 @@ public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOE * based (UTF-8 encoded byte transition labels). */ public static Automaton makeBinaryStringUnion(BytesRefIterator utf8Strings) throws IOException { - return StringsToAutomaton.build(utf8Strings, true, false); + return StringsToAutomaton.build(utf8Strings, true, false, false); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java index dba1e6438e23..988ae6c3f681 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java @@ -743,4 +743,42 @@ static int[] lookupAlternates(int codepoint) { return alts; } + + /** + * Folds the case of the given character according to {@link Character#toLowerCase(int)}, but with + * exceptions if the turkic flag is set. + * + * @param codepoint the code point for the character to fold + * @param turkic if true, then apply tr/az folding rules + * @return the folded character + */ + static int foldCase(int codepoint, boolean turkic) { + if (turkic) { + if (codepoint == 0x00130) { // İ [LATIN CAPITAL LETTER I WITH DOT ABOVE] + return 0x00069; // i [LATIN SMALL LETTER I] + } else if (codepoint == 0x000049) { // I [LATIN CAPITAL LETTER I] + return 0x00131; // ı [LATIN SMALL LETTER DOTLESS I] + } + } + return Character.toLowerCase(codepoint); + } + + /** + * Attempts to convert the given character to upper case, according to {@link + * Character#toUpperCase(int)}, but with exceptions if the turkic flag is set.
+ * + * @param codepoint the code point for the character to convert to upper case + * @param turkic if true, then apply tr/az folding rules + * @return the upper case character + */ + static int upperCase(int codepoint, boolean turkic) { + if (turkic) { + if (codepoint == 0x00069) { // i [LATIN SMALL LETTER I] + return 0x00130; // İ [LATIN CAPITAL LETTER I WITH DOT ABOVE] + } else if (codepoint == 0x00131) { // ı [LATIN SMALL LETTER DOTLESS I] + return 0x000049; // I [LATIN CAPITAL LETTER I] + } + } + return Character.toUpperCase(codepoint); + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java index b66ad2793e52..f3474592f3b0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java @@ -40,10 +40,13 @@ * @see Automata#makeBinaryStringUnion(BytesRefIterator) */ final class StringsToAutomaton { + private final boolean caseInsensitive; + private final boolean turkic; /** The default constructor is private. Use static methods directly. */ - private StringsToAutomaton() { - super(); + private StringsToAutomaton(boolean caseInsensitive, boolean turkic) { + this.caseInsensitive = caseInsensitive; + this.turkic = turkic; } /** DFSA state with char labels on transitions.
*/ @@ -198,7 +201,8 @@ private static int convert( Automaton.Builder a, State s, IdentityHashMap visited, - boolean caseInsensitive) { + boolean caseInsensitive, + boolean turkic) { Integer converted = visited.get(s); if (converted != null) { @@ -213,22 +217,12 @@ private static int convert( int[] labels = s.labels; for (StringsToAutomaton.State target : s.states) { int label = labels[i++]; - int dest = convert(a, target, visited, caseInsensitive); + int dest = convert(a, target, visited, caseInsensitive, turkic); a.addTransition(converted, dest, label); if (caseInsensitive) { - int[] alternatives = CaseFolding.lookupAlternates(label); - if (alternatives != null) { - for (int alt : alternatives) { - a.addTransition(converted, dest, alt); - } - } else { - int altCase = - Character.isLowerCase(label) - ? Character.toUpperCase(label) - : Character.toLowerCase(label); - if (altCase != label) { - a.addTransition(converted, dest, altCase); - } + int altCase = CaseFolding.upperCase(label, turkic); + if (altCase != label) { + a.addTransition(converted, dest, altCase); } } } @@ -240,7 +234,7 @@ private static int convert( * Called after adding all terms. Performs final minimization and converts to a standard {@link * Automaton} instance. */ - private Automaton completeAndConvert(boolean caseInsensitive) { + private Automaton completeAndConvert() { // Final minimization: if (this.stateRegistry == null) throw new IllegalStateException(); if (root.hasChildren()) replaceOrRegister(root); @@ -248,7 +242,7 @@ private Automaton completeAndConvert(boolean caseInsensitive) { // Convert: Automaton.Builder a = new Automaton.Builder(); - convert(a, root, new IdentityHashMap<>(), caseInsensitive); + convert(a, root, new IdentityHashMap<>(), caseInsensitive, turkic); return a.finish(); } @@ -258,17 +252,18 @@ private Automaton completeAndConvert(boolean caseInsensitive) { * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code * asBinary}. 
*/ - static Automaton build(Iterable input, boolean asBinary, boolean caseInsensitive) { + static Automaton build( + Iterable input, boolean asBinary, boolean caseInsensitive, boolean turkic) { if (asBinary && caseInsensitive) { throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton"); } - final StringsToAutomaton builder = new StringsToAutomaton(); + final StringsToAutomaton builder = new StringsToAutomaton(caseInsensitive, turkic); for (BytesRef b : input) { builder.add(b, asBinary); } - return builder.completeAndConvert(caseInsensitive); + return builder.completeAndConvert(); } /** @@ -277,18 +272,19 @@ static Automaton build(Iterable input, boolean asBinary, boolean caseI * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code * asBinary}. */ - static Automaton build(BytesRefIterator input, boolean asBinary, boolean caseInsensitive) + static Automaton build( + BytesRefIterator input, boolean asBinary, boolean caseInsensitive, boolean turkic) throws IOException { if (asBinary && caseInsensitive) { throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton"); } - final StringsToAutomaton builder = new StringsToAutomaton(); + final StringsToAutomaton builder = new StringsToAutomaton(caseInsensitive, turkic); for (BytesRef b = input.next(); b != null; b = input.next()) { builder.add(b, asBinary); } - return builder.completeAndConvert(caseInsensitive); + return builder.completeAndConvert(); } private void add(BytesRef current, boolean asBinary) { @@ -321,6 +317,10 @@ private void add(BytesRef current, boolean asBinary) { } else { while (pos < max) { codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint); + if (caseInsensitive + && codePoint.codePoint != CaseFolding.foldCase(codePoint.codePoint, turkic)) { + throw new IllegalArgumentException("Case-insensitive input must be lower-case"); + } next = state.lastChild(codePoint.codePoint); if (next == null) { break; diff --git 
a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index 5316f594a910..82fe68087f2f 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -27,6 +27,7 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; @@ -43,7 +44,7 @@ public void testBasic() throws Exception { List terms = basicTerms(); Collections.sort(terms); - Automaton a = build(terms, false, false); + Automaton a = build(terms, false); checkAutomaton(terms, a, false); checkMinimized(a); } @@ -52,16 +53,27 @@ public void testCaseInsensitive() throws Exception { List terms = basicTerms(); Collections.sort(terms); - Automaton a = build(terms, false, true); + Automaton a = buildCaseInsensitive(terms, false); checkAutomaton(terms, a, false, true); checkMinimized(a); } + public void testCornerCase() throws Exception { + List terms = + Stream.of("aib", "aıc") + .map(LuceneTestCase::newBytesRef) + .sorted() + .collect(Collectors.toCollection(ArrayList::new)); + Automaton a = buildCaseInsensitive(terms, true); + System.out.println(a.toDot()); + assertTrue(a.isDeterministic()); + } + public void testBasicBinary() throws Exception { List terms = basicTerms(); Collections.sort(terms); - Automaton a = build(terms, true, false); + Automaton a = build(terms, true); checkAutomaton(terms, a, true); checkMinimized(a); } @@ -89,7 +101,7 @@ public void testRandomMinimized() throws Exception { Automaton expected = MinimizationOperations.minimize( Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); - Automaton actual = build(sortedTerms, 
buildBinary, false); + Automaton actual = build(sortedTerms, buildBinary); assertSameAutomaton(expected, actual); } } @@ -100,31 +112,22 @@ public void testRandomMinimizedCaseInsensitive() throws Exception { int size = random().nextInt(2, 50); Set terms = new HashSet<>(); List automatonList = new ArrayList<>(size); + boolean turkic = random().nextBoolean(); for (int j = 0; j < size; j++) { String s = TestUtil.randomRealisticUnicodeString(random(), 8); + int[] lowercased = s.codePoints().map(c -> CaseFolding.foldCase(c, turkic)).toArray(); + s = new String(lowercased, 0, lowercased.length); terms.add(newBytesRef(s)); List charAutomata = s.codePoints() .mapToObj( c -> { - List caseAutomata = new ArrayList<>(); - caseAutomata.add(Automata.makeChar(c)); - int[] alternates = CaseFolding.lookupAlternates(c); - if (alternates != null) { - for (int alt : alternates) { - caseAutomata.add(Automata.makeChar(alt)); - } - } else { - int altCase = - Character.isLowerCase(c) - ? Character.toUpperCase(c) - : Character.toLowerCase(c); - - if (altCase != c) { - caseAutomata.add(Automata.makeChar(altCase)); - } + Automaton a = Automata.makeChar(c); + int altCase = CaseFolding.upperCase(c, turkic); + if (altCase != c) { + return Operations.union(List.of(a, Automata.makeChar(altCase))); } - return Operations.union(caseAutomata); + return a; }) .collect(Collectors.toList()); if (charAutomata.isEmpty()) { @@ -138,7 +141,7 @@ public void testRandomMinimizedCaseInsensitive() throws Exception { Automaton expected = MinimizationOperations.minimize( Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); - Automaton actual = build(sortedTerms, false, true); + Automaton actual = buildCaseInsensitive(sortedTerms, turkic); assertSameAutomaton(expected, actual); } } @@ -157,7 +160,7 @@ public void testLargeTerms() throws Exception { IllegalArgumentException e = expectThrows( IllegalArgumentException.class, - () -> build(Collections.singleton(new BytesRef(b10k)), false, 
false)); + () -> build(Collections.singleton(new BytesRef(b10k)), false)); assertTrue( e.getMessage() .startsWith( @@ -166,7 +169,7 @@ public void testLargeTerms() throws Exception { + " UTF-8 bytes")); byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000); - build(Collections.singleton(new BytesRef(b1k)), false, false); // no exception + build(Collections.singleton(new BytesRef(b1k)), false); // no exception } private void testRandom(boolean allowBinary) throws Exception { @@ -184,7 +187,7 @@ private void testRandom(boolean allowBinary) throws Exception { } List sorted = terms.stream().sorted().toList(); - Automaton a = build(sorted, allowBinary, false); + Automaton a = build(sorted, allowBinary); checkAutomaton(sorted, a, allowBinary); } } @@ -238,12 +241,20 @@ private List basicTerms() { return terms; } - private Automaton build(Collection terms, boolean asBinary, boolean caseInsensitive) + private Automaton build(Collection terms, boolean asBinary) throws IOException { + if (random().nextBoolean()) { + return StringsToAutomaton.build(terms, asBinary, false, false); + } else { + return StringsToAutomaton.build(new TermIterator(terms), asBinary, false, false); + } + } + + private Automaton buildCaseInsensitive(Collection terms, boolean turkic) throws IOException { if (random().nextBoolean()) { - return StringsToAutomaton.build(terms, asBinary, caseInsensitive); + return StringsToAutomaton.build(terms, false, true, turkic); } else { - return StringsToAutomaton.build(new TermIterator(terms), asBinary, caseInsensitive); + return StringsToAutomaton.build(new TermIterator(terms), false, true, turkic); } }