Skip to content

Commit 14802c6

Browse files
committed
[DRAFT] Case-insensitive matching over union of strings
1 parent 804b5a1 commit 14802c6

File tree

4 files changed

+131
-28
lines changed

4 files changed

+131
-28
lines changed

lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -608,7 +608,23 @@ public static Automaton makeStringUnion(Iterable<BytesRef> utf8Strings) {
608608
if (utf8Strings.iterator().hasNext() == false) {
609609
return makeEmpty();
610610
} else {
611-
return StringsToAutomaton.build(utf8Strings, false);
611+
return StringsToAutomaton.build(utf8Strings, false, false);
612+
}
613+
}
614+
615+
/**
616+
* Returns a new (deterministic and minimal) automaton that accepts the union of the given
617+
* collection of {@link BytesRef}s representing UTF-8 encoded strings, matched case-insensitively.
618+
*
619+
* @param utf8Strings The input strings, UTF-8 encoded. The collection must be in sorted order.
620+
* @return An {@link Automaton} accepting all input strings and their case variants. The resulting
621+
* automaton is codepoint based (full unicode codepoints on transitions).
622+
*/
623+
public static Automaton makeCaseInsensitiveStringUnion(Iterable<BytesRef> utf8Strings) {
624+
if (utf8Strings.iterator().hasNext() == false) {
625+
return makeEmpty();
626+
} else {
627+
return StringsToAutomaton.build(utf8Strings, false, true);
612628
}
613629
}
614630

@@ -625,7 +641,7 @@ public static Automaton makeBinaryStringUnion(Iterable<BytesRef> utf8Strings) {
625641
if (utf8Strings.iterator().hasNext() == false) {
626642
return makeEmpty();
627643
} else {
628-
return StringsToAutomaton.build(utf8Strings, true);
644+
return StringsToAutomaton.build(utf8Strings, true, false);
629645
}
630646
}
631647

@@ -638,7 +654,7 @@ public static Automaton makeBinaryStringUnion(Iterable<BytesRef> utf8Strings) {
638654
* based (full unicode codepoints on transitions).
639655
*/
640656
public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOException {
641-
return StringsToAutomaton.build(utf8Strings, false);
657+
return StringsToAutomaton.build(utf8Strings, false, false);
642658
}
643659

644660
/**
@@ -651,6 +667,6 @@ public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOE
651667
* based (UTF-8 encoded byte transition labels).
652668
*/
653669
public static Automaton makeBinaryStringUnion(BytesRefIterator utf8Strings) throws IOException {
654-
return StringsToAutomaton.build(utf8Strings, true);
670+
return StringsToAutomaton.build(utf8Strings, true, false);
655671
}
656672
}

lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -758,7 +758,7 @@ private Automaton toAutomaton(
758758
* @param codepoint the Character code point to encode as an Automaton
759759
* @return the original codepoint and the set of alternates
760760
*/
761-
private int[] toCaseInsensitiveChar(int codepoint) {
761+
static int[] toCaseInsensitiveChar(int codepoint) {
762762
int[] altCodepoints = CaseFolding.lookupAlternates(codepoint);
763763
if (altCodepoints != null) {
764764
int[] concat = new int[altCodepoints.length + 1];

lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ private boolean setPrevious(BytesRef current) {
195195

196196
/** Internal recursive traversal for conversion. */
197197
private static int convert(
198-
Automaton.Builder a, State s, IdentityHashMap<State, Integer> visited) {
198+
Automaton.Builder a, State s, IdentityHashMap<State, Integer> visited, boolean caseInsensitive) {
199199

200200
Integer converted = visited.get(s);
201201
if (converted != null) {
@@ -209,7 +209,25 @@ private static int convert(
209209
int i = 0;
210210
int[] labels = s.labels;
211211
for (StringsToAutomaton.State target : s.states) {
212-
a.addTransition(converted, convert(a, target, visited), labels[i++]);
212+
int label = labels[i++];
213+
int dest = convert(a, target, visited, caseInsensitive);
214+
a.addTransition(converted, dest, label);
215+
if (caseInsensitive) {
216+
int[] alternatives = CaseFolding.lookupAlternates(label);
217+
if (alternatives != null) {
218+
for (int alt : alternatives) {
219+
a.addTransition(converted, dest, alt);
220+
}
221+
} else {
222+
int altCase =
223+
Character.isLowerCase(label)
224+
? Character.toUpperCase(label)
225+
: Character.toLowerCase(label);
226+
if (altCase != label) {
227+
a.addTransition(converted, dest, altCase);
228+
}
229+
}
230+
}
213231
}
214232

215233
return converted;
@@ -219,15 +237,15 @@ private static int convert(
219237
* Called after adding all terms. Performs final minimization and converts to a standard {@link
220238
* Automaton} instance.
221239
*/
222-
private Automaton completeAndConvert() {
240+
private Automaton completeAndConvert(boolean caseInsensitive) {
223241
// Final minimization:
224242
if (this.stateRegistry == null) throw new IllegalStateException();
225243
if (root.hasChildren()) replaceOrRegister(root);
226244
stateRegistry = null;
227245

228246
// Convert:
229247
Automaton.Builder a = new Automaton.Builder();
230-
convert(a, root, new IdentityHashMap<>());
248+
convert(a, root, new IdentityHashMap<>(), caseInsensitive);
231249
return a.finish();
232250
}
233251

@@ -237,14 +255,17 @@ private Automaton completeAndConvert() {
237255
* UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
238256
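* asBinary}. When {@code caseInsensitive} is true, case variants of each transition label are also
* accepted; combining it with {@code asBinary} throws {@link IllegalArgumentException}.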
239257
*/
240-
static Automaton build(Iterable<BytesRef> input, boolean asBinary) {
258+
static Automaton build(Iterable<BytesRef> input, boolean asBinary, boolean caseInsensitive) {
259+
if (asBinary && caseInsensitive) {
260+
throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton");
261+
}
241262
final StringsToAutomaton builder = new StringsToAutomaton();
242263

243264
for (BytesRef b : input) {
244265
builder.add(b, asBinary);
245266
}
246267

247-
return builder.completeAndConvert();
268+
return builder.completeAndConvert(caseInsensitive);
248269
}
249270

250271
/**
@@ -253,14 +274,17 @@ static Automaton build(Iterable<BytesRef> input, boolean asBinary) {
253274
* UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
254275
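* asBinary}. When {@code caseInsensitive} is true, case variants of each transition label are also
* accepted; combining it with {@code asBinary} throws {@link IllegalArgumentException}.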
255276
*/
256-
static Automaton build(BytesRefIterator input, boolean asBinary) throws IOException {
277+
static Automaton build(BytesRefIterator input, boolean asBinary, boolean caseInsensitive) throws IOException {
278+
if (asBinary && caseInsensitive) {
279+
throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton");
280+
}
257281
final StringsToAutomaton builder = new StringsToAutomaton();
258282

259283
for (BytesRef b = input.next(); b != null; b = input.next()) {
260284
builder.add(b, asBinary);
261285
}
262286

263-
return builder.completeAndConvert();
287+
return builder.completeAndConvert(caseInsensitive);
264288
}
265289

266290
private void add(BytesRef current, boolean asBinary) {

lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java

Lines changed: 78 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@
2525
import java.util.HashSet;
2626
import java.util.Iterator;
2727
import java.util.List;
28+
import java.util.Locale;
2829
import java.util.Set;
30+
import java.util.stream.Collectors;
31+
2932
import org.apache.lucene.tests.util.LuceneTestCase;
3033
import org.apache.lucene.tests.util.TestUtil;
3134
import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
@@ -42,16 +45,25 @@ public void testBasic() throws Exception {
4245
List<BytesRef> terms = basicTerms();
4346
Collections.sort(terms);
4447

45-
Automaton a = build(terms, false);
48+
Automaton a = build(terms, false, false);
4649
checkAutomaton(terms, a, false);
4750
checkMinimized(a);
4851
}
4952

53+
public void testCaseInsensitive() throws Exception {
54+
List<BytesRef> terms = basicTerms();
55+
Collections.sort(terms);
56+
57+
Automaton a = build(terms, false, true);
58+
checkAutomaton(terms, a, false, true);
59+
checkMinimized(a);
60+
}
61+
5062
public void testBasicBinary() throws Exception {
5163
List<BytesRef> terms = basicTerms();
5264
Collections.sort(terms);
5365

54-
Automaton a = build(terms, true);
66+
Automaton a = build(terms, true, false);
5567
checkAutomaton(terms, a, true);
5668
checkMinimized(a);
5769
}
@@ -79,7 +91,52 @@ public void testRandomMinimized() throws Exception {
7991
Automaton expected =
8092
MinimizationOperations.minimize(
8193
Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
82-
Automaton actual = build(sortedTerms, buildBinary);
94+
Automaton actual = build(sortedTerms, buildBinary, false);
95+
assertSameAutomaton(expected, actual);
96+
}
97+
}
98+
99+
public void testRandomMinimizedCaseInsensitive() throws Exception {
100+
int iters = RandomizedTest.isNightly() ? 20 : 5;
101+
for (int i = 0; i < iters; i++) {
102+
int size = random().nextInt(2, 50);
103+
Set<BytesRef> terms = new HashSet<>();
104+
List<Automaton> automatonList = new ArrayList<>(size);
105+
for (int j = 0; j < size; j++) {
106+
String s = TestUtil.randomRealisticUnicodeString(random(), 8);
107+
terms.add(newBytesRef(s));
108+
List<Automaton> charAutomata = s.codePoints().mapToObj(c -> {
109+
List<Automaton> caseAutomata = new ArrayList<>();
110+
caseAutomata.add(Automata.makeChar(c));
111+
int[] alternates = CaseFolding.lookupAlternates(c);
112+
if (alternates != null) {
113+
for (int alt : alternates) {
114+
caseAutomata.add(Automata.makeChar(alt));
115+
}
116+
} else {
117+
int altCase =
118+
Character.isLowerCase(c)
119+
? Character.toUpperCase(c)
120+
: Character.toLowerCase(c);
121+
122+
if (altCase != c) {
123+
caseAutomata.add(Automata.makeChar(altCase));
124+
}
125+
}
126+
return Operations.union(caseAutomata);
127+
}).collect(Collectors.toList());
128+
if (charAutomata.isEmpty()) {
129+
automatonList.add(Automata.makeEmptyString());
130+
} else {
131+
automatonList.add(Operations.concatenate(charAutomata));
132+
}
133+
}
134+
List<BytesRef> sortedTerms = terms.stream().sorted().toList();
135+
136+
Automaton expected =
137+
MinimizationOperations.minimize(
138+
Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
139+
Automaton actual = build(sortedTerms, false, true);
83140
assertSameAutomaton(expected, actual);
84141
}
85142
}
@@ -98,7 +155,7 @@ public void testLargeTerms() throws Exception {
98155
IllegalArgumentException e =
99156
expectThrows(
100157
IllegalArgumentException.class,
101-
() -> build(Collections.singleton(new BytesRef(b10k)), false));
158+
() -> build(Collections.singleton(new BytesRef(b10k)), false, false));
102159
assertTrue(
103160
e.getMessage()
104161
.startsWith(
@@ -107,7 +164,7 @@ public void testLargeTerms() throws Exception {
107164
+ " UTF-8 bytes"));
108165

109166
byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);
110-
build(Collections.singleton(new BytesRef(b1k)), false); // no exception
167+
build(Collections.singleton(new BytesRef(b1k)), false, false); // no exception
111168
}
112169

113170
private void testRandom(boolean allowBinary) throws Exception {
@@ -125,12 +182,16 @@ private void testRandom(boolean allowBinary) throws Exception {
125182
}
126183

127184
List<BytesRef> sorted = terms.stream().sorted().toList();
128-
Automaton a = build(sorted, allowBinary);
185+
Automaton a = build(sorted, allowBinary, false);
129186
checkAutomaton(sorted, a, allowBinary);
130187
}
131188
}
132189

133190
private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBinary) {
191+
checkAutomaton(expected, a, isBinary, false);
192+
}
193+
194+
private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBinary, boolean caseInsensitive) {
134195
CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary);
135196
ByteRunAutomaton runAutomaton = c.runAutomaton;
136197

@@ -141,12 +202,14 @@ private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBina
141202
readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length));
142203
}
143204

144-
// Make sure every term produced by the automaton is expected
145-
BytesRefBuilder scratch = new BytesRefBuilder();
146-
FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
147-
for (IntsRef r = it.next(); r != null; r = it.next()) {
148-
BytesRef t = Util.toBytesRef(r, scratch);
149-
assertTrue(expected.contains(t));
205+
if (caseInsensitive == false) {
206+
// Make sure every term produced by the automaton is expected
207+
BytesRefBuilder scratch = new BytesRefBuilder();
208+
FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
209+
for (IntsRef r = it.next(); r != null; r = it.next()) {
210+
BytesRef t = Util.toBytesRef(r, scratch);
211+
assertTrue(expected.contains(t));
212+
}
150213
}
151214
}
152215

@@ -172,11 +235,11 @@ private List<BytesRef> basicTerms() {
172235
return terms;
173236
}
174237

175-
private Automaton build(Collection<BytesRef> terms, boolean asBinary) throws IOException {
238+
private Automaton build(Collection<BytesRef> terms, boolean asBinary, boolean caseInsensitive) throws IOException {
176239
if (random().nextBoolean()) {
177-
return StringsToAutomaton.build(terms, asBinary);
240+
return StringsToAutomaton.build(terms, asBinary, caseInsensitive);
178241
} else {
179-
return StringsToAutomaton.build(new TermIterator(terms), asBinary);
242+
return StringsToAutomaton.build(new TermIterator(terms), asBinary, caseInsensitive);
180243
}
181244
}
182245

0 commit comments

Comments
 (0)