apache · msfroh · Mar 13, 2025 · Mar 18, 2025 · dweiss · Mar 18, 2025
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
@@ -608,7 +608,24 @@ public static Automaton makeStringUnion(Iterable<BytesRef> utf8Strings) {
     if (utf8Strings.iterator().hasNext() == false) {
       return makeEmpty();
     } else {
-      return StringsToAutomaton.build(utf8Strings, false);
+      return StringsToAutomaton.build(utf8Strings, false, false, false);
+    }
+  }
+
+  /**
+   * Returns a new automaton that accepts the union of the given UTF-8 encoded {@link BytesRef}s,
+   * with each code point matching itself or its upper-case form. Input must be lower-case.
+   *
+   * @param utf8Strings The input strings, UTF-8 encoded. The collection must be in sorted order.
+   * @param turkic If true, use the tr/az case rules (i/İ and ı/I) instead of the default ones.
+   * @return A codepoint based {@link Automaton} accepting the inputs and upper-case variants.
+   */
+  public static Automaton makeCaseInsensitiveStringUnion(
+      Iterable<BytesRef> utf8Strings, boolean turkic) {
+    if (utf8Strings.iterator().hasNext() == false) {
+      return makeEmpty();
+    } else {
+      return StringsToAutomaton.build(utf8Strings, false, true, turkic);
     }
   }
 
@@ -625,7 +642,7 @@ public static Automaton makeBinaryStringUnion(Iterable<BytesRef> utf8Strings) {
     if (utf8Strings.iterator().hasNext() == false) {
       return makeEmpty();
     } else {
-      return StringsToAutomaton.build(utf8Strings, true);
+      return StringsToAutomaton.build(utf8Strings, true, false, false);
     }
   }
 
@@ -638,7 +655,7 @@ public static Automaton makeBinaryStringUnion(Iterable<BytesRef> utf8Strings) {
    *     based (full unicode codepoints on transitions).
    */
   public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOException {
-    return StringsToAutomaton.build(utf8Strings, false);
+    return StringsToAutomaton.build(utf8Strings, false, false, false);
   }
 
   /**
@@ -651,6 +668,6 @@ public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOE
    *     based (UTF-8 encoded byte transition labels).
    */
   public static Automaton makeBinaryStringUnion(BytesRefIterator utf8Strings) throws IOException {
-    return StringsToAutomaton.build(utf8Strings, true);
+    return StringsToAutomaton.build(utf8Strings, true, false, false);
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java
@@ -743,4 +743,42 @@ static int[] lookupAlternates(int codepoint) {
 
     return alts;
   }
+
+  /**
+   * Folds the case of the given character according to {@link Character#toLowerCase(int)}, except
+   * that with the turkic flag set İ (U+0130) folds to i and I folds to ı (U+0131).
+   *
+   * @param codepoint the code point of the character to fold
+   * @param turkic if true, then apply tr/az folding rules
+   * @return the folded code point
+   */
+  static int foldCase(int codepoint, boolean turkic) {
+    if (turkic) {
+      if (codepoint == 0x00130) { // İ [LATIN CAPITAL LETTER I WITH DOT ABOVE]
+        return 0x00069; // i [LATIN SMALL LETTER I]
+      } else if (codepoint == 0x000049) { // I [LATIN CAPITAL LETTER I]
+        return 0x00131; // ı [LATIN SMALL LETTER DOTLESS I]
+      }
+    }
+    return Character.toLowerCase(codepoint);
+  }
+
+  /**
+   * Converts the given character to upper case according to {@link Character#toUpperCase(int)},
+   * except that with the turkic flag set i maps to İ (U+0130) and ı (U+0131) maps to I.
+   *
+   * @param codepoint the code point of the character to convert to upper case
+   * @param turkic if true, then apply tr/az folding rules
+   * @return the upper case code point
+   */
+  static int upperCase(int codepoint, boolean turkic) {
+    if (turkic) {
+      if (codepoint == 0x00069) { // i [LATIN SMALL LETTER I]
+        return 0x00130; // İ [LATIN CAPITAL LETTER I WITH DOT ABOVE]
+      } else if (codepoint == 0x00131) { // ı [LATIN SMALL LETTER DOTLESS I]
+        return 0x000049; // I [LATIN CAPITAL LETTER I]
+      }
+    }
+    return Character.toUpperCase(codepoint);
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -758,7 +758,7 @@ private Automaton toAutomaton(
    * @param codepoint the Character code point to encode as an Automaton
    * @return the original codepoint and the set of alternates
    */
-  private int[] toCaseInsensitiveChar(int codepoint) {
+  static int[] toCaseInsensitiveChar(int codepoint) {
     int[] altCodepoints = CaseFolding.lookupAlternates(codepoint);
     if (altCodepoints != null) {
       int[] concat = new int[altCodepoints.length + 1];

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -40,10 +40,13 @@
  * @see Automata#makeBinaryStringUnion(BytesRefIterator)
  */
 final class StringsToAutomaton {
+  private final boolean caseInsensitive;
+  private final boolean turkic;
 
   /** The default constructor is private. Use static methods directly. */
-  private StringsToAutomaton() {
-    super();
+  private StringsToAutomaton(boolean caseInsensitive, boolean turkic) {
+    this.caseInsensitive = caseInsensitive;
+    this.turkic = turkic;
   }
 
   /** DFSA state with <code>char</code> labels on transitions. */
@@ -195,7 +198,11 @@ private boolean setPrevious(BytesRef current) {
 
   /** Internal recursive traversal for conversion. */
   private static int convert(
-      Automaton.Builder a, State s, IdentityHashMap<State, Integer> visited) {
+      Automaton.Builder a,
+      State s,
+      IdentityHashMap<State, Integer> visited,
+      boolean caseInsensitive,
+      boolean turkic) {
 
     Integer converted = visited.get(s);
     if (converted != null) {
@@ -209,7 +216,15 @@ private static int convert(
     int i = 0;
     int[] labels = s.labels;
     for (StringsToAutomaton.State target : s.states) {
-      a.addTransition(converted, convert(a, target, visited), labels[i++]);
+      int label = labels[i++];
+      int dest = convert(a, target, visited, caseInsensitive, turkic);
+      a.addTransition(converted, dest, label);
+      if (caseInsensitive) {
+        int altCase = CaseFolding.upperCase(label, turkic);
+        if (altCase != label) {
+          a.addTransition(converted, dest, altCase);
+        }
+      }
     }
 
     return converted;
@@ -227,7 +242,7 @@ private Automaton completeAndConvert() {
 
     // Convert:
     Automaton.Builder a = new Automaton.Builder();
-    convert(a, root, new IdentityHashMap<>());
+    convert(a, root, new IdentityHashMap<>(), caseInsensitive, turkic);
     return a.finish();
   }
 
@@ -237,8 +252,12 @@ private Automaton completeAndConvert() {
    * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
    * asBinary}.
    */
-  static Automaton build(Iterable<BytesRef> input, boolean asBinary) {
-    final StringsToAutomaton builder = new StringsToAutomaton();
+  static Automaton build(
+      Iterable<BytesRef> input, boolean asBinary, boolean caseInsensitive, boolean turkic) {
+    if (asBinary && caseInsensitive) {
+      throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton");
+    }
+    final StringsToAutomaton builder = new StringsToAutomaton(caseInsensitive, turkic);
 
     for (BytesRef b : input) {
       builder.add(b, asBinary);
@@ -253,8 +272,13 @@ static Automaton build(Iterable<BytesRef> input, boolean asBinary) {
    * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
    * asBinary}.
    */
-  static Automaton build(BytesRefIterator input, boolean asBinary) throws IOException {
-    final StringsToAutomaton builder = new StringsToAutomaton();
+  static Automaton build(
+      BytesRefIterator input, boolean asBinary, boolean caseInsensitive, boolean turkic)
+      throws IOException {
+    if (asBinary && caseInsensitive) {
+      throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton");
+    }
+    final StringsToAutomaton builder = new StringsToAutomaton(caseInsensitive, turkic);
 
     for (BytesRef b = input.next(); b != null; b = input.next()) {
       builder.add(b, asBinary);
@@ -293,6 +317,10 @@ private void add(BytesRef current, boolean asBinary) {
     } else {
       while (pos < max) {
         codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
+        if (caseInsensitive
+            && codePoint.codePoint != CaseFolding.foldCase(codePoint.codePoint, turkic)) {
+          throw new IllegalArgumentException("Case-insensitive input must be lower-case");
+        }
         next = state.lastChild(codePoint.codePoint);
         if (next == null) {
           break;

diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -26,6 +26,8 @@
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.TestUtil;
 import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
@@ -47,6 +49,26 @@ public void testBasic() throws Exception {
     checkMinimized(a);
   }
 
+  public void testCaseInsensitive() throws Exception {
+    List<BytesRef> terms = basicTerms();
+    Collections.sort(terms);
+    // caseInsensitive=true, turkic=false: every input term itself must still be accepted.
+    Automaton a = buildCaseInsensitive(terms, false);
+    checkAutomaton(terms, a, false, true);
+    checkMinimized(a);
+  }
+
+  public void testCornerCase() throws Exception {
+    List<BytesRef> terms =
+        Stream.of("aib", "aıc")
+            .map(LuceneTestCase::newBytesRef)
+            .sorted()
+            .collect(Collectors.toCollection(ArrayList::new));
+    Automaton a = buildCaseInsensitive(terms, true);
+    // Turkic: i -> İ and ı -> I keep labels distinct. Non-Turkic i/ı (both -> I) is not covered.
+    assertTrue(a.isDeterministic());
+  }
+
   public void testBasicBinary() throws Exception {
     List<BytesRef> terms = basicTerms();
     Collections.sort(terms);
@@ -84,6 +106,46 @@ public void testRandomMinimized() throws Exception {
     }
   }
 
+  public void testRandomMinimizedCaseInsensitive() throws Exception {
+    int iters = RandomizedTest.isNightly() ? 20 : 5;
+    for (int i = 0; i < iters; i++) {
+      int size = random().nextInt(2, 50);
+      Set<BytesRef> terms = new HashSet<>();
+      List<Automaton> automatonList = new ArrayList<>(size);
+      boolean turkic = random().nextBoolean();
+      for (int j = 0; j < size; j++) {
+        String s = TestUtil.randomRealisticUnicodeString(random(), 8);
+        int[] lowercased = s.codePoints().map(c -> CaseFolding.foldCase(c, turkic)).toArray();
+        s = new String(lowercased, 0, lowercased.length);
+        terms.add(newBytesRef(s));
+        List<Automaton> charAutomata =
+            s.codePoints()
+                .mapToObj(
+                    c -> {
+                      Automaton a = Automata.makeChar(c);
+                      int altCase = CaseFolding.upperCase(c, turkic);
+                      if (altCase != c) {
+                        return Operations.union(List.of(a, Automata.makeChar(altCase)));
+                      }
+                      return a;
+                    })
+                .collect(Collectors.toList());
+        if (charAutomata.isEmpty()) {
+          automatonList.add(Automata.makeEmptyString());
+        } else {
+          automatonList.add(Operations.concatenate(charAutomata));
+        }
+      }
+      List<BytesRef> sortedTerms = terms.stream().sorted().toList();
+      // Reference: minimized union of per-term automata that accept each char or its upper case.
+      Automaton expected =
+          MinimizationOperations.minimize(
+              Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+      Automaton actual = buildCaseInsensitive(sortedTerms, turkic);
+      assertSameAutomaton(expected, actual);
+    }
+  }
+
   public void testRandomUnicodeOnly() throws Exception {
     testRandom(false);
   }
@@ -131,6 +193,11 @@ private void testRandom(boolean allowBinary) throws Exception {
   }
 
   private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBinary) {
+    checkAutomaton(expected, a, isBinary, false);
+  }
+
+  private void checkAutomaton(
+      List<BytesRef> expected, Automaton a, boolean isBinary, boolean caseInsensitive) {
     CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary);
     ByteRunAutomaton runAutomaton = c.runAutomaton;
 
@@ -141,12 +208,14 @@ private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBina
           readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length));
     }
 
-    // Make sure every term produced by the automaton is expected
-    BytesRefBuilder scratch = new BytesRefBuilder();
-    FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
-    for (IntsRef r = it.next(); r != null; r = it.next()) {
-      BytesRef t = Util.toBytesRef(r, scratch);
-      assertTrue(expected.contains(t));
+    if (caseInsensitive == false) {
+      // Make sure every term produced by the automaton is expected
+      BytesRefBuilder scratch = new BytesRefBuilder();
+      FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
+      for (IntsRef r = it.next(); r != null; r = it.next()) {
+        BytesRef t = Util.toBytesRef(r, scratch);
+        assertTrue(expected.contains(t));
+      }
     }
   }
 
@@ -174,9 +243,18 @@ private List<BytesRef> basicTerms() {
 
   private Automaton build(Collection<BytesRef> terms, boolean asBinary) throws IOException {
     if (random().nextBoolean()) {
-      return StringsToAutomaton.build(terms, asBinary);
+      return StringsToAutomaton.build(terms, asBinary, false, false);
+    } else {
+      return StringsToAutomaton.build(new TermIterator(terms), asBinary, false, false);
+    }
+  }
+
+  private Automaton buildCaseInsensitive(Collection<BytesRef> terms, boolean turkic)
+      throws IOException {
+    if (random().nextBoolean()) {
+      return StringsToAutomaton.build(terms, false, true, turkic);
     } else {
-      return StringsToAutomaton.build(new TermIterator(terms), asBinary);
+      return StringsToAutomaton.build(new TermIterator(terms), false, true, turkic);
     }
   }