From 036668ac373a4e57ba7e0c52fec4ef085d8524b3 Mon Sep 17 00:00:00 2001
From: Michael Froh <msfroh@gmail.com>
Date: Wed, 12 Mar 2025 18:06:51 -0700
Subject: [PATCH 1/2] [DRAFT] Case-insensitive matching over union of strings

---
 .../lucene/util/automaton/Automata.java       | 24 ++++-
 .../apache/lucene/util/automaton/RegExp.java  |  2 +-
 .../util/automaton/StringsToAutomaton.java    | 44 +++++++--
 .../automaton/TestStringsToAutomaton.java     | 97 ++++++++++++++++---
 4 files changed, 139 insertions(+), 28 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
index 6b9a7fcc5562..fe9a350dec1f 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
@@ -608,7 +608,23 @@ public static Automaton makeStringUnion(Iterable<BytesRef> utf8Strings) {
     if (utf8Strings.iterator().hasNext() == false) {
       return makeEmpty();
     } else {
-      return StringsToAutomaton.build(utf8Strings, false);
+      return StringsToAutomaton.build(utf8Strings, false, false);
+    }
+  }
+
+  /**
+   * Returns a new automaton that accepts the union of the given collection of {@link BytesRef}s
+   * representing UTF-8 encoded strings, matching each code point case-insensitively.
+   *
+   * @param utf8Strings The input strings, UTF-8 encoded. The collection must be in sorted order.
+   * @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint
+   *     based (full unicode codepoints on transitions).
+   */
+  public static Automaton makeCaseInsensitiveStringUnion(Iterable<BytesRef> utf8Strings) {
+    if (utf8Strings.iterator().hasNext() == false) {
+      return makeEmpty();
+    } else {
+      return StringsToAutomaton.build(utf8Strings, false, true);
     }
   }
 
@@ -625,7 +641,7 @@ public static Automaton makeBinaryStringUnion(Iterable<BytesRef> utf8Strings) {
     if (utf8Strings.iterator().hasNext() == false) {
       return makeEmpty();
     } else {
-      return StringsToAutomaton.build(utf8Strings, true);
+      return StringsToAutomaton.build(utf8Strings, true, false);
     }
   }
 
@@ -638,7 +654,7 @@ public static Automaton makeBinaryStringUnion(Iterable<BytesRef> utf8Strings) {
    *     based (full unicode codepoints on transitions).
    */
   public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOException {
-    return StringsToAutomaton.build(utf8Strings, false);
+    return StringsToAutomaton.build(utf8Strings, false, false);
   }
 
   /**
@@ -651,6 +667,6 @@ public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOE
    *     based (UTF-8 encoded byte transition labels).
    */
   public static Automaton makeBinaryStringUnion(BytesRefIterator utf8Strings) throws IOException {
-    return StringsToAutomaton.build(utf8Strings, true);
+    return StringsToAutomaton.build(utf8Strings, true, false);
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
index 342efe059786..7fe75d6925e5 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -758,7 +758,7 @@ private Automaton toAutomaton(
    * @param codepoint the Character code point to encode as an Automaton
    * @return the original codepoint and the set of alternates
    */
-  private int[] toCaseInsensitiveChar(int codepoint) {
+  static int[] toCaseInsensitiveChar(int codepoint) {
     int[] altCodepoints = CaseFolding.lookupAlternates(codepoint);
     if (altCodepoints != null) {
       int[] concat = new int[altCodepoints.length + 1];
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
index 58a081fa6a21..b66ad2793e52 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -195,7 +195,10 @@ private boolean setPrevious(BytesRef current) {
 
   /** Internal recursive traversal for conversion. */
   private static int convert(
-      Automaton.Builder a, State s, IdentityHashMap<State, Integer> visited) {
+      Automaton.Builder a,
+      State s,
+      IdentityHashMap<State, Integer> visited,
+      boolean caseInsensitive) {
 
     Integer converted = visited.get(s);
     if (converted != null) {
@@ -209,7 +212,25 @@ private static int convert(
     int i = 0;
     int[] labels = s.labels;
     for (StringsToAutomaton.State target : s.states) {
-      a.addTransition(converted, convert(a, target, visited), labels[i++]);
+      int label = labels[i++];
+      int dest = convert(a, target, visited, caseInsensitive);
+      a.addTransition(converted, dest, label);
+      if (caseInsensitive) {
+        int[] alternatives = CaseFolding.lookupAlternates(label);
+        if (alternatives != null) {
+          for (int alt : alternatives) {
+            a.addTransition(converted, dest, alt);
+          }
+        } else {
+          int altCase =
+              Character.isLowerCase(label)
+                  ? Character.toUpperCase(label)
+                  : Character.toLowerCase(label);
+          if (altCase != label) {
+            a.addTransition(converted, dest, altCase);
+          }
+        }
+      }
     }
 
     return converted;
@@ -219,7 +240,7 @@ private static int convert(
    * Called after adding all terms. Performs final minimization and converts to a standard {@link
    * Automaton} instance.
    */
-  private Automaton completeAndConvert() {
+  private Automaton completeAndConvert(boolean caseInsensitive) {
     // Final minimization:
     if (this.stateRegistry == null) throw new IllegalStateException();
     if (root.hasChildren()) replaceOrRegister(root);
@@ -227,7 +248,7 @@ private Automaton completeAndConvert() {
 
     // Convert:
     Automaton.Builder a = new Automaton.Builder();
-    convert(a, root, new IdentityHashMap<>());
+    convert(a, root, new IdentityHashMap<>(), caseInsensitive);
     return a.finish();
   }
 
@@ -237,14 +258,17 @@ private Automaton completeAndConvert() {
    * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
    * asBinary}.
    */
-  static Automaton build(Iterable<BytesRef> input, boolean asBinary) {
+  static Automaton build(Iterable<BytesRef> input, boolean asBinary, boolean caseInsensitive) {
+    if (asBinary && caseInsensitive) {
+      throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton");
+    }
     final StringsToAutomaton builder = new StringsToAutomaton();
 
     for (BytesRef b : input) {
       builder.add(b, asBinary);
     }
 
-    return builder.completeAndConvert();
+    return builder.completeAndConvert(caseInsensitive);
   }
 
   /**
@@ -253,14 +277,18 @@ static Automaton build(Iterable<BytesRef> input, boolean asBinary) {
    * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
    * asBinary}.
    */
-  static Automaton build(BytesRefIterator input, boolean asBinary) throws IOException {
+  static Automaton build(BytesRefIterator input, boolean asBinary, boolean caseInsensitive)
+      throws IOException {
+    if (asBinary && caseInsensitive) {
+      throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton");
+    }
     final StringsToAutomaton builder = new StringsToAutomaton();
 
     for (BytesRef b = input.next(); b != null; b = input.next()) {
       builder.add(b, asBinary);
     }
 
-    return builder.completeAndConvert();
+    return builder.completeAndConvert(caseInsensitive);
   }
 
   private void add(BytesRef current, boolean asBinary) {
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
index efaa451258bb..5316f594a910 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -26,6 +26,7 @@
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
+import java.util.stream.Collectors;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.TestUtil;
 import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
@@ -42,16 +43,25 @@ public void testBasic() throws Exception {
     List<BytesRef> terms = basicTerms();
     Collections.sort(terms);
 
-    Automaton a = build(terms, false);
+    Automaton a = build(terms, false, false);
     checkAutomaton(terms, a, false);
     checkMinimized(a);
   }
 
+  public void testCaseInsensitive() throws Exception {
+    List<BytesRef> terms = basicTerms();
+    Collections.sort(terms);
+
+    Automaton a = build(terms, false, true);
+    checkAutomaton(terms, a, false, true);
+    checkMinimized(a);
+  }
+
   public void testBasicBinary() throws Exception {
     List<BytesRef> terms = basicTerms();
     Collections.sort(terms);
 
-    Automaton a = build(terms, true);
+    Automaton a = build(terms, true, false);
     checkAutomaton(terms, a, true);
     checkMinimized(a);
   }
@@ -79,7 +89,56 @@ public void testRandomMinimized() throws Exception {
       Automaton expected =
           MinimizationOperations.minimize(
               Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
-      Automaton actual = build(sortedTerms, buildBinary);
+      Automaton actual = build(sortedTerms, buildBinary, false);
+      assertSameAutomaton(expected, actual);
+    }
+  }
+
+  public void testRandomMinimizedCaseInsensitive() throws Exception {
+    int iters = RandomizedTest.isNightly() ? 20 : 5;
+    for (int i = 0; i < iters; i++) {
+      int size = random().nextInt(2, 50);
+      Set<BytesRef> terms = new HashSet<>();
+      List<Automaton> automatonList = new ArrayList<>(size);
+      for (int j = 0; j < size; j++) {
+        String s = TestUtil.randomRealisticUnicodeString(random(), 8);
+        terms.add(newBytesRef(s));
+        List<Automaton> charAutomata =
+            s.codePoints()
+                .mapToObj(
+                    c -> {
+                      List<Automaton> caseAutomata = new ArrayList<>();
+                      caseAutomata.add(Automata.makeChar(c));
+                      int[] alternates = CaseFolding.lookupAlternates(c);
+                      if (alternates != null) {
+                        for (int alt : alternates) {
+                          caseAutomata.add(Automata.makeChar(alt));
+                        }
+                      } else {
+                        int altCase =
+                            Character.isLowerCase(c)
+                                ? Character.toUpperCase(c)
+                                : Character.toLowerCase(c);
+
+                        if (altCase != c) {
+                          caseAutomata.add(Automata.makeChar(altCase));
+                        }
+                      }
+                      return Operations.union(caseAutomata);
+                    })
+                .collect(Collectors.toList());
+        if (charAutomata.isEmpty()) {
+          automatonList.add(Automata.makeEmptyString());
+        } else {
+          automatonList.add(Operations.concatenate(charAutomata));
+        }
+      }
+      List<BytesRef> sortedTerms = terms.stream().sorted().toList();
+
+      Automaton expected =
+          MinimizationOperations.minimize(
+              Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+      Automaton actual = build(sortedTerms, false, true);
       assertSameAutomaton(expected, actual);
     }
   }
@@ -98,7 +157,7 @@ public void testLargeTerms() throws Exception {
     IllegalArgumentException e =
         expectThrows(
             IllegalArgumentException.class,
-            () -> build(Collections.singleton(new BytesRef(b10k)), false));
+            () -> build(Collections.singleton(new BytesRef(b10k)), false, false));
     assertTrue(
         e.getMessage()
             .startsWith(
@@ -107,7 +166,7 @@ public void testLargeTerms() throws Exception {
                     + " UTF-8 bytes"));
 
     byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);
-    build(Collections.singleton(new BytesRef(b1k)), false); // no exception
+    build(Collections.singleton(new BytesRef(b1k)), false, false); // no exception
   }
 
   private void testRandom(boolean allowBinary) throws Exception {
@@ -125,12 +184,17 @@ private void testRandom(boolean allowBinary) throws Exception {
       }
 
       List<BytesRef> sorted = terms.stream().sorted().toList();
-      Automaton a = build(sorted, allowBinary);
+      Automaton a = build(sorted, allowBinary, false);
       checkAutomaton(sorted, a, allowBinary);
     }
   }
 
   private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBinary) {
+    checkAutomaton(expected, a, isBinary, false);
+  }
+
+  private void checkAutomaton(
+      List<BytesRef> expected, Automaton a, boolean isBinary, boolean caseInsensitive) {
     CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary);
     ByteRunAutomaton runAutomaton = c.runAutomaton;
 
@@ -141,12 +205,14 @@ private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBina
           readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length));
     }
 
-    // Make sure every term produced by the automaton is expected
-    BytesRefBuilder scratch = new BytesRefBuilder();
-    FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
-    for (IntsRef r = it.next(); r != null; r = it.next()) {
-      BytesRef t = Util.toBytesRef(r, scratch);
-      assertTrue(expected.contains(t));
+    if (caseInsensitive == false) {
+      // Make sure every term produced by the automaton is expected
+      BytesRefBuilder scratch = new BytesRefBuilder();
+      FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
+      for (IntsRef r = it.next(); r != null; r = it.next()) {
+        BytesRef t = Util.toBytesRef(r, scratch);
+        assertTrue(expected.contains(t));
+      }
     }
   }
 
@@ -172,11 +238,12 @@ private List<BytesRef> basicTerms() {
     return terms;
   }
 
-  private Automaton build(Collection<BytesRef> terms, boolean asBinary) throws IOException {
+  private Automaton build(Collection<BytesRef> terms, boolean asBinary, boolean caseInsensitive)
+      throws IOException {
     if (random().nextBoolean()) {
-      return StringsToAutomaton.build(terms, asBinary);
+      return StringsToAutomaton.build(terms, asBinary, caseInsensitive);
     } else {
-      return StringsToAutomaton.build(new TermIterator(terms), asBinary);
+      return StringsToAutomaton.build(new TermIterator(terms), asBinary, caseInsensitive);
     }
   }
 

From b39474f09aa8e5c933c142ad4438227f6e735f65 Mon Sep 17 00:00:00 2001
From: Michael Froh <msfroh@gmail.com>
Date: Mon, 17 Mar 2025 17:57:19 -0700
Subject: [PATCH 2/2] Only accept lowercase input, generate transitions for
 uppercase

---
 .../lucene/util/automaton/Automata.java       | 13 ++--
 .../lucene/util/automaton/CaseFolding.java    | 38 +++++++++++
 .../util/automaton/StringsToAutomaton.java    | 50 +++++++-------
 .../automaton/TestStringsToAutomaton.java     | 67 +++++++++++--------
 4 files changed, 109 insertions(+), 59 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
index fe9a350dec1f..9994eb3c6282 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
@@ -608,7 +608,7 @@ public static Automaton makeStringUnion(Iterable<BytesRef> utf8Strings) {
     if (utf8Strings.iterator().hasNext() == false) {
       return makeEmpty();
     } else {
-      return StringsToAutomaton.build(utf8Strings, false, false);
+      return StringsToAutomaton.build(utf8Strings, false, false, false);
     }
   }
 
@@ -620,11 +620,12 @@ public static Automaton makeStringUnion(Iterable<BytesRef> utf8Strings) {
    * @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint
    *     based (full unicode codepoints on transitions).
    */
-  public static Automaton makeCaseInsensitiveStringUnion(Iterable<BytesRef> utf8Strings) {
+  public static Automaton makeCaseInsensitiveStringUnion(
+      Iterable<BytesRef> utf8Strings, boolean turkic) {
     if (utf8Strings.iterator().hasNext() == false) {
       return makeEmpty();
     } else {
-      return StringsToAutomaton.build(utf8Strings, false, true);
+      return StringsToAutomaton.build(utf8Strings, false, true, turkic);
     }
   }
 
@@ -641,7 +642,7 @@ public static Automaton makeBinaryStringUnion(Iterable<BytesRef> utf8Strings) {
     if (utf8Strings.iterator().hasNext() == false) {
       return makeEmpty();
     } else {
-      return StringsToAutomaton.build(utf8Strings, true, false);
+      return StringsToAutomaton.build(utf8Strings, true, false, false);
     }
   }
 
@@ -654,7 +655,7 @@ public static Automaton makeBinaryStringUnion(Iterable<BytesRef> utf8Strings) {
    *     based (full unicode codepoints on transitions).
    */
   public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOException {
-    return StringsToAutomaton.build(utf8Strings, false, false);
+    return StringsToAutomaton.build(utf8Strings, false, false, false);
   }
 
   /**
@@ -667,6 +668,6 @@ public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOE
    *     based (UTF-8 encoded byte transition labels).
    */
   public static Automaton makeBinaryStringUnion(BytesRefIterator utf8Strings) throws IOException {
-    return StringsToAutomaton.build(utf8Strings, true, false);
+    return StringsToAutomaton.build(utf8Strings, true, false, false);
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java
index dba1e6438e23..988ae6c3f681 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java
@@ -743,4 +743,42 @@ static int[] lookupAlternates(int codepoint) {
 
     return alts;
   }
+
+  /**
+   * Folds the case of the given character according to {@link Character#toLowerCase(int)}, except
+   * that when the turkic flag is set, İ (U+0130) folds to i and I folds to dotless ı (U+0131).
+   *
+   * @param codepoint the code point for the character to fold
+   * @param turkic if true, then apply tr/az folding rules
+   * @return the folded character
+   */
+  static int foldCase(int codepoint, boolean turkic) {
+    if (turkic) {
+      if (codepoint == 0x00130) { // İ [LATIN CAPITAL LETTER I WITH DOT ABOVE]
+        return 0x00069; // i [LATIN SMALL LETTER I]
+      } else if (codepoint == 0x000049) { // I [LATIN CAPITAL LETTER I]
+        return 0x00131; // ı [LATIN SMALL LETTER DOTLESS I]
+      }
+    }
+    return Character.toLowerCase(codepoint);
+  }
+
+  /**
+   * Converts the given character to upper case according to {@link Character#toUpperCase(int)},
+   * except that when the turkic flag is set, i maps to İ (U+0130) and dotless ı (U+0131) maps to I.
+   *
+   * @param codepoint the code point for the character to convert to upper case
+   * @param turkic if true, then apply tr/az folding rules
+   * @return the upper case character
+   */
+  static int upperCase(int codepoint, boolean turkic) {
+    if (turkic) {
+      if (codepoint == 0x00069) { // i [LATIN SMALL LETTER I]
+        return 0x00130; // İ [LATIN CAPITAL LETTER I WITH DOT ABOVE]
+      } else if (codepoint == 0x00131) { // ı [LATIN SMALL LETTER DOTLESS I]
+        return 0x000049; // I [LATIN CAPITAL LETTER I]
+      }
+    }
+    return Character.toUpperCase(codepoint);
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
index b66ad2793e52..f3474592f3b0 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -40,10 +40,13 @@
  * @see Automata#makeBinaryStringUnion(BytesRefIterator)
  */
 final class StringsToAutomaton {
+  private final boolean caseInsensitive;
+  private final boolean turkic;
 
   /** The default constructor is private. Use static methods directly. */
-  private StringsToAutomaton() {
-    super();
+  private StringsToAutomaton(boolean caseInsensitive, boolean turkic) {
+    this.caseInsensitive = caseInsensitive;
+    this.turkic = turkic;
   }
 
   /** DFSA state with <code>char</code> labels on transitions. */
@@ -198,7 +201,8 @@ private static int convert(
       Automaton.Builder a,
       State s,
       IdentityHashMap<State, Integer> visited,
-      boolean caseInsensitive) {
+      boolean caseInsensitive,
+      boolean turkic) {
 
     Integer converted = visited.get(s);
     if (converted != null) {
@@ -213,22 +217,12 @@ private static int convert(
     int[] labels = s.labels;
     for (StringsToAutomaton.State target : s.states) {
       int label = labels[i++];
-      int dest = convert(a, target, visited, caseInsensitive);
+      int dest = convert(a, target, visited, caseInsensitive, turkic);
       a.addTransition(converted, dest, label);
       if (caseInsensitive) {
-        int[] alternatives = CaseFolding.lookupAlternates(label);
-        if (alternatives != null) {
-          for (int alt : alternatives) {
-            a.addTransition(converted, dest, alt);
-          }
-        } else {
-          int altCase =
-              Character.isLowerCase(label)
-                  ? Character.toUpperCase(label)
-                  : Character.toLowerCase(label);
-          if (altCase != label) {
-            a.addTransition(converted, dest, altCase);
-          }
+        int altCase = CaseFolding.upperCase(label, turkic);
+        if (altCase != label) {
+          a.addTransition(converted, dest, altCase);
         }
       }
     }
@@ -240,7 +234,7 @@ private static int convert(
    * Called after adding all terms. Performs final minimization and converts to a standard {@link
    * Automaton} instance.
    */
-  private Automaton completeAndConvert(boolean caseInsensitive) {
+  private Automaton completeAndConvert() {
     // Final minimization:
     if (this.stateRegistry == null) throw new IllegalStateException();
     if (root.hasChildren()) replaceOrRegister(root);
@@ -248,7 +242,7 @@ private Automaton completeAndConvert(boolean caseInsensitive) {
 
     // Convert:
     Automaton.Builder a = new Automaton.Builder();
-    convert(a, root, new IdentityHashMap<>(), caseInsensitive);
+    convert(a, root, new IdentityHashMap<>(), caseInsensitive, turkic);
     return a.finish();
   }
 
@@ -258,17 +252,18 @@ private Automaton completeAndConvert(boolean caseInsensitive) {
    * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
    * asBinary}.
    */
-  static Automaton build(Iterable<BytesRef> input, boolean asBinary, boolean caseInsensitive) {
+  static Automaton build(
+      Iterable<BytesRef> input, boolean asBinary, boolean caseInsensitive, boolean turkic) {
     if (asBinary && caseInsensitive) {
       throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton");
     }
-    final StringsToAutomaton builder = new StringsToAutomaton();
+    final StringsToAutomaton builder = new StringsToAutomaton(caseInsensitive, turkic);
 
     for (BytesRef b : input) {
       builder.add(b, asBinary);
     }
 
-    return builder.completeAndConvert(caseInsensitive);
+    return builder.completeAndConvert();
   }
 
   /**
@@ -277,18 +272,19 @@ static Automaton build(Iterable<BytesRef> input, boolean asBinary, boolean caseI
    * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
    * asBinary}.
    */
-  static Automaton build(BytesRefIterator input, boolean asBinary, boolean caseInsensitive)
+  static Automaton build(
+      BytesRefIterator input, boolean asBinary, boolean caseInsensitive, boolean turkic)
       throws IOException {
     if (asBinary && caseInsensitive) {
       throw new IllegalArgumentException("Cannot use caseInsensitive on binary automaton");
     }
-    final StringsToAutomaton builder = new StringsToAutomaton();
+    final StringsToAutomaton builder = new StringsToAutomaton(caseInsensitive, turkic);
 
     for (BytesRef b = input.next(); b != null; b = input.next()) {
       builder.add(b, asBinary);
     }
 
-    return builder.completeAndConvert(caseInsensitive);
+    return builder.completeAndConvert();
   }
 
   private void add(BytesRef current, boolean asBinary) {
@@ -321,6 +317,10 @@ private void add(BytesRef current, boolean asBinary) {
     } else {
       while (pos < max) {
         codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
+        if (caseInsensitive
+            && codePoint.codePoint != CaseFolding.foldCase(codePoint.codePoint, turkic)) {
+          throw new IllegalArgumentException("Case-insensitive input must be lower-case");
+        }
         next = state.lastChild(codePoint.codePoint);
         if (next == null) {
           break;
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
index 5316f594a910..82fe68087f2f 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -27,6 +27,7 @@
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.TestUtil;
 import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
@@ -43,7 +44,7 @@ public void testBasic() throws Exception {
     List<BytesRef> terms = basicTerms();
     Collections.sort(terms);
 
-    Automaton a = build(terms, false, false);
+    Automaton a = build(terms, false);
     checkAutomaton(terms, a, false);
     checkMinimized(a);
   }
@@ -52,16 +53,27 @@ public void testCaseInsensitive() throws Exception {
     List<BytesRef> terms = basicTerms();
     Collections.sort(terms);
 
-    Automaton a = build(terms, false, true);
+    Automaton a = buildCaseInsensitive(terms, false);
     checkAutomaton(terms, a, false, true);
     checkMinimized(a);
   }
 
+  // With Turkic rules, dotted i pairs with İ (U+0130) and dotless ı (U+0131) pairs with I, so the
+  // two terms must not end up sharing a transition label after their common prefix.
+  public void testCornerCase() throws Exception {
+    List<BytesRef> terms =
+        Stream.of("aib", "aıc").map(LuceneTestCase::newBytesRef).sorted().toList();
+    Automaton a = buildCaseInsensitive(terms, true);
+    assertTrue(a.isDeterministic());
+    assertTrue(Operations.run(a, "AİB"));
+    assertTrue(Operations.run(a, "AIC"));
+  }
+
   public void testBasicBinary() throws Exception {
     List<BytesRef> terms = basicTerms();
     Collections.sort(terms);
 
-    Automaton a = build(terms, true, false);
+    Automaton a = build(terms, true);
     checkAutomaton(terms, a, true);
     checkMinimized(a);
   }
@@ -89,7 +101,7 @@ public void testRandomMinimized() throws Exception {
       Automaton expected =
           MinimizationOperations.minimize(
               Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
-      Automaton actual = build(sortedTerms, buildBinary, false);
+      Automaton actual = build(sortedTerms, buildBinary);
       assertSameAutomaton(expected, actual);
     }
   }
@@ -100,31 +112,22 @@ public void testRandomMinimizedCaseInsensitive() throws Exception {
       int size = random().nextInt(2, 50);
       Set<BytesRef> terms = new HashSet<>();
       List<Automaton> automatonList = new ArrayList<>(size);
+      boolean turkic = random().nextBoolean();
       for (int j = 0; j < size; j++) {
         String s = TestUtil.randomRealisticUnicodeString(random(), 8);
+        int[] lowercased = s.codePoints().map(c -> CaseFolding.foldCase(c, turkic)).toArray();
+        s = new String(lowercased, 0, lowercased.length);
         terms.add(newBytesRef(s));
         List<Automaton> charAutomata =
             s.codePoints()
                 .mapToObj(
                     c -> {
-                      List<Automaton> caseAutomata = new ArrayList<>();
-                      caseAutomata.add(Automata.makeChar(c));
-                      int[] alternates = CaseFolding.lookupAlternates(c);
-                      if (alternates != null) {
-                        for (int alt : alternates) {
-                          caseAutomata.add(Automata.makeChar(alt));
-                        }
-                      } else {
-                        int altCase =
-                            Character.isLowerCase(c)
-                                ? Character.toUpperCase(c)
-                                : Character.toLowerCase(c);
-
-                        if (altCase != c) {
-                          caseAutomata.add(Automata.makeChar(altCase));
-                        }
+                      Automaton a = Automata.makeChar(c);
+                      int altCase = CaseFolding.upperCase(c, turkic);
+                      if (altCase != c) {
+                        return Operations.union(List.of(a, Automata.makeChar(altCase)));
                       }
-                      return Operations.union(caseAutomata);
+                      return a;
                     })
                 .collect(Collectors.toList());
         if (charAutomata.isEmpty()) {
@@ -138,7 +141,7 @@ public void testRandomMinimizedCaseInsensitive() throws Exception {
       Automaton expected =
           MinimizationOperations.minimize(
               Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
-      Automaton actual = build(sortedTerms, false, true);
+      Automaton actual = buildCaseInsensitive(sortedTerms, turkic);
       assertSameAutomaton(expected, actual);
     }
   }
@@ -157,7 +160,7 @@ public void testLargeTerms() throws Exception {
     IllegalArgumentException e =
         expectThrows(
             IllegalArgumentException.class,
-            () -> build(Collections.singleton(new BytesRef(b10k)), false, false));
+            () -> build(Collections.singleton(new BytesRef(b10k)), false));
     assertTrue(
         e.getMessage()
             .startsWith(
@@ -166,7 +169,7 @@ public void testLargeTerms() throws Exception {
                     + " UTF-8 bytes"));
 
     byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);
-    build(Collections.singleton(new BytesRef(b1k)), false, false); // no exception
+    build(Collections.singleton(new BytesRef(b1k)), false); // no exception
   }
 
   private void testRandom(boolean allowBinary) throws Exception {
@@ -184,7 +187,7 @@ private void testRandom(boolean allowBinary) throws Exception {
       }
 
       List<BytesRef> sorted = terms.stream().sorted().toList();
-      Automaton a = build(sorted, allowBinary, false);
+      Automaton a = build(sorted, allowBinary);
       checkAutomaton(sorted, a, allowBinary);
     }
   }
@@ -238,12 +241,20 @@ private List<BytesRef> basicTerms() {
     return terms;
   }
 
-  private Automaton build(Collection<BytesRef> terms, boolean asBinary, boolean caseInsensitive)
+  private Automaton build(Collection<BytesRef> terms, boolean asBinary) throws IOException {
+    if (random().nextBoolean()) {
+      return StringsToAutomaton.build(terms, asBinary, false, false);
+    } else {
+      return StringsToAutomaton.build(new TermIterator(terms), asBinary, false, false);
+    }
+  }
+
+  private Automaton buildCaseInsensitive(Collection<BytesRef> terms, boolean turkic)
       throws IOException {
     if (random().nextBoolean()) {
-      return StringsToAutomaton.build(terms, asBinary, caseInsensitive);
+      return StringsToAutomaton.build(terms, false, true, turkic);
     } else {
-      return StringsToAutomaton.build(new TermIterator(terms), asBinary, caseInsensitive);
+      return StringsToAutomaton.build(new TermIterator(terms), false, true, turkic);
     }
   }