Commit 1dbaa1a

Merge branch 'main' of github.com:apache/lucene into hnsw_qset

2 parents 20a481f + de1ed71

233 files changed: +2885 -1672 lines

gradle/validation/error-prone.gradle

Lines changed: 99 additions & 1 deletion
Large diffs are not rendered by default.

lucene/CHANGES.txt

Lines changed: 22 additions & 3 deletions

@@ -46,11 +46,14 @@ API Changes
 * GITHUB#14134: Added Bits#applyMask API to help apply live docs as a mask on a
   bit set of matches. (Adrien Grand)
 
+* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and
+  concatenate(Automaton,Automaton) in favor of the methods taking List. (Robert Muir)
+
 New Features
 ---------------------
 
-* GITHUB#14084, GITHUB#13635, GITHUB#13634: Adds new `SeededKnnByteVectorQuery` and `SeededKnnFloatVectorQuery`
-  queries. These queries allow for the vector search entry points to be initialized via a `seed` query. This follows
+* GITHUB#14084, GITHUB#13635, GITHUB#13634, GITHUB#14170: Adds a new `SeededKnnVectorQuery` query.
+  This query allows the vector search entry points to be initialized via a `seed` query. This follows
   the research provided via https://arxiv.org/abs/2307.16779. (Sean MacAvaney, Ben Trent).
 
@@ -75,6 +78,11 @@ Improvements
   mergeFactor segments together when the merge is below the min merge size.
   (Adrien Grand)
 
+* GITHUB#14154: Add UnwrappingReuseStrategy for AnalyzerWrapper that consults
+  the wrapped analyzer's strategy to decide if components can be reused or need
+  to be updated. (Mayya Sharipova)
+
 Optimizations
 ---------------------
 
@@ -86,7 +94,16 @@ Optimizations
 * GITHUB#14133: Dense blocks of postings are now encoded as bit sets.
   (Adrien Grand)
 
-# GITHUB#14169: Optimize ContextQuery with big number of contexts. (Mayya Sharipova)
+* GITHUB#14169: Optimize ContextQuery with a big number of contexts. (Mayya Sharipova)
+
+* GITHUB#14181: Add updateable random scorer interface for knn vector index building. This allows
+  for fewer objects to be created during indexing and simplifies internally used interfaces.
+  (Ben Trent)
+
+* GITHUB#14193: Add Automata.makeCharSet() and makeCharClass() that return minimal DFA
+  for lists of characters and ranges. Use them in RegExp parser. (Robert Muir)
+
+* GITHUB#14176: Reduce when visiting bpv24-encoded doc ids in BKD leaves. (Guo Feng)
 
 * GITHUB#14094: Early terminate when HNSW nearest neighbor queue saturates (Tommaso Teofili)
 
@@ -101,6 +118,8 @@ Bug Fixes
 * GITHUB#14126: Avoid overflow in index input slices invariant checks
   (Chris Hegarty)
 
+* GITHUB#14215: Fix degenerate case in HNSW where all vectors have the same score. (Ben Trent)
+
 Other
 ---------------------
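To make the GITHUB#14209 deprecation above concrete, here is a hedged sketch — assuming the List-taking overloads of Operations.union and Operations.concatenate behave as drop-in replacements for the deprecated pairwise forms; the class name is hypothetical:

import java.util.List;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

public class AutomatonOpsDemo {
  public static void main(String[] args) {
    Automaton cat = Automata.makeString("cat");
    Automaton dog = Automata.makeString("dog");
    // Deprecated pairwise forms: Operations.union(cat, dog), Operations.concatenate(cat, dog).
    // Preferred per GITHUB#14209 (assumption): pass all operands at once as a List.
    Automaton either = Operations.union(List.of(cat, dog)); // matches "cat" or "dog"
    Automaton catDog = Operations.concatenate(List.of(cat, dog)); // matches "catdog"
    System.out.println(either.getNumStates() + " / " + catDog.getNumStates());
  }
}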

lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java

Lines changed: 1 addition & 1 deletion

@@ -225,7 +225,7 @@ protected void searchPatterns(char[] word, int index, byte[] il) {
       } else {
         q = lo[q];
 
-        /**
+        /*
          * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but java chars are
          * unsigned
          */
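This `/**` -> `/*` change recurs across many files in this commit: Javadoc-style comments that precede no declaration become plain block comments. A plausible motivation — an inference, not stated in the commit — is the expanded error-prone validation in gradle/validation/error-prone.gradle above, since static analysis commonly flags dangling Javadoc. A minimal illustration (class name hypothetical):

public class CommentStyleDemo {
  /** Javadoc belongs on declarations: this line documents the method below. */
  void documented() {}

  void body() {
    /* Inside a method body a plain block comment is correct; a Javadoc-style
       comment here attaches to no declaration and tools may flag it. */
  }
}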

lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java

Lines changed: 1 addition & 2 deletions

@@ -172,8 +172,7 @@ protected ArrayList<Object> normalizeException(ArrayList<?> ex) {
     ArrayList<Object> res = new ArrayList<>();
     for (int i = 0; i < ex.size(); i++) {
       Object item = ex.get(i);
-      if (item instanceof String) {
-        String str = (String) item;
+      if (item instanceof String str) {
         StringBuilder buf = new StringBuilder();
         for (int j = 0; j < str.length(); j++) {
           char c = str.charAt(j);
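The rewritten condition uses pattern matching for instanceof (standard since Java 16), which folds the type test, the cast, and the local variable into one step; the same change appears in OpenNLPSentenceBreakIterator.java below. A self-contained sketch (class name hypothetical):

public class InstanceofPatternDemo {
  public static void main(String[] args) {
    Object item = "example";
    // Old style: test, then cast explicitly.
    if (item instanceof String) {
      String str = (String) item;
      System.out.println(str.length());
    }
    // Pattern-matching style: `str` is bound only where the test succeeded.
    if (item instanceof String str) {
      System.out.println(str.length());
    }
  }
}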

lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemmer.java

Lines changed: 16 additions & 16 deletions

@@ -822,12 +822,12 @@ private boolean endsIn(char a, char b, char c, char d) {
   }
 
   private DictEntry wordInDict() {
-    /***
+    /*
      * if (matchedEntry != null) { if (dict_ht.get(word.getArray(), 0,
      * word.size()) != matchedEntry) {
      * System.out.println("Uh oh... cached entry doesn't match"); } return
      * matchedEntry; }
-     ***/
+     */
     if (matchedEntry != null) return matchedEntry;
     DictEntry e = dict_ht.get(word.getArray(), 0, word.length());
     if (e != null && !e.exception) {
@@ -861,11 +861,11 @@ private void plural() {
      * common
      */
 
-    /****
+    /*
      * YCS: this was the one place where lookup was not followed by return.
      * So restructure it. if ((j>0)&&(lookup(word.toString())) &&
      * !((word.charAt(j) == 's') && (word.charAt(j-1) == 's'))) return;
-     *****/
+     */
     boolean tryE = j > 0 && !((word.charAt(j) == 's') && (word.charAt(j - 1) == 's'));
     if (tryE && lookup()) return;
 
@@ -913,15 +913,15 @@ private void setSuff(String s, int len) {
   DictEntry matchedEntry = null;
 
   private boolean lookup() {
-    /******
+    /*
      * debugging code String thisLookup = word.toString(); boolean added =
      * lookups.add(thisLookup); if (!added) {
      * System.out.println("######extra lookup:" + thisLookup); // occaasional
      * extra lookups aren't necessarily errors... could happen by diff
      * manipulations // throw new RuntimeException("######extra lookup:" +
      * thisLookup); } else { // System.out.println("new lookup:" + thisLookup);
      * }
-     ******/
+     */
 
     matchedEntry = dict_ht.get(word.getArray(), 0, word.size());
     return matchedEntry != null;
@@ -1742,11 +1742,11 @@ int getLength() {
   String result;
 
   private boolean matched() {
-    /***
+    /*
      * if (!lookups.contains(word.toString())) { throw new
      * RuntimeException("didn't look up "+word.toString()+" prev="+prevLookup);
      * }
-     ***/
+     */
     // lookup();
     return matchedEntry != null;
   }
@@ -1772,13 +1772,13 @@ boolean stem(char[] term, int len) {
       return false;
     }
 
-    /***
+    /*
     * caching off is normally faster if (cache == null) initializeStemHash();
     *
     * // now check the cache, before we copy chars to "word" if (cache != null)
     * { String val = cache.get(term, 0, len); if (val != null) { if (val !=
     * SAME) { result = val; return true; } return false; } }
-    ***/
+    */
 
     word.reset();
     // allocate enough space so that an expansion is never needed
@@ -1792,9 +1792,9 @@ boolean stem(char[] term, int len) {
     }
 
     matchedEntry = null;
-    /***
+    /*
      * lookups.clear(); lookups.add(word.toString());
-     ***/
+     */
 
     /*
      * This while loop will never be executed more than one time; it is here
@@ -1851,20 +1851,20 @@ boolean stem(char[] term, int len) {
       result = entry.root; // may be null, which means that "word" is the stem
     }
 
-    /***
+    /*
      * caching off is normally faster if (cache != null && cache.size() <
      * maxCacheSize) { char[] key = new char[len]; System.arraycopy(term, 0,
      * key, 0, len); if (result != null) { cache.put(key, result); } else {
      * cache.put(key, word.toString()); } }
-     ***/
+     */
 
-    /***
+    /*
      * if (entry == null) { if (!word.toString().equals(new String(term,0,len)))
      * { System.out.println("CASE:" + word.toString() + "," + new
      * String(term,0,len));
      *
      * } }
-     ***/
+     */
 
     // no entry matched means result is "word"
     return true;

lucene/analysis/common/src/java/org/apache/lucene/analysis/minhash/MinHashFilter.java

Lines changed: 2 additions & 1 deletion

@@ -18,6 +18,7 @@
 package org.apache.lucene.analysis.minhash;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.TreeSet;
@@ -160,7 +161,7 @@ public final boolean incrementToken() throws IOException {
       String current = new String(termAttribute.buffer(), 0, termAttribute.length());
 
       for (int i = 0; i < hashCount; i++) {
-        byte[] bytes = current.getBytes("UTF-16LE");
+        byte[] bytes = current.getBytes(StandardCharsets.UTF_16LE);
         LongPair hash = new LongPair();
         murmurhash3_x64_128(bytes, 0, bytes.length, 0, hash);
         LongPair rehashed = combineOrdered(hash, getIntHash(i));
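The switch from getBytes("UTF-16LE") to getBytes(StandardCharsets.UTF_16LE) replaces a by-name charset lookup at runtime with a compile-time constant and drops the checked UnsupportedEncodingException that the String overload declares — which is also why the test signatures in TestMinHashFilter.java below lose their throws clauses. A small self-contained sketch (class name hypothetical):

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class CharsetLookupDemo {
  public static void main(String[] args) throws UnsupportedEncodingException {
    String s = "woof";
    // By-name form: charset resolved at runtime, declares a checked exception.
    byte[] byName = s.getBytes("UTF-16LE");
    // Constant form: resolved at compile time, no checked exception.
    byte[] byConstant = s.getBytes(StandardCharsets.UTF_16LE);
    System.out.println(Arrays.equals(byName, byConstant)); // true
  }
}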

lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java

Lines changed: 3 additions & 3 deletions

@@ -816,7 +816,7 @@ public boolean recursivelyValidate(
     return accept;
   }
 
-  /**
+  /*
    * This method checks if strings that lead to the accept state of the not flattened TokenStream
    * also lead to the accept state in the flattened TokenStream. This gets complicated when you
    * factor in holes. The FlattenGraphFilter will remove alternate paths that are made entirely of
@@ -840,7 +840,7 @@ public boolean recursivelyValidate(
     notFlattened.close();
   }*/
 
-  /**
+  /*
   * gets up to 10000 strings that lead to accept state in the given automaton.
   *
   * @param automaton automaton
@@ -856,7 +856,7 @@ public boolean recursivelyValidate(
     return acceptedSequences;
   }*/
 
-  /**
+  /*
   * @param automaton automaton to generate strings from
   * @param state state to start at
   * @param prefix string prefix

lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/TestMinHashFilter.java

Lines changed: 4 additions & 4 deletions

@@ -19,7 +19,7 @@
 
 import java.io.IOException;
 import java.io.StringReader;
-import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -47,16 +47,16 @@ public void testIntHash() {
   }
 
   @Test
-  public void testStringHash() throws UnsupportedEncodingException {
+  public void testStringHash() {
     LongPair hash = new LongPair();
-    byte[] bytes = "woof woof woof woof woof".getBytes("UTF-16LE");
+    byte[] bytes = "woof woof woof woof woof".getBytes(StandardCharsets.UTF_16LE);
     MinHashFilter.murmurhash3_x64_128(bytes, 0, bytes.length, 0, hash);
     assertEquals(7638079586852243959L, hash.val1);
     assertEquals(4378804943379391304L, hash.val2);
   }
 
   @Test
-  public void testSimpleOrder() throws UnsupportedEncodingException {
+  public void testSimpleOrder() {
     LongPair hash1 = new LongPair();
     hash1.val1 = 1;
     hash1.val2 = 2;

lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/ViterbiNBest.java

Lines changed: 1 addition & 1 deletion

@@ -179,7 +179,7 @@ void setGraphvizFormatter(GraphvizFormatter<JaMorphData> dotOut) {
   protected void backtrace(Position endPosData, int fromIDX) throws IOException {
     final int endPos = endPosData.getPos();
 
-    /**
+    /*
      * LUCENE-10059: If the endPos is the same as lastBackTracePos, we don't want to backtrace to
      * avoid an assertion error {@link RollingCharBuffer#get(int)} when it tries to generate an
      * empty buffer

lucene/analysis/opennlp/build.gradle

Lines changed: 29 additions & 0 deletions

@@ -26,3 +26,32 @@ dependencies {
 
   moduleTestImplementation project(':lucene:test-framework')
 }
+
+ext {
+  testModelDataDir = file('src/tools/test-model-data')
+  testsUserDir = file('src/test-files')
+  testModelDir = file("${testsUserDir}/org/apache/lucene/analysis/opennlp")
+}
+
+tasks.register('trainTestModels') {
+  description = 'Train all small test models for unit tests'
+  doLast {
+    mkdir testModelDir
+    trainModel('SentenceDetectorTrainer', 'en', 'sentences.txt', 'en-test-sent.bin')
+    trainModel('TokenizerTrainer', 'en', 'tokenizer.txt', 'en-test-tokenizer.bin')
+    trainModel('POSTaggerTrainer', 'en', 'pos.txt', 'en-test-pos-maxent.bin')
+    trainModel('ChunkerTrainerME', 'en', 'chunks.txt', 'en-test-chunker.bin')
+    trainModel('TokenNameFinderTrainer', 'en', 'ner.txt', 'en-test-ner.bin', ['-params', 'ner_TrainerParams.txt'])
+    trainModel('LemmatizerTrainerME', 'en', 'lemmas.txt', 'en-test-lemmatizer.bin')
+  }
+}
+
+def trainModel(String command, String lang, String data, String model, List extraArgs = []) {
+  javaexec {
+    classpath = sourceSets.main.compileClasspath
+    mainClass = 'opennlp.tools.cmdline.CLI'
+    workingDir = testModelDataDir
+    args = [command, '-lang', lang, '-data', data, '-model', "${testModelDir}/${model}"] + extraArgs
+  }
+}
+
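Each trainModel call here shells out to OpenNLP's command-line entry point (opennlp.tools.cmdline.CLI) via Gradle's javaexec, reading training data from src/tools/test-model-data and writing the small test models into src/test-files/org/apache/lucene/analysis/opennlp. Assuming the module path implied by this file's location, the task would be run from the repository root as gradlew :lucene:analysis:opennlp:trainTestModels.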

lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPSentenceBreakIterator.java

Lines changed: 1 addition & 2 deletions

@@ -206,8 +206,7 @@ public void setText(CharacterIterator newText) {
 
   private String characterIteratorToString() {
     String fullText;
-    if (text instanceof CharArrayIterator) {
-      CharArrayIterator charArrayIterator = (CharArrayIterator) text;
+    if (text instanceof CharArrayIterator charArrayIterator) {
       fullText =
           new String(
               charArrayIterator.getText(),

lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java

Lines changed: 1 addition & 1 deletion

@@ -58,7 +58,7 @@ public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
     8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58
   };
   private static final String[] SENTENCES_chunks = {
-    "B-NP", "I-NP", "I-NP", "I-NP", "I-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP",
+    "B-NP", "I-NP", "I-NP", "B-VP", "B-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP",
     "I-NP", "O"
   };

lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java

Lines changed: 11 additions & 16 deletions

@@ -34,45 +34,40 @@
 public class AnalyzerProfile {
 
   /** Global indicating the configured analysis data directory */
-  public static String ANALYSIS_DATA_DIR = "";
+  public static final String ANALYSIS_DATA_DIR = resolveDataDir();
 
-  static {
-    init();
-  }
-
-  private static void init() {
+  private static String resolveDataDir() {
     String dirName = "analysis-data";
     String propName = "analysis.properties";
 
     // Try the system property:-Danalysis.data.dir=/path/to/analysis-data
-    ANALYSIS_DATA_DIR = System.getProperty("analysis.data.dir", "");
-    if (ANALYSIS_DATA_DIR.length() != 0) return;
+    String analysisDataDir = System.getProperty("analysis.data.dir", "");
+    if (analysisDataDir.isEmpty() == false) return analysisDataDir;
 
+    Path lib = Paths.get("lib");
     Path[] candidateFiles =
         new Path[] {
-          Paths.get(dirName),
-          Paths.get("lib").resolve(dirName),
-          Paths.get(propName),
-          Paths.get("lib").resolve(propName)
+          Paths.get(dirName), lib.resolve(dirName), Paths.get(propName), lib.resolve(propName)
         };
     for (Path file : candidateFiles) {
       if (Files.exists(file)) {
         if (Files.isDirectory(file)) {
-          ANALYSIS_DATA_DIR = file.toAbsolutePath().toString();
-        } else if (Files.isRegularFile(file) && getAnalysisDataDir(file).length() != 0) {
-          ANALYSIS_DATA_DIR = getAnalysisDataDir(file).toString();
+          analysisDataDir = file.toAbsolutePath().toString();
+        } else if (Files.isRegularFile(file) && getAnalysisDataDir(file).isEmpty() == false) {
+          analysisDataDir = getAnalysisDataDir(file);
         }
         break;
       }
     }
 
-    if (ANALYSIS_DATA_DIR.length() == 0) {
+    if (analysisDataDir.isEmpty()) {
      // Dictionary directory cannot be found.
      throw new RuntimeException(
          "WARNING: Can not find lexical dictionary directory!"
              + " This will cause unpredictable exceptions in your application!"
              + " Please refer to the manual to download the dictionaries.");
    }
+    return analysisDataDir;
  }
 
  private static String getAnalysisDataDir(Path propFile) {
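The change above is a standard hardening pattern: a mutable public static field assigned inside a static block becomes a public static final field computed exactly once by a private method, so it cannot be reassigned after class initialization. A generic sketch of the pattern — names hypothetical, not the Lucene API:

public class Config {
  // Before: public static String DATA_DIR = ""; static { DATA_DIR = resolve(); }
  // After: immutable, initialized once when the class loads.
  public static final String DATA_DIR = resolveDataDir();

  private static String resolveDataDir() {
    // Returning a value avoids mutating a global from a helper method.
    String fromProperty = System.getProperty("data.dir", "");
    return fromProperty.isEmpty() ? "/default/path" : fromProperty;
  }
}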

lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/Lucene60PointsWriter.java

Lines changed: 1 addition & 1 deletion

@@ -146,7 +146,7 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
 
   @Override
   public void merge(MergeState mergeState) throws IOException {
-    /**
+    /*
      * If indexSort is activated and some of the leaves are not sorted the next test will catch that
      * and the non-optimized merge will run. If the readers are all sorted then it's safe to perform
      * a bulk merge of the points.

lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/bkd/BKDWriter60.java

Lines changed: 1 addition & 1 deletion

@@ -1074,7 +1074,7 @@ private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws I
     }
   }
 
-  /** Reused while packing the index */
+  /* Reused while packing the index */
   ByteBuffersDataOutput writeBuffer = new ByteBuffersDataOutput();
 
   // This is the "file" we append the byte[] to:

0 commit comments