Commit 1dbaa1a

Merge branch 'main' of github.com:apache/lucene into hnsw_qset

2 parents 20a481f + de1ed71

233 files changed: +2885 -1672 lines

gradle/validation/error-prone.gradle

Lines changed: 99 additions & 1 deletion
Large diffs are not rendered by default.

lucene/CHANGES.txt

Lines changed: 22 additions & 3 deletions

@@ -46,11 +46,14 @@ API Changes
 * GITHUB#14134: Added Bits#applyMask API to help apply live docs as a mask on a
   bit set of matches. (Adrien Grand)
 
+* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and
+  concatenate(Automaton,Automaton) in favor of the methods taking List. (Robert Muir)
+
 New Features
 ---------------------
 
-* GITHUB#14084, GITHUB#13635, GITHUB#13634: Adds new `SeededKnnByteVectorQuery` and `SeededKnnFloatVectorQuery`
-  queries. These queries allow for the vector search entry points to be initialized via a `seed` query. This follows
+* GITHUB#14084, GITHUB#13635, GITHUB#13634, GITHUB#14170: Adds a new `SeededKnnVectorQuery` query.
+  This query allows the vector search entry points to be initialized via a `seed` query. This follows
   the research provided via https://arxiv.org/abs/2307.16779. (Sean MacAvaney, Ben Trent).
 
@@ -75,6 +78,11 @@ Improvements
   mergeFactor segments together when the merge is below the min merge size.
   (Adrien Grand)
 
+* GITHUB#14154: Add UnwrappingReuseStrategy for AnalyzerWrapper that consults
+  the wrapped analyzer's strategy to decide if components can be reused or need
+  to be updated. (Mayya Sharipova)
+
 Optimizations
 ---------------------
 
@@ -86,7 +94,16 @@ Optimizations
 * GITHUB#14133: Dense blocks of postings are now encoded as bit sets.
   (Adrien Grand)
 
-# GITHUB#14169: Optimize ContextQuery with big number of contexts. (Mayya Sharipova)
+* GITHUB#14169: Optimize ContextQuery with a big number of contexts. (Mayya Sharipova)
+
+* GITHUB#14181: Add updateable random scorer interface for knn vector index building. This allows
+  for fewer objects to be created during indexing and simplifies internally used interfaces.
+  (Ben Trent)
+
+* GITHUB#14193: Add Automata.makeCharSet() and makeCharClass() that return minimal DFA
+  for lists of characters and ranges. Use them in RegExp parser. (Robert Muir)
+
+* GITHUB#14176: Reduce when visiting bpv24-encoded doc ids in BKD leaves. (Guo Feng)
 
 * GITHUB#14094: Early terminate when HNSW nearest neighbor queue saturates (Tommaso Teofili)
 
@@ -101,6 +118,8 @@ Bug Fixes
 * GITHUB#14126: Avoid overflow in index input slices invariant checks
   (Chris Hegarty)
 
+* GITHUB#14215: Fix degenerate case in HNSW where all vectors have the same score. (Ben Trent)
+
 Other
 ---------------------
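To make the GITHUB#14209 deprecation above concrete, here is a hedged sketch — assuming the List-taking overloads of Operations.union and Operations.concatenate behave as drop-in replacements for the deprecated pairwise forms; the class name is hypothetical:

import java.util.List;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

public class AutomatonOpsDemo {
  public static void main(String[] args) {
    Automaton cat = Automata.makeString("cat");
    Automaton dog = Automata.makeString("dog");
    // Deprecated pairwise forms: Operations.union(cat, dog), Operations.concatenate(cat, dog).
    // Preferred per GITHUB#14209 (assumption): pass all operands at once as a List.
    Automaton either = Operations.union(List.of(cat, dog)); // matches "cat" or "dog"
    Automaton catDog = Operations.concatenate(List.of(cat, dog)); // matches "catdog"
    System.out.println(either.getNumStates() + " / " + catDog.getNumStates());
  }
}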

lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java

Lines changed: 1 addition & 1 deletion

@@ -225,7 +225,7 @@ protected void searchPatterns(char[] word, int index, byte[] il) {
       } else {
         q = lo[q];
 
-        /**
+        /*
          * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but java chars are
          * unsigned
          */
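This `/**` -> `/*` change recurs across many files in this commit: Javadoc-style comments that precede no declaration become plain block comments. A plausible motivation — an inference, not stated in the commit — is the expanded error-prone validation in gradle/validation/error-prone.gradle above, since static analysis commonly flags dangling Javadoc. A minimal illustration (class name hypothetical):

public class CommentStyleDemo {
  /** Javadoc belongs on declarations: this line documents the method below. */
  void documented() {}

  void body() {
    /* Inside a method body a plain block comment is correct; a Javadoc-style
       comment here attaches to no declaration and tools may flag it. */
  }
}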

lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java

Lines changed: 1 addition & 2 deletions

@@ -172,8 +172,7 @@ protected ArrayList<Object> normalizeException(ArrayList<?> ex) {
     ArrayList<Object> res = new ArrayList<>();
     for (int i = 0; i < ex.size(); i++) {
       Object item = ex.get(i);
-      if (item instanceof String) {
-        String str = (String) item;
+      if (item instanceof String str) {
         StringBuilder buf = new StringBuilder();
         for (int j = 0; j < str.length(); j++) {
           char c = str.charAt(j);
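The rewritten condition uses pattern matching for instanceof (standard since Java 16), which folds the type test, the cast, and the local variable into one step; the same change appears in OpenNLPSentenceBreakIterator.java below. A self-contained sketch (class name hypothetical):

public class InstanceofPatternDemo {
  public static void main(String[] args) {
    Object item = "example";
    // Old style: test, then cast explicitly.
    if (item instanceof String) {
      String str = (String) item;
      System.out.println(str.length());
    }
    // Pattern-matching style: `str` is bound only where the test succeeded.
    if (item instanceof String str) {
      System.out.println(str.length());
    }
  }
}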

lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemmer.java

Lines changed: 16 additions & 16 deletions

@@ -822,12 +822,12 @@ private boolean endsIn(char a, char b, char c, char d) {
   }
 
   private DictEntry wordInDict() {
-    /***
+    /*
      * if (matchedEntry != null) { if (dict_ht.get(word.getArray(), 0,
      * word.size()) != matchedEntry) {
      * System.out.println("Uh oh... cached entry doesn't match"); } return
      * matchedEntry; }
-     ***/
+     */
     if (matchedEntry != null) return matchedEntry;
     DictEntry e = dict_ht.get(word.getArray(), 0, word.length());
     if (e != null && !e.exception) {
@@ -861,11 +861,11 @@ private void plural() {
      * common
      */
 
-    /****
+    /*
      * YCS: this was the one place where lookup was not followed by return.
      * So restructure it. if ((j>0)&&(lookup(word.toString())) &&
      * !((word.charAt(j) == 's') && (word.charAt(j-1) == 's'))) return;
-     *****/
+     */
     boolean tryE = j > 0 && !((word.charAt(j) == 's') && (word.charAt(j - 1) == 's'));
     if (tryE && lookup()) return;
 
@@ -913,15 +913,15 @@ private void setSuff(String s, int len) {
   DictEntry matchedEntry = null;
 
   private boolean lookup() {
-    /******
+    /*
      * debugging code String thisLookup = word.toString(); boolean added =
      * lookups.add(thisLookup); if (!added) {
      * System.out.println("######extra lookup:" + thisLookup); // occaasional
      * extra lookups aren't necessarily errors... could happen by diff
      * manipulations // throw new RuntimeException("######extra lookup:" +
      * thisLookup); } else { // System.out.println("new lookup:" + thisLookup);
      * }
-     ******/
+     */
 
     matchedEntry = dict_ht.get(word.getArray(), 0, word.size());
     return matchedEntry != null;
@@ -1742,11 +1742,11 @@ int getLength() {
   String result;
 
   private boolean matched() {
-    /***
+    /*
      * if (!lookups.contains(word.toString())) { throw new
      * RuntimeException("didn't look up "+word.toString()+" prev="+prevLookup);
      * }
-     ***/
+     */
     // lookup();
     return matchedEntry != null;
   }
@@ -1772,13 +1772,13 @@ boolean stem(char[] term, int len) {
       return false;
     }
 
-    /***
+    /*
     * caching off is normally faster if (cache == null) initializeStemHash();
     *
     * // now check the cache, before we copy chars to "word" if (cache != null)
     * { String val = cache.get(term, 0, len); if (val != null) { if (val !=
     * SAME) { result = val; return true; } return false; } }
-    ***/
+    */
 
     word.reset();
     // allocate enough space so that an expansion is never needed
@@ -1792,9 +1792,9 @@ boolean stem(char[] term, int len) {
     }
 
     matchedEntry = null;
-    /***
+    /*
      * lookups.clear(); lookups.add(word.toString());
-     ***/
+     */
 
     /*
      * This while loop will never be executed more than one time; it is here
@@ -1851,20 +1851,20 @@ boolean stem(char[] term, int len) {
       result = entry.root; // may be null, which means that "word" is the stem
     }
 
-    /***
+    /*
      * caching off is normally faster if (cache != null && cache.size() <
      * maxCacheSize) { char[] key = new char[len]; System.arraycopy(term, 0,
      * key, 0, len); if (result != null) { cache.put(key, result); } else {
      * cache.put(key, word.toString()); } }
-     ***/
+     */
 
-    /***
+    /*
      * if (entry == null) { if (!word.toString().equals(new String(term,0,len)))
      * { System.out.println("CASE:" + word.toString() + "," + new
      * String(term,0,len));
      *
      * } }
-     ***/
+     */
 
     // no entry matched means result is "word"
     return true;

lucene/analysis/common/src/java/org/apache/lucene/analysis/minhash/MinHashFilter.java

Lines changed: 2 additions & 1 deletion

@@ -18,6 +18,7 @@
 package org.apache.lucene.analysis.minhash;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.TreeSet;
@@ -160,7 +161,7 @@ public final boolean incrementToken() throws IOException {
       String current = new String(termAttribute.buffer(), 0, termAttribute.length());
 
       for (int i = 0; i < hashCount; i++) {
-        byte[] bytes = current.getBytes("UTF-16LE");
+        byte[] bytes = current.getBytes(StandardCharsets.UTF_16LE);
         LongPair hash = new LongPair();
         murmurhash3_x64_128(bytes, 0, bytes.length, 0, hash);
         LongPair rehashed = combineOrdered(hash, getIntHash(i));
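The switch from getBytes("UTF-16LE") to getBytes(StandardCharsets.UTF_16LE) replaces a by-name charset lookup at runtime with a compile-time constant and drops the checked UnsupportedEncodingException that the String overload declares — which is also why the test signatures in TestMinHashFilter.java below lose their throws clauses. A small self-contained sketch (class name hypothetical):

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class CharsetLookupDemo {
  public static void main(String[] args) throws UnsupportedEncodingException {
    String s = "woof";
    // By-name form: charset resolved at runtime, declares a checked exception.
    byte[] byName = s.getBytes("UTF-16LE");
    // Constant form: resolved at compile time, no checked exception.
    byte[] byConstant = s.getBytes(StandardCharsets.UTF_16LE);
    System.out.println(Arrays.equals(byName, byConstant)); // true
  }
}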

lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java

Lines changed: 3 additions & 3 deletions

@@ -816,7 +816,7 @@ public boolean recursivelyValidate(
     return accept;
   }
 
-  /**
+  /*
    * This method checks if strings that lead to the accept state of the not flattened TokenStream
    * also lead to the accept state in the flattened TokenStream. This gets complicated when you
    * factor in holes. The FlattenGraphFilter will remove alternate paths that are made entirely of
@@ -840,7 +840,7 @@ public boolean recursivelyValidate(
     notFlattened.close();
   }*/
 
-  /**
+  /*
   * gets up to 10000 strings that lead to accept state in the given automaton.
   *
   * @param automaton automaton
@@ -856,7 +856,7 @@ public boolean recursivelyValidate(
     return acceptedSequences;
   }*/
 
-  /**
+  /*
   * @param automaton automaton to generate strings from
   * @param state state to start at
   * @param prefix string prefix

lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/TestMinHashFilter.java

Lines changed: 4 additions & 4 deletions

@@ -19,7 +19,7 @@
 
 import java.io.IOException;
 import java.io.StringReader;
-import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -47,16 +47,16 @@ public void testIntHash() {
   }
 
   @Test
-  public void testStringHash() throws UnsupportedEncodingException {
+  public void testStringHash() {
     LongPair hash = new LongPair();
-    byte[] bytes = "woof woof woof woof woof".getBytes("UTF-16LE");
+    byte[] bytes = "woof woof woof woof woof".getBytes(StandardCharsets.UTF_16LE);
     MinHashFilter.murmurhash3_x64_128(bytes, 0, bytes.length, 0, hash);
     assertEquals(7638079586852243959L, hash.val1);
     assertEquals(4378804943379391304L, hash.val2);
   }
 
   @Test
-  public void testSimpleOrder() throws UnsupportedEncodingException {
+  public void testSimpleOrder() {
     LongPair hash1 = new LongPair();
     hash1.val1 = 1;
     hash1.val2 = 2;

lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/ViterbiNBest.java

Lines changed: 1 addition & 1 deletion

@@ -179,7 +179,7 @@ void setGraphvizFormatter(GraphvizFormatter<JaMorphData> dotOut) {
   protected void backtrace(Position endPosData, int fromIDX) throws IOException {
     final int endPos = endPosData.getPos();
 
-    /**
+    /*
      * LUCENE-10059: If the endPos is the same as lastBackTracePos, we don't want to backtrace to
      * avoid an assertion error {@link RollingCharBuffer#get(int)} when it tries to generate an
      * empty buffer

lucene/analysis/opennlp/build.gradle

Lines changed: 29 additions & 0 deletions

@@ -26,3 +26,32 @@ dependencies {
 
   moduleTestImplementation project(':lucene:test-framework')
 }
+
+ext {
+  testModelDataDir = file('src/tools/test-model-data')
+  testsUserDir = file('src/test-files')
+  testModelDir = file("${testsUserDir}/org/apache/lucene/analysis/opennlp")
+}
+
+tasks.register('trainTestModels') {
+  description = 'Train all small test models for unit tests'
+  doLast {
+    mkdir testModelDir
+    trainModel('SentenceDetectorTrainer', 'en', 'sentences.txt', 'en-test-sent.bin')
+    trainModel('TokenizerTrainer', 'en', 'tokenizer.txt', 'en-test-tokenizer.bin')
+    trainModel('POSTaggerTrainer', 'en', 'pos.txt', 'en-test-pos-maxent.bin')
+    trainModel('ChunkerTrainerME', 'en', 'chunks.txt', 'en-test-chunker.bin')
+    trainModel('TokenNameFinderTrainer', 'en', 'ner.txt', 'en-test-ner.bin', ['-params', 'ner_TrainerParams.txt'])
+    trainModel('LemmatizerTrainerME', 'en', 'lemmas.txt', 'en-test-lemmatizer.bin')
+  }
+}
+
+def trainModel(String command, String lang, String data, String model, List extraArgs = []) {
+  javaexec {
+    classpath = sourceSets.main.compileClasspath
+    mainClass = 'opennlp.tools.cmdline.CLI'
+    workingDir = testModelDataDir
+    args = [command, '-lang', lang, '-data', data, '-model', "${testModelDir}/${model}"] + extraArgs
+  }
+}
+
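Each trainModel call here shells out to OpenNLP's command-line entry point (opennlp.tools.cmdline.CLI) via Gradle's javaexec, reading training data from src/tools/test-model-data and writing the small test models into src/test-files/org/apache/lucene/analysis/opennlp. Assuming the module path implied by this file's location, the task would be run from the repository root as gradlew :lucene:analysis:opennlp:trainTestModels.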

lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPSentenceBreakIterator.java

Lines changed: 1 addition & 2 deletions

@@ -206,8 +206,7 @@ public void setText(CharacterIterator newText) {
 
   private String characterIteratorToString() {
     String fullText;
-    if (text instanceof CharArrayIterator) {
-      CharArrayIterator charArrayIterator = (CharArrayIterator) text;
+    if (text instanceof CharArrayIterator charArrayIterator) {
       fullText =
           new String(
               charArrayIterator.getText(),

lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java

Lines changed: 1 addition & 1 deletion

@@ -58,7 +58,7 @@ public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
     8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58
   };
   private static final String[] SENTENCES_chunks = {
-    "B-NP", "I-NP", "I-NP", "I-NP", "I-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP",
+    "B-NP", "I-NP", "I-NP", "B-VP", "B-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP",
     "I-NP", "O"
   };

lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java

Lines changed: 11 additions & 16 deletions

@@ -34,45 +34,40 @@
 public class AnalyzerProfile {
 
   /** Global indicating the configured analysis data directory */
-  public static String ANALYSIS_DATA_DIR = "";
+  public static final String ANALYSIS_DATA_DIR = resolveDataDir();
 
-  static {
-    init();
-  }
-
-  private static void init() {
+  private static String resolveDataDir() {
     String dirName = "analysis-data";
     String propName = "analysis.properties";
 
     // Try the system property:-Danalysis.data.dir=/path/to/analysis-data
-    ANALYSIS_DATA_DIR = System.getProperty("analysis.data.dir", "");
-    if (ANALYSIS_DATA_DIR.length() != 0) return;
+    String analysisDataDir = System.getProperty("analysis.data.dir", "");
+    if (analysisDataDir.isEmpty() == false) return analysisDataDir;
 
+    Path lib = Paths.get("lib");
     Path[] candidateFiles =
         new Path[] {
-          Paths.get(dirName),
-          Paths.get("lib").resolve(dirName),
-          Paths.get(propName),
-          Paths.get("lib").resolve(propName)
+          Paths.get(dirName), lib.resolve(dirName), Paths.get(propName), lib.resolve(propName)
         };
     for (Path file : candidateFiles) {
       if (Files.exists(file)) {
         if (Files.isDirectory(file)) {
-          ANALYSIS_DATA_DIR = file.toAbsolutePath().toString();
-        } else if (Files.isRegularFile(file) && getAnalysisDataDir(file).length() != 0) {
-          ANALYSIS_DATA_DIR = getAnalysisDataDir(file).toString();
+          analysisDataDir = file.toAbsolutePath().toString();
+        } else if (Files.isRegularFile(file) && getAnalysisDataDir(file).isEmpty() == false) {
+          analysisDataDir = getAnalysisDataDir(file);
         }
         break;
       }
     }
 
-    if (ANALYSIS_DATA_DIR.length() == 0) {
+    if (analysisDataDir.isEmpty()) {
      // Dictionary directory cannot be found.
      throw new RuntimeException(
          "WARNING: Can not find lexical dictionary directory!"
              + " This will cause unpredictable exceptions in your application!"
              + " Please refer to the manual to download the dictionaries.");
    }
+    return analysisDataDir;
  }
 
  private static String getAnalysisDataDir(Path propFile) {
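The change above is a standard hardening pattern: a mutable public static field assigned inside a static block becomes a public static final field computed exactly once by a private method, so it cannot be reassigned after class initialization. A generic sketch of the pattern — names hypothetical, not the Lucene API:

public class Config {
  // Before: public static String DATA_DIR = ""; static { DATA_DIR = resolve(); }
  // After: immutable, initialized once when the class loads.
  public static final String DATA_DIR = resolveDataDir();

  private static String resolveDataDir() {
    // Returning a value avoids mutating a global from a helper method.
    String fromProperty = System.getProperty("data.dir", "");
    return fromProperty.isEmpty() ? "/default/path" : fromProperty;
  }
}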

lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/Lucene60PointsWriter.java

Lines changed: 1 addition & 1 deletion

@@ -146,7 +146,7 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
 
   @Override
   public void merge(MergeState mergeState) throws IOException {
-    /**
+    /*
      * If indexSort is activated and some of the leaves are not sorted the next test will catch that
      * and the non-optimized merge will run. If the readers are all sorted then it's safe to perform
      * a bulk merge of the points.

lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/bkd/BKDWriter60.java

Lines changed: 1 addition & 1 deletion

@@ -1074,7 +1074,7 @@ private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws I
     }
   }
 
-  /** Reused while packing the index */
+  /* Reused while packing the index */
   ByteBuffersDataOutput writeBuffer = new ByteBuffersDataOutput();
 
   // This is the "file" we append the byte[] to:

0 commit comments