Skip to content

Commit fe913e5

Browse files
authored
Make Lucene better at skipping long runs of matches. (#14312)
This is an attempt to resurrect #12194 in a (hopefully) better way. Now that many queries run with `DenseConjunctionBulkScorer`, which scores one window of doc IDs at a time, it becomes natural to skip clauses that have long runs of matches by checking if they match the whole window. This introduces `DocIdSetIterator#docIDRunEnd()`, the same API that PR #12194 suggested under the name `peekNextNonMatchingDocID()`, implements it in `DocIdSetIterator#all`, and uses it in `DenseConjunctionBulkScorer` to skip clauses that match the whole window. For better test coverage, `DenseConjunctionBulkScorer` was refactored to require at least one iterator, which can be a `DocIdSetIterator#all` instance if all docs match. In follow-ups, we should look into supporting other queries that are likely to have long runs of matches, in particular doc-value range queries on fields that are part of the index sort and take advantage of a doc-value skipper. Closes #11915
1 parent 00e507e commit fe913e5

14 files changed

+370
-140
lines changed

lucene/CHANGES.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ API Changes
7474

7575
* GITHUB#14237: Add utils for sandbox facets module for the most common tasks. (Egor Potemkin)
7676

77+
* GITHUB#14312: Added DocIdSetIterator#docIDRunEnd to let iterators tell when
78+
they have runs of matches. This is then leveraged by conjunctions to
79+
automatically ignore clauses that have long runs of matches over these runs,
80+
resulting in better efficiency. (Adrien Grand)
81+
7782
New Features
7883
---------------------
7984

lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
import org.apache.lucene.search.DocValuesRangeIterator;
3232
import org.apache.lucene.search.FieldExistsQuery;
3333
import org.apache.lucene.search.IndexSearcher;
34-
import org.apache.lucene.search.MatchAllScorerSupplier;
3534
import org.apache.lucene.search.MatchNoDocsQuery;
3635
import org.apache.lucene.search.Query;
3736
import org.apache.lucene.search.QueryVisitor;
@@ -127,7 +126,7 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
127126
&& skipper.minValue() >= lowerValue
128127
&& skipper.maxValue() <= upperValue) {
129128

130-
return new MatchAllScorerSupplier(score(), scoreMode, maxDoc);
129+
return ConstantScoreScorerSupplier.matchAll(score(), scoreMode, maxDoc);
131130
}
132131
}
133132

lucene/core/src/java/org/apache/lucene/search/ConstantScoreScorerSupplier.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@
2727
*/
2828
public abstract class ConstantScoreScorerSupplier extends ScorerSupplier {
2929

30+
/** Create a {@link ConstantScoreScorerSupplier} that matches all docs in [0, maxDoc). */
31+
public static ConstantScoreScorerSupplier matchAll(float score, ScoreMode scoreMode, int maxDoc) {
32+
return fromIterator(DocIdSetIterator.all(maxDoc), score, scoreMode, maxDoc);
33+
}
34+
3035
/** Create a {@link ConstantScoreScorerSupplier} for the given iterator. */
3136
public static ConstantScoreScorerSupplier fromIterator(
3237
DocIdSetIterator iterator, float score, ScoreMode scoreMode, int maxDoc) {

lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java

Lines changed: 81 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@
2727
* BulkScorer implementation of {@link ConjunctionScorer} that is specialized for dense clauses.
2828
* Whenever sensible, it intersects clauses by loading their matches into a bit set and computing
2929
* the intersection of clauses by and-ing these bit sets.
30-
*
31-
* <p>An empty set of iterators is interpreted as meaning that all docs in [0, maxDoc) match.
3230
*/
3331
final class DenseConjunctionBulkScorer extends BulkScorer {
3432

@@ -46,12 +44,16 @@ final class DenseConjunctionBulkScorer extends BulkScorer {
4644

4745
private final FixedBitSet windowMatches = new FixedBitSet(WINDOW_SIZE);
4846
private final FixedBitSet clauseWindowMatches = new FixedBitSet(WINDOW_SIZE);
47+
private final List<DocIdSetIterator> windowIterators = new ArrayList<>();
4948
private final DocIdStreamView docIdStreamView = new DocIdStreamView();
5049
private final RangeDocIdStream rangeDocIdStream = new RangeDocIdStream();
5150
private final SingleIteratorDocIdStream singleIteratorDocIdStream =
5251
new SingleIteratorDocIdStream();
5352

5453
DenseConjunctionBulkScorer(List<DocIdSetIterator> iterators, int maxDoc, float constantScore) {
54+
if (iterators.isEmpty()) {
55+
throw new IllegalArgumentException("Expected one or more iterators, got 0");
56+
}
5557
this.maxDoc = maxDoc;
5658
iterators = new ArrayList<>(iterators);
5759
iterators.sort(Comparator.comparingLong(DocIdSetIterator::cost));
@@ -63,6 +65,7 @@ final class DenseConjunctionBulkScorer extends BulkScorer {
6365
@Override
6466
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
6567
collector.setScorer(scorable);
68+
6669
List<DocIdSetIterator> iterators = this.iterators;
6770
if (collector.competitiveIterator() != null) {
6871
iterators = new ArrayList<>(iterators);
@@ -75,37 +78,24 @@ public int score(LeafCollector collector, Bits acceptDocs, int min, int max) thr
7578

7679
max = Math.min(max, maxDoc);
7780

78-
DocIdSetIterator lead = null;
79-
if (iterators.isEmpty() == false) {
80-
lead = iterators.get(0);
81-
if (lead.docID() < min) {
82-
min = lead.advance(min);
83-
}
81+
DocIdSetIterator lead = iterators.get(0);
82+
if (lead.docID() < min) {
83+
min = lead.advance(min);
8484
}
8585

86-
if (min >= max) {
87-
return min >= maxDoc ? DocIdSetIterator.NO_MORE_DOCS : min;
88-
}
89-
90-
int windowMax = min;
91-
do {
86+
while (min < max) {
9287
if (scorable.minCompetitiveScore > scorable.score) {
9388
return DocIdSetIterator.NO_MORE_DOCS;
9489
}
90+
min = scoreWindow(collector, acceptDocs, iterators, min, max);
91+
}
9592

96-
int windowBase = lead == null ? windowMax : lead.docID();
97-
windowMax = (int) Math.min(max, (long) windowBase + WINDOW_SIZE);
98-
if (windowMax > windowBase) {
99-
scoreWindowUsingBitSet(collector, acceptDocs, iterators, windowBase, windowMax);
100-
}
101-
} while (windowMax < max);
102-
103-
if (lead != null) {
93+
if (lead.docID() > max) {
10494
return lead.docID();
105-
} else if (windowMax >= maxDoc) {
95+
} else if (max >= maxDoc) {
10696
return DocIdSetIterator.NO_MORE_DOCS;
10797
} else {
108-
return windowMax;
98+
return max;
10999
}
110100
}
111101

@@ -117,6 +107,65 @@ private static int advance(FixedBitSet set, int i) {
117107
}
118108
}
119109

110+
private int scoreWindow(
111+
LeafCollector collector, Bits acceptDocs, List<DocIdSetIterator> iterators, int min, int max)
112+
throws IOException {
113+
114+
// Advance all iterators to the first doc that is greater than or equal to min. This is
115+
// important as this is the only place where we can take advantage of a large gap between
116+
// consecutive matches in any clause.
117+
for (DocIdSetIterator iterator : iterators) {
118+
if (iterator.docID() >= min) {
119+
min = iterator.docID();
120+
} else {
121+
min = iterator.advance(min);
122+
}
123+
}
124+
if (min >= max) {
125+
return min;
126+
}
127+
128+
if (acceptDocs == null) {
129+
int minDocIDRunEnd = max;
130+
for (DocIdSetIterator iterator : iterators) {
131+
if (iterator.docID() > min) {
132+
minDocIDRunEnd = min;
133+
break;
134+
} else {
135+
minDocIDRunEnd = Math.min(minDocIDRunEnd, iterator.docIDRunEnd());
136+
}
137+
}
138+
139+
if (minDocIDRunEnd - min >= WINDOW_SIZE / 2) {
140+
// We have a large range of doc IDs that all match.
141+
rangeDocIdStream.from = min;
142+
rangeDocIdStream.to = minDocIDRunEnd;
143+
collector.collect(rangeDocIdStream);
144+
return minDocIDRunEnd;
145+
}
146+
}
147+
148+
int bitsetWindowMax = (int) Math.min(max, (long) min + WINDOW_SIZE);
149+
150+
for (DocIdSetIterator it : iterators) {
151+
if (it.docID() > min || it.docIDRunEnd() < bitsetWindowMax) {
152+
windowIterators.add(it);
153+
}
154+
}
155+
156+
if (acceptDocs == null && windowIterators.size() == 1) {
157+
// We have a range of doc IDs where all matches of an iterator are matches of the conjunction.
158+
singleIteratorDocIdStream.iterator = windowIterators.get(0);
159+
singleIteratorDocIdStream.from = min;
160+
singleIteratorDocIdStream.to = bitsetWindowMax;
161+
collector.collect(singleIteratorDocIdStream);
162+
} else {
163+
scoreWindowUsingBitSet(collector, acceptDocs, windowIterators, min, bitsetWindowMax);
164+
}
165+
windowIterators.clear();
166+
return bitsetWindowMax;
167+
}
168+
120169
private void scoreWindowUsingBitSet(
121170
LeafCollector collector,
122171
Bits acceptDocs,
@@ -128,26 +177,14 @@ private void scoreWindowUsingBitSet(
128177
assert windowMatches.scanIsEmpty();
129178
assert clauseWindowMatches.scanIsEmpty();
130179

131-
if (acceptDocs == null) {
132-
if (iterators.isEmpty()) {
133-
// All docs in the range match.
134-
rangeDocIdStream.from = windowBase;
135-
rangeDocIdStream.to = windowMax;
136-
collector.collect(rangeDocIdStream);
137-
return;
138-
} else if (iterators.size() == 1) {
139-
singleIteratorDocIdStream.iterator = iterators.get(0);
140-
singleIteratorDocIdStream.from = windowBase;
141-
singleIteratorDocIdStream.to = windowMax;
142-
collector.collect(singleIteratorDocIdStream);
143-
return;
144-
}
145-
}
146-
147180
if (iterators.isEmpty()) {
181+
// This happens if all clauses fully matched the window and there are deleted docs.
148182
windowMatches.set(0, windowMax - windowBase);
149183
} else {
150184
DocIdSetIterator lead = iterators.get(0);
185+
if (lead.docID() < windowBase) {
186+
lead.advance(windowBase);
187+
}
151188
lead.intoBitSet(windowMax, windowMatches, windowBase);
152189
}
153190

@@ -199,28 +236,11 @@ private void scoreWindowUsingBitSet(
199236
docIdStreamView.windowBase = windowBase;
200237
collector.collect(docIdStreamView);
201238
windowMatches.clear();
202-
203-
// If another clause is more advanced than the leading clause then advance the leading clause,
204-
// it's important to take advantage of large gaps in the postings lists of other clauses.
205-
if (iterators.size() >= 2) {
206-
DocIdSetIterator lead = iterators.get(0);
207-
int maxOtherDocID = -1;
208-
for (int i = 1; i < iterators.size(); ++i) {
209-
maxOtherDocID = Math.max(maxOtherDocID, iterators.get(i).docID());
210-
}
211-
if (lead.docID() < maxOtherDocID) {
212-
lead.advance(maxOtherDocID);
213-
}
214-
}
215239
}
216240

217241
@Override
218242
public long cost() {
219-
if (iterators.isEmpty()) {
220-
return maxDoc;
221-
} else {
222-
return iterators.get(0).cost();
223-
}
243+
return iterators.get(0).cost();
224244
}
225245

226246
final class DocIdStreamView extends DocIdStream {
@@ -287,6 +307,9 @@ public int count() throws IOException {
287307
// If the collector is just interested in the count, loading in a bit set and counting bits is
288308
// often faster than incrementing a counter on every call to nextDoc().
289309
assert windowMatches.scanIsEmpty();
310+
if (iterator.docID() < from) {
311+
iterator.advance(from);
312+
}
290313
iterator.intoBitSet(to, clauseWindowMatches, from);
291314
int count = clauseWindowMatches.cardinality();
292315
clauseWindowMatches.clear();

lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,11 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) {
9797
advance(upTo);
9898
}
9999
}
100+
101+
@Override
102+
public int docIDRunEnd() throws IOException {
103+
return maxDoc;
104+
}
100105
};
101106
}
102107

@@ -151,6 +156,11 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) {
151156
advance(upTo);
152157
}
153158
}
159+
160+
@Override
161+
public int docIDRunEnd() throws IOException {
162+
return maxDoc;
163+
}
154164
};
155165
}
156166

@@ -258,4 +268,24 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept
258268
bitSet.set(doc - offset);
259269
}
260270
}
271+
272+
/**
273+
* Returns the end of the run of consecutive doc IDs that match this {@link DocIdSetIterator} and
274+
* that contains the current {@link #docID()}, that is: one plus the last doc ID of the run.
275+
*
276+
* <ol>
277+
* <li>The returned doc is greater than {@link #docID()}.
278+
* <li>All docs in range {@code [docID(), docIDRunEnd())} match this iterator.
279+
* <li>The current position of this iterator is not affected by calling {@link #docIDRunEnd()}.
280+
* </ol>
281+
*
282+
* <p><b>Note</b>: It is illegal to call this method when the iterator is exhausted or not
283+
* positioned.
284+
*
285+
* <p>The default implementation assumes runs of a single doc ID and returns {@link #docID()} +
286+
* 1.
287+
*/
288+
public int docIDRunEnd() throws IOException {
289+
return docID() + 1;
290+
}
261291
}

lucene/core/src/java/org/apache/lucene/search/MatchAllDocsQuery.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ public String toString() {
3232

3333
@Override
3434
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
35-
return new MatchAllScorerSupplier(score(), scoreMode, context.reader().maxDoc());
35+
return ConstantScoreScorerSupplier.matchAll(score(), scoreMode, context.reader().maxDoc());
3636
}
3737

3838
@Override

lucene/core/src/java/org/apache/lucene/search/MatchAllScorerSupplier.java

Lines changed: 0 additions & 58 deletions
This file was deleted.

lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
341341

342342
if (allDocsMatch) {
343343
// all docs have a value and all points are within bounds, so everything matches
344-
return new MatchAllScorerSupplier(score(), scoreMode, reader.maxDoc());
344+
return ConstantScoreScorerSupplier.matchAll(score(), scoreMode, reader.maxDoc());
345345
} else {
346346
return new ConstantScoreScorerSupplier(score(), scoreMode, reader.maxDoc()) {
347347

0 commit comments

Comments
 (0)