Skip to content

Make Lucene better at skipping long runs of matches. #14312

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ API Changes

* GITHUB#14237: Add utils for sandbox facets module for the most common tasks. (Egor Potemkin)

* GITHUB#14312: Added DocIdSetIterator#docIDRunEnd to let iterators tell when
they have runs of matches. This is then leveraged by conjunctions to
automatically ignore clauses that have long runs of matches over these runs,
resulting in better efficiency. (Adrien Grand)

New Features
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import org.apache.lucene.search.DocValuesRangeIterator;
import org.apache.lucene.search.FieldExistsQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllScorerSupplier;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
Expand Down Expand Up @@ -127,7 +126,7 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
&& skipper.minValue() >= lowerValue
&& skipper.maxValue() <= upperValue) {

return new MatchAllScorerSupplier(score(), scoreMode, maxDoc);
return ConstantScoreScorerSupplier.matchAll(score(), scoreMode, maxDoc);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
*/
public abstract class ConstantScoreScorerSupplier extends ScorerSupplier {

/**
 * Build a {@link ConstantScoreScorerSupplier} whose matches are every doc ID in [0, maxDoc).
 *
 * <p>This delegates to {@link #fromIterator} with an iterator obtained from {@link
 * DocIdSetIterator#all(int)}.
 */
public static ConstantScoreScorerSupplier matchAll(float score, ScoreMode scoreMode, int maxDoc) {
  final DocIdSetIterator allDocs = DocIdSetIterator.all(maxDoc);
  return fromIterator(allDocs, score, scoreMode, maxDoc);
}

/** Create a {@link ConstantScoreScorerSupplier} for the given iterator. */
public static ConstantScoreScorerSupplier fromIterator(
DocIdSetIterator iterator, float score, ScoreMode scoreMode, int maxDoc) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@
* BulkScorer implementation of {@link ConjunctionScorer} that is specialized for dense clauses.
* Whenever sensible, it intersects clauses by loading their matches into a bit set and computing
* the intersection of clauses by and-ing these bit sets.
*
* <p>An empty set of iterators is interpreted as meaning that all docs in [0, maxDoc) match.
*/
final class DenseConjunctionBulkScorer extends BulkScorer {

Expand All @@ -46,12 +44,16 @@ final class DenseConjunctionBulkScorer extends BulkScorer {

private final FixedBitSet windowMatches = new FixedBitSet(WINDOW_SIZE);
private final FixedBitSet clauseWindowMatches = new FixedBitSet(WINDOW_SIZE);
private final List<DocIdSetIterator> windowIterators = new ArrayList<>();
private final DocIdStreamView docIdStreamView = new DocIdStreamView();
private final RangeDocIdStream rangeDocIdStream = new RangeDocIdStream();
private final SingleIteratorDocIdStream singleIteratorDocIdStream =
new SingleIteratorDocIdStream();

DenseConjunctionBulkScorer(List<DocIdSetIterator> iterators, int maxDoc, float constantScore) {
if (iterators.isEmpty()) {
throw new IllegalArgumentException("Expected one or more iterators, got 0");
}
this.maxDoc = maxDoc;
iterators = new ArrayList<>(iterators);
iterators.sort(Comparator.comparingLong(DocIdSetIterator::cost));
Expand All @@ -63,6 +65,7 @@ final class DenseConjunctionBulkScorer extends BulkScorer {
@Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
collector.setScorer(scorable);

List<DocIdSetIterator> iterators = this.iterators;
if (collector.competitiveIterator() != null) {
iterators = new ArrayList<>(iterators);
Expand All @@ -75,37 +78,24 @@ public int score(LeafCollector collector, Bits acceptDocs, int min, int max) thr

max = Math.min(max, maxDoc);

DocIdSetIterator lead = null;
if (iterators.isEmpty() == false) {
lead = iterators.get(0);
if (lead.docID() < min) {
min = lead.advance(min);
}
DocIdSetIterator lead = iterators.get(0);
if (lead.docID() < min) {
min = lead.advance(min);
}

if (min >= max) {
return min >= maxDoc ? DocIdSetIterator.NO_MORE_DOCS : min;
}

int windowMax = min;
do {
while (min < max) {
if (scorable.minCompetitiveScore > scorable.score) {
return DocIdSetIterator.NO_MORE_DOCS;
}
min = scoreWindow(collector, acceptDocs, iterators, min, max);
}

int windowBase = lead == null ? windowMax : lead.docID();
windowMax = (int) Math.min(max, (long) windowBase + WINDOW_SIZE);
if (windowMax > windowBase) {
scoreWindowUsingBitSet(collector, acceptDocs, iterators, windowBase, windowMax);
}
} while (windowMax < max);

if (lead != null) {
if (lead.docID() > max) {
return lead.docID();
} else if (windowMax >= maxDoc) {
} else if (max >= maxDoc) {
return DocIdSetIterator.NO_MORE_DOCS;
} else {
return windowMax;
return max;
}
}

Expand All @@ -117,6 +107,65 @@ private static int advance(FixedBitSet set, int i) {
}
}

/**
 * Scores one window of doc IDs starting at or after {@code min} and strictly below {@code max}.
 *
 * <p>Depending on what the iterators report, the window is handled in one of three ways: as a
 * plain range of doc IDs when all iterators report a long enough common run of matches, as the
 * matches of a single iterator when every other iterator fully covers the window, or through
 * {@code scoreWindowUsingBitSet} otherwise. The first two shortcuts are only taken when {@code
 * acceptDocs} is {@code null}.
 *
 * @param collector the collector that receives the matching doc IDs
 * @param acceptDocs filter on doc IDs, or {@code null}; when non-null the range and
 *     single-iterator shortcuts are skipped
 * @param iterators the clauses of the conjunction
 * @param min lower bound (inclusive) of the doc IDs to score
 * @param max upper bound (exclusive) of the doc IDs to score
 * @return the doc ID up to which this call handled scoring; the caller passes it back as the next
 *     {@code min}. This may be greater than or equal to {@code max} when the iterators have no
 *     common position below {@code max}.
 */
private int scoreWindow(
LeafCollector collector, Bits acceptDocs, List<DocIdSetIterator> iterators, int min, int max)
throws IOException {

// Advance all iterators to the first doc that is greater than or equal to min. This is
// important as this is the only place where we can take advantage of a large gap between
// consecutive matches in any clause.
// Note that min is raised to each iterator's position in turn, so it ends up equal to the
// position of the last iterator; iterators visited earlier may still be positioned before it.
for (DocIdSetIterator iterator : iterators) {
if (iterator.docID() >= min) {
min = iterator.docID();
} else {
min = iterator.advance(min);
}
}
// Nothing left to score below max: report how far the iterators moved.
if (min >= max) {
return min;
}

if (acceptDocs == null) {
// Compute the end (exclusive) of the run of doc IDs starting at min that every iterator
// reports as matching, capped at max.
int minDocIDRunEnd = max;
for (DocIdSetIterator iterator : iterators) {
if (iterator.docID() > min) {
// This iterator does not match min, so there is no common run starting at min.
// NOTE(review): after the loop above every iterator appears to be positioned at or before
// min, so this branch looks purely defensive — confirm.
minDocIDRunEnd = min;
break;
} else {
minDocIDRunEnd = Math.min(minDocIDRunEnd, iterator.docIDRunEnd());
}
}

// Only take the range shortcut when the common run spans at least half a window.
if (minDocIDRunEnd - min >= WINDOW_SIZE / 2) {
// We have a large range of doc IDs that all match.
rangeDocIdStream.from = min;
rangeDocIdStream.to = minDocIDRunEnd;
collector.collect(rangeDocIdStream);
return minDocIDRunEnd;
}
}

// The addition is performed on a long so that min + WINDOW_SIZE cannot overflow an int.
int bitsetWindowMax = (int) Math.min(max, (long) min + WINDOW_SIZE);

// Keep only the iterators that may exclude some doc in [min, bitsetWindowMax): those positioned
// after min, or whose current run of matches ends before the end of the window. The remaining
// iterators report a run that extends to the end of the window and are left out.
for (DocIdSetIterator it : iterators) {
if (it.docID() > min || it.docIDRunEnd() < bitsetWindowMax) {
windowIterators.add(it);
}
}

if (acceptDocs == null && windowIterators.size() == 1) {
// We have a range of doc IDs where all matches of an iterator are matches of the conjunction.
singleIteratorDocIdStream.iterator = windowIterators.get(0);
singleIteratorDocIdStream.from = min;
singleIteratorDocIdStream.to = bitsetWindowMax;
collector.collect(singleIteratorDocIdStream);
} else {
scoreWindowUsingBitSet(collector, acceptDocs, windowIterators, min, bitsetWindowMax);
}
// windowIterators is a reused scratch list; leave it empty for the next call.
windowIterators.clear();
return bitsetWindowMax;
}

private void scoreWindowUsingBitSet(
LeafCollector collector,
Bits acceptDocs,
Expand All @@ -128,26 +177,14 @@ private void scoreWindowUsingBitSet(
assert windowMatches.scanIsEmpty();
assert clauseWindowMatches.scanIsEmpty();

if (acceptDocs == null) {
if (iterators.isEmpty()) {
// All docs in the range match.
rangeDocIdStream.from = windowBase;
rangeDocIdStream.to = windowMax;
collector.collect(rangeDocIdStream);
return;
} else if (iterators.size() == 1) {
singleIteratorDocIdStream.iterator = iterators.get(0);
singleIteratorDocIdStream.from = windowBase;
singleIteratorDocIdStream.to = windowMax;
collector.collect(singleIteratorDocIdStream);
return;
}
}

if (iterators.isEmpty()) {
// This happens if all clauses fully matched the window and there are deleted docs.
windowMatches.set(0, windowMax - windowBase);
} else {
DocIdSetIterator lead = iterators.get(0);
if (lead.docID() < windowBase) {
lead.advance(windowBase);
}
lead.intoBitSet(windowMax, windowMatches, windowBase);
}

Expand Down Expand Up @@ -199,28 +236,11 @@ private void scoreWindowUsingBitSet(
docIdStreamView.windowBase = windowBase;
collector.collect(docIdStreamView);
windowMatches.clear();

// If another clause is more advanced than the leading clause then advance the leading clause,
// it's important to take advantage of large gaps in the postings lists of other clauses.
if (iterators.size() >= 2) {
DocIdSetIterator lead = iterators.get(0);
int maxOtherDocID = -1;
for (int i = 1; i < iterators.size(); ++i) {
maxOtherDocID = Math.max(maxOtherDocID, iterators.get(i).docID());
}
if (lead.docID() < maxOtherDocID) {
lead.advance(maxOtherDocID);
}
}
}

@Override
public long cost() {
if (iterators.isEmpty()) {
return maxDoc;
} else {
return iterators.get(0).cost();
}
return iterators.get(0).cost();
}

final class DocIdStreamView extends DocIdStream {
Expand Down Expand Up @@ -287,6 +307,9 @@ public int count() throws IOException {
// If the collector is just interested in the count, loading in a bit set and counting bits is
// often faster than incrementing a counter on every call to nextDoc().
assert windowMatches.scanIsEmpty();
if (iterator.docID() < from) {
iterator.advance(from);
}
iterator.intoBitSet(to, clauseWindowMatches, from);
int count = clauseWindowMatches.cardinality();
clauseWindowMatches.clear();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) {
advance(upTo);
}
}

// Reports maxDoc as the exclusive end of the run of matches that contains the current doc.
// NOTE(review): presumably every doc from the current position up to maxDoc matches this
// iterator — confirm against the enclosing iterator definition, which is not fully visible here.
@Override
public int docIDRunEnd() throws IOException {
return maxDoc;
}
};
}

Expand Down Expand Up @@ -151,6 +156,11 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) {
advance(upTo);
}
}

// Reports maxDoc as the exclusive end of the run of matches that contains the current doc.
// NOTE(review): presumably every doc from the current position up to maxDoc matches this
// iterator — confirm against the enclosing iterator definition, which is not fully visible here.
@Override
public int docIDRunEnd() throws IOException {
return maxDoc;
}
};
}

Expand Down Expand Up @@ -258,4 +268,24 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept
bitSet.set(doc - offset);
}
}

/**
* Returns the end of the run of consecutive doc IDs that match this {@link DocIdSetIterator} and
* that contains the current {@link #docID()}, that is: one plus the last doc ID of the run.
*
* <ol>
* <li>The returned doc is greater than {@link #docID()}.
* <li>All docs in range {@code [docID(), docIDRunEnd())} match this iterator.
* <li>The current position of this iterator is not affected by calling {@link #docIDRunEnd()}.
* </ol>
*
* <p><b>Note</b>: It is illegal to call this method when the iterator is exhausted or not
* positioned.
*
* <p>The default implementation assumes runs of a single doc ID and returns {@code docID() + 1}.
*
* @return the exclusive end of the run of matching doc IDs that contains the current doc
* @throws IOException declared so that overriding implementations may throw it; the default
*     implementation only calls {@link #docID()}
*/
public int docIDRunEnd() throws IOException {
return docID() + 1;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public String toString() {

@Override
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
return new MatchAllScorerSupplier(score(), scoreMode, context.reader().maxDoc());
return ConstantScoreScorerSupplier.matchAll(score(), scoreMode, context.reader().maxDoc());
}

@Override
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti

if (allDocsMatch) {
// all docs have a value and all points are within bounds, so everything matches
return new MatchAllScorerSupplier(score(), scoreMode, reader.maxDoc());
return ConstantScoreScorerSupplier.matchAll(score(), scoreMode, reader.maxDoc());
} else {
return new ConstantScoreScorerSupplier(score(), scoreMode, reader.maxDoc()) {

Expand Down
Loading