Skip to content

Commit e74f19b

Browse files
authored
Let DocIdSetIterator optimize loading into a FixedBitSet. (#14069)
This is an iteration on #14064. The benefits of this approach are that the API is a bit nicer and that it allows optimizing more than just the case where doc IDs are stored in an int[]. The downside is that it only helps non-scoring disjunctions for now, but we can look into scoring disjunctions later on.
1 parent 5f0fa2b commit e74f19b

File tree

7 files changed

+175
-24
lines changed

7 files changed

+175
-24
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ Other
3535

3636
API Changes
3737
---------------------
38-
(No changes)
38+
* GITHUB#14069: Added DocIdSetIterator#intoBitSet API to let implementations
39+
optimize loading doc IDs into a bit set. (Adrien Grand)
3940

4041
New Features
4142
---------------------

lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,9 @@
5353
import org.apache.lucene.store.ReadAdvice;
5454
import org.apache.lucene.util.ArrayUtil;
5555
import org.apache.lucene.util.BitUtil;
56+
import org.apache.lucene.util.Bits;
5657
import org.apache.lucene.util.BytesRef;
58+
import org.apache.lucene.util.FixedBitSet;
5759
import org.apache.lucene.util.IOUtils;
5860
import org.apache.lucene.util.VectorUtil;
5961

@@ -875,6 +877,63 @@ public int advance(int target) throws IOException {
875877
return doc;
876878
}
877879

880+
/**
 * Bulk variant of the default doc-by-doc loop: sets bit {@code doc - offset} in {@code bitSet}
 * for the current doc and for the following docs below {@code upTo}, skipping docs rejected by
 * {@code acceptDocs}. Docs are consumed one buffered block at a time rather than through one
 * {@code nextDoc()} call per document.
 *
 * @param acceptDocs filter applied to each doc ID; {@code null} accepts every doc
 * @param upTo exclusive upper bound on the doc IDs to load
 * @param bitSet destination bit set; bits are only ever set, never cleared
 * @param offset value subtracted from each doc ID to obtain the bit index
 */
@Override
881+
public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset)
882+
throws IOException {
883+
// Nothing to load when the current doc is already at or beyond the bound.
if (doc >= upTo) {
884+
return;
885+
}
886+
887+
// Handle the current doc separately, since it may be on the previous docBuffer.
888+
if (acceptDocs == null || acceptDocs.get(doc)) {
889+
bitSet.set(doc - offset);
890+
}
891+
892+
// Each iteration drains the part of the current buffer that is below upTo.
for (; ; ) {
893+
if (docBufferUpto == BLOCK_SIZE) {
894+
// refill
895+
moveToNextLevel0Block();
896+
}
897+
898+
int start = docBufferUpto;
899+
int end = computeBufferEndBoundary(upTo);
900+
// With end == 0 there is nothing to copy and no docBuffer[end - 1] to read.
if (end != 0) {
901+
bufferIntoBitSet(start, end, acceptDocs, bitSet, offset);
902+
// The last buffered doc that was considered becomes the current doc.
doc = docBuffer[end - 1];
903+
}
904+
docBufferUpto = end;
905+
906+
if (end != BLOCK_SIZE) {
907+
// Either the block is a tail block or it did not fully match: we're done.
908+
// Step to the following doc; the assertion states it is expected to be at or beyond upTo.
nextDoc();
909+
assert doc >= upTo;
910+
break;
911+
}
912+
}
913+
}
914+
915+
/**
 * Returns the exclusive end index, within {@code docBuffer}, of the entries whose doc ID is
 * below {@code upTo}.
 *
 * @param upTo exclusive upper bound on doc IDs
 * @return {@code docBufferSize} when the buffer is non-empty and its last entry is below {@code
 *     upTo}; otherwise the result of {@code VectorUtil.findNextGEQ} searching between {@code
 *     docBufferUpto} and {@code docBufferSize}
 */
private int computeBufferEndBoundary(int upTo) {
916+
// Fast path: checking only the last entry avoids a search through the buffer.
if (docBufferSize != 0 && docBuffer[docBufferSize - 1] < upTo) {
917+
// All docs in the buffer are under upTo
918+
return docBufferSize;
919+
} else {
920+
// Find the index of the first doc that is greater than or equal to upTo
921+
return VectorUtil.findNextGEQ(docBuffer, upTo, docBufferUpto, docBufferSize);
922+
}
923+
}
924+
925+
/**
 * Sets bit {@code docBuffer[i] - offset} in {@code bitSet} for every index {@code i} in {@code
 * [start, end)} whose doc ID passes {@code acceptDocs}.
 *
 * @param start first buffer index to read (inclusive)
 * @param end last buffer index to read (exclusive)
 * @param acceptDocs filter applied to each doc ID; {@code null} accepts every doc
 * @param bitSet destination bit set
 * @param offset value subtracted from each doc ID to obtain the bit index
 */
private void bufferIntoBitSet(
926+
int start, int end, Bits acceptDocs, FixedBitSet bitSet, int offset) throws IOException {
927+
// acceptDocs#get (if backed by FixedBitSet), bitSet#set and `doc - offset` get
928+
// auto-vectorized
929+
for (int i = start; i < end; ++i) {
930+
// Local variable, distinct from the iterator's current-doc field assigned in intoBitSet.
int doc = docBuffer[i];
931+
if (acceptDocs == null || acceptDocs.get(doc)) {
932+
bitSet.set(doc - offset);
933+
}
934+
}
935+
}
936+
878937
private void skipPositions(int freq) throws IOException {
879938
// Skip positions now:
880939
int toSkip = posPendingCount - freq;

lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@
1717
package org.apache.lucene.search;
1818

1919
import java.io.IOException;
20-
import java.util.Arrays;
2120
import java.util.Collection;
2221
import java.util.Objects;
2322
import org.apache.lucene.internal.hppc.LongArrayList;
2423
import org.apache.lucene.util.Bits;
24+
import org.apache.lucene.util.FixedBitSet;
2525
import org.apache.lucene.util.PriorityQueue;
2626

2727
/**
@@ -34,8 +34,6 @@ final class BooleanScorer extends BulkScorer {
3434
static final int SHIFT = 12;
3535
static final int SIZE = 1 << SHIFT;
3636
static final int MASK = SIZE - 1;
37-
static final int SET_SIZE = 1 << (SHIFT - 6);
38-
static final int SET_MASK = SET_SIZE - 1;
3937

4038
static class Bucket {
4139
double score;
@@ -74,8 +72,7 @@ public DisiWrapper get(int i) {
7472
// One bucket per doc ID in the window, non-null if scores are needed or if frequencies need to be
7573
// counted
7674
final Bucket[] buckets;
77-
// This is basically an inlined FixedBitSet... seems to help with bound checks
78-
final long[] matching = new long[SET_SIZE];
75+
final FixedBitSet matching = new FixedBitSet(SIZE);
7976

8077
final DisiWrapper[] leads;
8178
final HeadPriorityQueue head;
@@ -91,11 +88,12 @@ final class DocIdStreamView extends DocIdStream {
9188

9289
@Override
9390
public void forEach(CheckedIntConsumer<IOException> consumer) throws IOException {
94-
long[] matching = BooleanScorer.this.matching;
91+
FixedBitSet matching = BooleanScorer.this.matching;
9592
Bucket[] buckets = BooleanScorer.this.buckets;
9693
int base = this.base;
97-
for (int idx = 0; idx < matching.length; idx++) {
98-
long bits = matching[idx];
94+
long[] bitArray = matching.getBits();
95+
for (int idx = 0; idx < bitArray.length; idx++) {
96+
long bits = bitArray[idx];
9997
while (bits != 0L) {
10098
int ntz = Long.numberOfTrailingZeros(bits);
10199
if (buckets != null) {
@@ -121,11 +119,7 @@ public int count() throws IOException {
121119
// We can't just count bits in that case
122120
return super.count();
123121
}
124-
int count = 0;
125-
for (long l : matching) {
126-
count += Long.bitCount(l);
127-
}
128-
return count;
122+
return matching.cardinality();
129123
}
130124
}
131125

@@ -173,7 +167,7 @@ public long cost() {
173167
private void scoreDisiWrapperIntoBitSet(DisiWrapper w, Bits acceptDocs, int min, int max)
174168
throws IOException {
175169
boolean needsScores = BooleanScorer.this.needsScores;
176-
long[] matching = BooleanScorer.this.matching;
170+
FixedBitSet matching = BooleanScorer.this.matching;
177171
Bucket[] buckets = BooleanScorer.this.buckets;
178172

179173
DocIdSetIterator it = w.iterator;
@@ -182,12 +176,13 @@ private void scoreDisiWrapperIntoBitSet(DisiWrapper w, Bits acceptDocs, int min,
182176
if (doc < min) {
183177
doc = it.advance(min);
184178
}
185-
for (; doc < max; doc = it.nextDoc()) {
186-
if (acceptDocs == null || acceptDocs.get(doc)) {
187-
final int i = doc & MASK;
188-
final int idx = i >> 6;
189-
matching[idx] |= 1L << i;
190-
if (buckets != null) {
179+
if (buckets == null) {
180+
it.intoBitSet(acceptDocs, max, matching, doc & ~MASK);
181+
} else {
182+
for (; doc < max; doc = it.nextDoc()) {
183+
if (acceptDocs == null || acceptDocs.get(doc)) {
184+
final int i = doc & MASK;
185+
matching.set(i);
191186
final Bucket bucket = buckets[i];
192187
bucket.freq++;
193188
if (needsScores) {
@@ -197,7 +192,7 @@ private void scoreDisiWrapperIntoBitSet(DisiWrapper w, Bits acceptDocs, int min,
197192
}
198193
}
199194

200-
w.doc = doc;
195+
w.doc = it.docID();
201196
}
202197

203198
private void scoreWindowIntoBitSetAndReplay(
@@ -218,7 +213,7 @@ private void scoreWindowIntoBitSetAndReplay(
218213
docIdStreamView.base = base;
219214
collector.collect(docIdStreamView);
220215

221-
Arrays.fill(matching, 0L);
216+
matching.clear();
222217
}
223218

224219
private DisiWrapper advance(int min) throws IOException {

lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
package org.apache.lucene.search;
1818

1919
import java.io.IOException;
20+
import org.apache.lucene.util.Bits;
21+
import org.apache.lucene.util.FixedBitSet;
2022

2123
/**
2224
* This abstract class defines methods to iterate over a set of non-decreasing doc ids. Note that
@@ -211,4 +213,33 @@ protected final int slowAdvance(int target) throws IOException {
211213
* may be a rough heuristic, hardcoded value, or otherwise completely inaccurate.
212214
*/
213215
public abstract long cost();
216+
217+
/**
218+
* Load doc IDs into a {@link FixedBitSet}. This should behave exactly as if implemented as below,
219+
* which is the default implementation:
220+
*
221+
* <pre class="prettyprint">
222+
* for (int doc = docID(); doc &lt; upTo; doc = nextDoc()) {
223+
* if (acceptDocs == null || acceptDocs.get(doc)) {
224+
* bitSet.set(doc - offset);
225+
* }
226+
* }
227+
* </pre>
228+
*
229+
* <p><b>Note</b>: {@code offset} must be less than or equal to the {@link #docID() current doc
230+
* ID}.
231+
*
232+
* <p><b>Note</b>: It is important not to clear bits from {@code bitSet} that may be already set.
233+
*
* @param acceptDocs filter applied to each doc ID; {@code null} accepts every doc
* @param upTo exclusive upper bound: iteration stops at the first doc ID that is greater than or
* equal to this value
* @param bitSet destination bit set; bit {@code doc - offset} is set for each accepted doc
* @param offset value subtracted from each doc ID to obtain the bit index
* @throws IOException if advancing the iterator fails
*
234+
* @lucene.internal
235+
*/
236+
public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset)
237+
throws IOException {
238+
// Checks the documented precondition; a larger offset would produce a negative bit index.
assert offset <= docID();
239+
// Starts from the current doc (docID()), not from the next one.
for (int doc = docID(); doc < upTo; doc = nextDoc()) {
240+
if (acceptDocs == null || acceptDocs.get(doc)) {
241+
bitSet.set(doc - offset);
242+
}
243+
}
244+
}
214245
}

lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,9 @@ public void or(DocIdSetIterator iter) throws IOException {
343343
DocBaseBitSetIterator baseIter = (DocBaseBitSetIterator) iter;
344344
or(baseIter.getDocBase() >> 6, baseIter.getBitSet());
345345
} else {
346-
super.or(iter);
346+
checkUnpositioned(iter);
347+
iter.nextDoc();
348+
iter.intoBitSet(null, DocIdSetIterator.NO_MORE_DOCS, this, 0);
347349
}
348350
}
349351

lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@
7575
import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
7676
import org.apache.lucene.tests.util.automaton.AutomatonTestUtil.RandomAcceptedStrings;
7777
import org.apache.lucene.util.BytesRef;
78+
import org.apache.lucene.util.FixedBitSet;
7879
import org.apache.lucene.util.IOUtils;
7980
import org.apache.lucene.util.StringHelper;
8081
import org.apache.lucene.util.UnicodeUtil;
@@ -110,6 +111,9 @@ public enum Option {
110111
// Sometimes don't fully consume positions at each doc
111112
PARTIAL_POS_CONSUME,
112113

114+
// Check DocIdSetIterator#intoBitSet
115+
INTO_BIT_SET,
116+
113117
// Sometimes check payloads
114118
PAYLOADS,
115119

@@ -1364,6 +1368,54 @@ private void verifyEnum(
13641368
idx <= impactsCopy.size() && impactsCopy.get(idx).norm <= norm);
13651369
}
13661370
}
1371+
1372+
if (options.contains(Option.INTO_BIT_SET)) {
1373+
int flags = PostingsEnum.FREQS;
1374+
if (doCheckPositions) {
1375+
flags |= PostingsEnum.POSITIONS;
1376+
if (doCheckOffsets) {
1377+
flags |= PostingsEnum.OFFSETS;
1378+
}
1379+
if (doCheckPayloads) {
1380+
flags |= PostingsEnum.PAYLOADS;
1381+
}
1382+
}
1383+
PostingsEnum pe1 = termsEnum.postings(null, flags);
1384+
if (random.nextBoolean()) {
1385+
pe1.advance(maxDoc / 2);
1386+
pe1 = termsEnum.postings(pe1, flags);
1387+
}
1388+
PostingsEnum pe2 = termsEnum.postings(null, flags);
1389+
FixedBitSet set1 = new FixedBitSet(1024);
1390+
FixedBitSet set2 = new FixedBitSet(1024);
1391+
FixedBitSet acceptDocs = new FixedBitSet(maxDoc);
1392+
for (int i = 0; i < maxDoc; i += 2) {
1393+
acceptDocs.set(i);
1394+
}
1395+
1396+
while (true) {
1397+
pe1.nextDoc();
1398+
pe2.nextDoc();
1399+
1400+
int offset =
1401+
TestUtil.nextInt(random, Math.max(0, pe1.docID() - set1.length()), pe1.docID());
1402+
int upTo = offset + random.nextInt(set1.length());
1403+
pe1.intoBitSet(acceptDocs, upTo, set1, offset);
1404+
for (int d = pe2.docID(); d < upTo; d = pe2.nextDoc()) {
1405+
if (acceptDocs.get(d)) {
1406+
set2.set(d - offset);
1407+
}
1408+
}
1409+
1410+
assertEquals(set1, set2);
1411+
assertEquals(pe1.docID(), pe2.docID());
1412+
if (pe1.docID() == DocIdSetIterator.NO_MORE_DOCS) {
1413+
break;
1414+
}
1415+
set1.clear();
1416+
set2.clear();
1417+
}
1418+
}
13671419
}
13681420

13691421
private static class TestThread extends Thread {

lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
import org.apache.lucene.search.ScoreMode;
2525
import org.apache.lucene.search.Scorer;
2626
import org.apache.lucene.search.TwoPhaseIterator;
27+
import org.apache.lucene.util.Bits;
28+
import org.apache.lucene.util.FixedBitSet;
2729

2830
/** Wraps a Scorer with additional checks */
2931
public class AssertingScorer extends Scorer {
@@ -192,6 +194,15 @@ public int advance(int target) throws IOException {
192194
public long cost() {
193195
return in.cost();
194196
}
197+
198+
/**
 * Delegates to the wrapped iterator's {@code intoBitSet}, with assertions before and after the
 * call.
 */
@Override
199+
public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset)
200+
throws IOException {
201+
// Precondition: the doc ID must not be -1 (presumably the "not yet positioned" value).
assert docID() != -1;
202+
// Precondition: the offset may not exceed the current doc ID.
assert offset <= docID();
203+
in.intoBitSet(acceptDocs, upTo, bitSet, offset);
204+
// Postcondition: the delegate must leave the iterator on a doc at or beyond upTo.
assert docID() >= upTo;
205+
}
195206
};
196207
}
197208

0 commit comments

Comments
 (0)