Skip to content

Commit ffbbd7c

Browse files
committed
Pack file pointers when building BKD trees (#14393)
1 parent baf3148 commit ffbbd7c

File tree

2 files changed

+49
-17
lines changed

2 files changed

+49
-17
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,8 @@ Optimizations
155155

156156
* GITHUB#14373: Optimized `ParallelLeafReader` to improve term vector fetching efficiency. (Divyansh Agrawal)
157157

158+
* GITHUB#14393: Pack file pointers using the packed monotonic builder when building BKD trees. (Ignacio Vera)
159+
158160
Bug Fixes
159161
---------------------
160162

lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
import org.apache.lucene.index.PointValues;
2929
import org.apache.lucene.index.PointValues.IntersectVisitor;
3030
import org.apache.lucene.index.PointValues.Relation;
31-
import org.apache.lucene.internal.hppc.LongArrayList;
3231
import org.apache.lucene.store.ByteBuffersDataOutput;
3332
import org.apache.lucene.store.ChecksumIndexInput;
3433
import org.apache.lucene.store.DataOutput;
@@ -43,9 +42,12 @@
4342
import org.apache.lucene.util.FixedLengthBytesRefArray;
4443
import org.apache.lucene.util.IORunnable;
4544
import org.apache.lucene.util.IOUtils;
45+
import org.apache.lucene.util.LongValues;
4646
import org.apache.lucene.util.NumericUtils;
4747
import org.apache.lucene.util.PriorityQueue;
4848
import org.apache.lucene.util.bkd.BKDUtil.ByteArrayPredicate;
49+
import org.apache.lucene.util.packed.PackedInts;
50+
import org.apache.lucene.util.packed.PackedLongValues;
4951

5052
// TODO
5153
// - allow variable length byte[] (across docs and dims), but this is quite a bit more hairy
@@ -582,8 +584,21 @@ private IORunnable writeFieldNDims(
582584

583585
scratchBytesRef1.length = config.bytesPerDim();
584586
scratchBytesRef1.bytes = splitPackedValues;
587+
final LongValues leafFPLongValues =
588+
new LongValues() {
589+
@Override
590+
public long get(long index) {
591+
return leafBlockFPs[(int) index];
592+
}
593+
};
585594

586-
return makeWriter(metaOut, indexOut, splitDimensionValues, leafBlockFPs, dataStartFP);
595+
return makeWriter(
596+
metaOut,
597+
indexOut,
598+
splitDimensionValues,
599+
leafFPLongValues,
600+
leafBlockFPs.length,
601+
dataStartFP);
587602
}
588603

589604
/* In the 1D case, we can simply sort points in ascending order and use the
@@ -678,7 +693,8 @@ private class OneDimensionBKDWriter {
678693

679694
final IndexOutput metaOut, indexOut, dataOut;
680695
final long dataStartFP;
681-
final LongArrayList leafBlockFPs = new LongArrayList();
696+
private final PackedLongValues.Builder leafBlockFPs =
697+
PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
682698
final FixedLengthBytesRefArray leafBlockStartValues =
683699
new FixedLengthBytesRefArray(config.packedIndexBytesLength());
684700
final byte[] leafValues = new byte[config.maxPointsInLeafNode() * config.packedBytesLength()];
@@ -708,7 +724,6 @@ private class OneDimensionBKDWriter {
708724
this.indexOut = indexOut;
709725
this.dataOut = dataOut;
710726
this.dataStartFP = dataOut.getFilePointer();
711-
712727
lastPackedValue = new byte[config.packedBytesLength()];
713728
}
714729

@@ -773,11 +788,12 @@ public IORunnable finish() throws IOException {
773788
scratchBytesRef1.length = config.packedIndexBytesLength();
774789
scratchBytesRef1.offset = 0;
775790
assert leafBlockStartValues.size() + 1 == leafBlockFPs.size();
791+
final LongValues leafFPLongValues = leafBlockFPs.build();
776792
BKDTreeLeafNodes leafNodes =
777793
new BKDTreeLeafNodes() {
778794
@Override
779795
public long getLeafLP(int index) {
780-
return leafBlockFPs.get(index);
796+
return leafFPLongValues.get(index);
781797
}
782798

783799
@Override
@@ -792,7 +808,7 @@ public int getSplitDimension(int index) {
792808

793809
@Override
794810
public int numLeaves() {
795-
return leafBlockFPs.size();
811+
return Math.toIntExact(leafBlockFPs.size());
796812
}
797813
};
798814
return () -> {
@@ -823,7 +839,7 @@ private void writeLeafBlock(int leafCardinality) throws IOException {
823839
leafBlockStartValues.append(scratchBytesRef1);
824840
}
825841
leafBlockFPs.add(dataOut.getFilePointer());
826-
checkMaxLeafNodeCount(leafBlockFPs.size());
842+
checkMaxLeafNodeCount(Math.toIntExact(leafBlockFPs.size()));
827843

828844
// Find per-dim common prefix:
829845
commonPrefixLengths[0] =
@@ -955,7 +971,8 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput
955971

956972
// +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g.
957973
// 7)
958-
long[] leafBlockFPs = new long[numLeaves];
974+
final PackedLongValues.Builder leafBlockFPs =
975+
PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
959976

960977
// Make sure the math above "worked":
961978
assert pointCount / numLeaves <= config.maxPointsInLeafNode()
@@ -987,8 +1004,10 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput
9871004
splitPackedValues,
9881005
splitDimensionValues,
9891006
leafBlockFPs,
1007+
numLeaves,
9901008
new int[config.maxPointsInLeafNode()]);
9911009
assert Arrays.equals(parentSplits, new int[config.numIndexDims()]);
1010+
assert leafBlockFPs.size() == numLeaves;
9921011

9931012
// If no exception, we should have cleaned everything up:
9941013
assert tempDir.getCreatedFiles().isEmpty();
@@ -1002,22 +1021,30 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput
10021021
}
10031022
}
10041023

1024+
LongValues leafBlockLongValues = leafBlockFPs.build();
10051025
scratchBytesRef1.bytes = splitPackedValues;
10061026
scratchBytesRef1.length = config.bytesPerDim();
1007-
return makeWriter(metaOut, indexOut, splitDimensionValues, leafBlockFPs, dataStartFP);
1027+
return makeWriter(
1028+
metaOut,
1029+
indexOut,
1030+
splitDimensionValues,
1031+
leafBlockLongValues,
1032+
Math.toIntExact(leafBlockFPs.size()),
1033+
dataStartFP);
10081034
}
10091035

10101036
private IORunnable makeWriter(
10111037
IndexOutput metaOut,
10121038
IndexOutput indexOut,
10131039
byte[] splitDimensionValues,
1014-
long[] leafBlockFPs,
1040+
LongValues leafBlockFPs,
1041+
int numLeaves,
10151042
long dataStartFP) {
10161043
BKDTreeLeafNodes leafNodes =
10171044
new BKDTreeLeafNodes() {
10181045
@Override
10191046
public long getLeafLP(int index) {
1020-
return leafBlockFPs[index];
1047+
return leafBlockFPs.get(index);
10211048
}
10221049

10231050
@Override
@@ -1033,7 +1060,7 @@ public int getSplitDimension(int index) {
10331060

10341061
@Override
10351062
public int numLeaves() {
1036-
return leafBlockFPs.length;
1063+
return numLeaves;
10371064
}
10381065
};
10391066

@@ -1903,7 +1930,8 @@ private void build(
19031930
int[] parentSplits,
19041931
byte[] splitPackedValues,
19051932
byte[] splitDimensionValues,
1906-
long[] leafBlockFPs,
1933+
PackedLongValues.Builder leafBlockFPs,
1934+
int totalNumLeaves,
19071935
int[] spareDocIds)
19081936
throws IOException {
19091937

@@ -1961,7 +1989,7 @@ private void build(
19611989
int leafCardinality = heapSource.computeCardinality(from, to, commonPrefixLengths);
19621990

19631991
// Save the block file pointer:
1964-
leafBlockFPs[leavesOffset] = out.getFilePointer();
1992+
leafBlockFPs.add(out.getFilePointer());
19651993
// System.out.println(" write leaf block @ fp=" + out.getFilePointer());
19661994

19671995
// Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
@@ -2003,16 +2031,16 @@ assert valuesInOrderAndBounds(
20032031
// split dimensions. Because it is an expensive operation, the frequency we recompute the
20042032
// bounds is given
20052033
// by SPLITS_BEFORE_EXACT_BOUNDS.
2006-
if (numLeaves != leafBlockFPs.length
2034+
if (numLeaves != totalNumLeaves
20072035
&& config.numIndexDims() > 2
20082036
&& Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) {
20092037
computePackedValueBounds(points, minPackedValue, maxPackedValue);
20102038
}
20112039
splitDim = split(minPackedValue, maxPackedValue, parentSplits);
20122040
}
20132041

2014-
assert numLeaves <= leafBlockFPs.length
2015-
: "numLeaves=" + numLeaves + " leafBlockFPs.length=" + leafBlockFPs.length;
2042+
assert numLeaves <= totalNumLeaves
2043+
: "numLeaves=" + numLeaves + " totalNumLeaves=" + totalNumLeaves;
20162044

20172045
// How many leaves will be in the left tree:
20182046
final int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves);
@@ -2078,6 +2106,7 @@ assert valuesInOrderAndBounds(
20782106
splitPackedValues,
20792107
splitDimensionValues,
20802108
leafBlockFPs,
2109+
totalNumLeaves,
20812110
spareDocIds);
20822111

20832112
// Recurse on right tree:
@@ -2093,6 +2122,7 @@ assert valuesInOrderAndBounds(
20932122
splitPackedValues,
20942123
splitDimensionValues,
20952124
leafBlockFPs,
2125+
totalNumLeaves,
20962126
spareDocIds);
20972127

20982128
parentSplits[splitDim]--;

0 commit comments

Comments
 (0)