Skip to content

Pack file pointers when merging BKD trees #14393

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits on Mar 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ Optimizations

* GITHUB#14373: Optimized `ParallelLeafReader` to improve term vector fetching efficiency. (Divyansh Agrawal)

* GITHUB#14393: Pack file pointers using the packed monotonic builder when building BKD trees. (Ignacio Vera)

Bug Fixes
---------------------

Expand Down
64 changes: 47 additions & 17 deletions lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.internal.hppc.LongArrayList;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
Expand All @@ -43,9 +42,12 @@
import org.apache.lucene.util.FixedLengthBytesRefArray;
import org.apache.lucene.util.IORunnable;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.bkd.BKDUtil.ByteArrayPredicate;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;

// TODO
// - allow variable length byte[] (across docs and dims), but this is quite a bit more hairy
Expand Down Expand Up @@ -582,8 +584,21 @@ private IORunnable writeFieldNDims(

scratchBytesRef1.length = config.bytesPerDim();
scratchBytesRef1.bytes = splitPackedValues;
final LongValues leafFPLongValues =
new LongValues() {
@Override
public long get(long index) {
return leafBlockFPs[(int) index];
}
};

return makeWriter(metaOut, indexOut, splitDimensionValues, leafBlockFPs, dataStartFP);
return makeWriter(
metaOut,
indexOut,
splitDimensionValues,
leafFPLongValues,
leafBlockFPs.length,
dataStartFP);
}

/* In the 1D case, we can simply sort points in ascending order and use the
Expand Down Expand Up @@ -678,7 +693,8 @@ private class OneDimensionBKDWriter {

final IndexOutput metaOut, indexOut, dataOut;
final long dataStartFP;
final LongArrayList leafBlockFPs = new LongArrayList();
private final PackedLongValues.Builder leafBlockFPs =
PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
final FixedLengthBytesRefArray leafBlockStartValues =
new FixedLengthBytesRefArray(config.packedIndexBytesLength());
final byte[] leafValues = new byte[config.maxPointsInLeafNode() * config.packedBytesLength()];
Expand Down Expand Up @@ -708,7 +724,6 @@ private class OneDimensionBKDWriter {
this.indexOut = indexOut;
this.dataOut = dataOut;
this.dataStartFP = dataOut.getFilePointer();

lastPackedValue = new byte[config.packedBytesLength()];
}

Expand Down Expand Up @@ -773,11 +788,12 @@ public IORunnable finish() throws IOException {
scratchBytesRef1.length = config.packedIndexBytesLength();
scratchBytesRef1.offset = 0;
assert leafBlockStartValues.size() + 1 == leafBlockFPs.size();
final LongValues leafFPLongValues = leafBlockFPs.build();
BKDTreeLeafNodes leafNodes =
new BKDTreeLeafNodes() {
@Override
public long getLeafLP(int index) {
return leafBlockFPs.get(index);
return leafFPLongValues.get(index);
}

@Override
Expand All @@ -792,7 +808,7 @@ public int getSplitDimension(int index) {

@Override
public int numLeaves() {
return leafBlockFPs.size();
return Math.toIntExact(leafBlockFPs.size());
}
};
return () -> {
Expand Down Expand Up @@ -823,7 +839,7 @@ private void writeLeafBlock(int leafCardinality) throws IOException {
leafBlockStartValues.append(scratchBytesRef1);
}
leafBlockFPs.add(dataOut.getFilePointer());
checkMaxLeafNodeCount(leafBlockFPs.size());
checkMaxLeafNodeCount(Math.toIntExact(leafBlockFPs.size()));

// Find per-dim common prefix:
commonPrefixLengths[0] =
Expand Down Expand Up @@ -955,7 +971,8 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput

// +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g.
// 7)
long[] leafBlockFPs = new long[numLeaves];
final PackedLongValues.Builder leafBlockFPs =
PackedLongValues.monotonicBuilder(PackedInts.COMPACT);

// Make sure the math above "worked":
assert pointCount / numLeaves <= config.maxPointsInLeafNode()
Expand Down Expand Up @@ -987,8 +1004,10 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput
splitPackedValues,
splitDimensionValues,
leafBlockFPs,
numLeaves,
new int[config.maxPointsInLeafNode()]);
assert Arrays.equals(parentSplits, new int[config.numIndexDims()]);
assert leafBlockFPs.size() == numLeaves;

// If no exception, we should have cleaned everything up:
assert tempDir.getCreatedFiles().isEmpty();
Expand All @@ -1002,22 +1021,30 @@ public IORunnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput
}
}

LongValues leafBlockLongValues = leafBlockFPs.build();
scratchBytesRef1.bytes = splitPackedValues;
scratchBytesRef1.length = config.bytesPerDim();
return makeWriter(metaOut, indexOut, splitDimensionValues, leafBlockFPs, dataStartFP);
return makeWriter(
metaOut,
indexOut,
splitDimensionValues,
leafBlockLongValues,
Math.toIntExact(leafBlockFPs.size()),
dataStartFP);
}

private IORunnable makeWriter(
IndexOutput metaOut,
IndexOutput indexOut,
byte[] splitDimensionValues,
long[] leafBlockFPs,
LongValues leafBlockFPs,
int numLeaves,
long dataStartFP) {
BKDTreeLeafNodes leafNodes =
new BKDTreeLeafNodes() {
@Override
public long getLeafLP(int index) {
return leafBlockFPs[index];
return leafBlockFPs.get(index);
}

@Override
Expand All @@ -1033,7 +1060,7 @@ public int getSplitDimension(int index) {

@Override
public int numLeaves() {
return leafBlockFPs.length;
return numLeaves;
}
};

Expand Down Expand Up @@ -1903,7 +1930,8 @@ private void build(
int[] parentSplits,
byte[] splitPackedValues,
byte[] splitDimensionValues,
long[] leafBlockFPs,
PackedLongValues.Builder leafBlockFPs,
int totalNumLeaves,
int[] spareDocIds)
throws IOException {

Expand Down Expand Up @@ -1961,7 +1989,7 @@ private void build(
int leafCardinality = heapSource.computeCardinality(from, to, commonPrefixLengths);

// Save the block file pointer:
leafBlockFPs[leavesOffset] = out.getFilePointer();
leafBlockFPs.add(out.getFilePointer());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, since file pointers are monotonic, we can make them compact. NICE!

// System.out.println(" write leaf block @ fp=" + out.getFilePointer());

// Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
Expand Down Expand Up @@ -2003,16 +2031,16 @@ assert valuesInOrderAndBounds(
// split dimensions. Because it is an expensive operation, the frequency we recompute the
// bounds is given
// by SPLITS_BEFORE_EXACT_BOUNDS.
if (numLeaves != leafBlockFPs.length
if (numLeaves != totalNumLeaves
&& config.numIndexDims() > 2
&& Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) {
computePackedValueBounds(points, minPackedValue, maxPackedValue);
}
splitDim = split(minPackedValue, maxPackedValue, parentSplits);
}

assert numLeaves <= leafBlockFPs.length
: "numLeaves=" + numLeaves + " leafBlockFPs.length=" + leafBlockFPs.length;
assert numLeaves <= totalNumLeaves
: "numLeaves=" + numLeaves + " totalNumLeaves=" + totalNumLeaves;

// How many leaves will be in the left tree:
final int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves);
Expand Down Expand Up @@ -2078,6 +2106,7 @@ assert valuesInOrderAndBounds(
splitPackedValues,
splitDimensionValues,
leafBlockFPs,
totalNumLeaves,
spareDocIds);

// Recurse on right tree:
Expand All @@ -2093,6 +2122,7 @@ assert valuesInOrderAndBounds(
splitPackedValues,
splitDimensionValues,
leafBlockFPs,
totalNumLeaves,
spareDocIds);

parentSplits[splitDim]--;
Expand Down