Skip to content

Commit 36b3577

Browse files
authored
Fix FuzzySet#getEstimatedNumberUniqueValuesAllowingForCollisions to properly account for hashCount (#14614)
1 parent 92e0eb8 commit 36b3577

File tree

1 file changed

+17
-4
lines changed
  • lucene/codecs/src/java/org/apache/lucene/codecs/bloom

1 file changed

+17
-4
lines changed

lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java

+17-4
Original file line numberDiff line numberDiff line change
@@ -268,18 +268,31 @@ public FuzzySet downsize(float targetMaxSaturation) {
268268
}
269269

270270
public int getEstimatedUniqueValues() {
271-
return getEstimatedNumberUniqueValuesAllowingForCollisions(bloomSize, filter.cardinality());
271+
return getEstimatedNumberUniqueValuesAllowingForCollisions(
272+
bloomSize, filter.cardinality(), hashCount);
272273
}
273274

274-
// Given a set size and a the number of set bits, produces an estimate of the number of unique
275-
// values recorded
275+
/**
276+
* Given a set size and the number of set bits, produces an estimate of the number of unique
277+
* values recorded (assuming a single hash function is used)
278+
*/
276279
public static int getEstimatedNumberUniqueValuesAllowingForCollisions(
277280
int setSize, int numRecordedBits) {
281+
return getEstimatedNumberUniqueValuesAllowingForCollisions(setSize, numRecordedBits, 1);
282+
}
283+
284+
/**
285+
* Given a set size, the number of set bits and hash function count, produces an estimate of the
286+
* number of unique values recorded
287+
*/
288+
public static int getEstimatedNumberUniqueValuesAllowingForCollisions(
289+
int setSize, int numRecordedBits, int hashCount) {
278290
double setSizeAsDouble = setSize;
279291
double numRecordedBitsAsDouble = numRecordedBits;
292+
double hashCountAsDouble = hashCount;
280293
double saturation = numRecordedBitsAsDouble / setSizeAsDouble;
281294
double logInverseSaturation = Math.log(1 - saturation) * -1;
282-
return (int) (setSizeAsDouble * logInverseSaturation);
295+
return (int) (setSizeAsDouble * logInverseSaturation / hashCountAsDouble);
283296
}
284297

285298
public float getTargetMaxSaturation() {

0 commit comments

Comments
 (0)