Skip to content

Commit e3ded15

Browse files
author
Greg Miller
committed
Shrink the public surface area of FuzzySet
Removes public methods with no current library usage that we probably don't want to publicly maintain
1 parent 36b3577 commit e3ded15

File tree

1 file changed

+6
-60
lines changed
  • lucene/codecs/src/java/org/apache/lucene/codecs/bloom

1 file changed

+6
-60
lines changed

lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java

+6-60
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ public enum ContainsResult {
7878
* Rounds down required maxNumberOfBits to the nearest number that is made up of all ones as a
7979
* binary number. Use this method where controlling memory use is paramount.
8080
*/
81-
public static int getNearestSetSize(int maxNumberOfBits) {
81+
private static int getNearestSetSize(int maxNumberOfBits) {
8282
int result = usableBitSetSizes[0];
8383
for (int i = 0; i < usableBitSetSizes.length; i++) {
8484
if (usableBitSetSizes[i] <= maxNumberOfBits) {
@@ -88,41 +88,11 @@ public static int getNearestSetSize(int maxNumberOfBits) {
8888
return result;
8989
}
9090

91-
/**
92-
* Use this method to choose a set size where accuracy (low content saturation) is more important
93-
* than deciding how much memory to throw at the problem.
94-
*
95-
* @param desiredSaturation A number between 0 and 1 expressing the % of bits set once all values
96-
* have been recorded
97-
* @return The size of the set nearest to the required size
98-
*/
99-
public static int getNearestSetSize(int maxNumberOfValuesExpected, float desiredSaturation) {
100-
// Iterate around the various scales of bitset from smallest to largest looking for the first
101-
// that
102-
// satisfies value volumes at the chosen saturation level
103-
for (int i = 0; i < usableBitSetSizes.length; i++) {
104-
int numSetBitsAtDesiredSaturation = (int) (usableBitSetSizes[i] * desiredSaturation);
105-
int estimatedNumUniqueValues =
106-
getEstimatedNumberUniqueValuesAllowingForCollisions(
107-
usableBitSetSizes[i], numSetBitsAtDesiredSaturation);
108-
if (estimatedNumUniqueValues > maxNumberOfValuesExpected) {
109-
return usableBitSetSizes[i];
110-
}
111-
}
112-
return -1;
113-
}
114-
11591
public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes) {
11692
int setSize = getNearestSetSize(maxNumBytes);
11793
return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1);
11894
}
11995

120-
public static FuzzySet createSetBasedOnQuality(
121-
int maxNumUniqueValues, float desiredMaxSaturation, int version) {
122-
int setSize = getNearestSetSize(maxNumUniqueValues, desiredMaxSaturation);
123-
return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1);
124-
}
125-
12696
public static FuzzySet createOptimalSet(int maxNumUniqueValues, float targetMaxFpp) {
12797
int setSize =
12898
(int)
@@ -154,8 +124,9 @@ public ContainsResult contains(BytesRef value) {
154124
long msb = hash[0];
155125
long lsb = hash[1];
156126
for (int i = 0; i < hashCount; i++) {
127+
// Bloom sizes are always base 2 and so can be ANDed for a fast modulo
157128
int bloomPos = ((int) (lsb + i * msb)) & bloomSize;
158-
if (!mayContainValue(bloomPos)) {
129+
if (filter.get(bloomPos) == false) {
159130
return ContainsResult.NO;
160131
}
161132
}
@@ -201,12 +172,6 @@ public static FuzzySet deserialize(DataInput in) throws IOException {
201172
return new FuzzySet(bits, bloomSize, hashCount);
202173
}
203174

204-
private boolean mayContainValue(int aHash) {
205-
// Bloom sizes are always base 2 and so can be ANDed for a fast modulo
206-
int pos = aHash & bloomSize;
207-
return filter.get(pos);
208-
}
209-
210175
/**
211176
* Records a value in the set. The referenced bytes are hashed. From the 64-bit generated hash,
212177
* two 32-bit hashes are derived from the msb and lsb which can be used to derive more hashes (see
@@ -268,28 +233,9 @@ public FuzzySet downsize(float targetMaxSaturation) {
268233
}
269234

270235
public int getEstimatedUniqueValues() {
271-
return getEstimatedNumberUniqueValuesAllowingForCollisions(
272-
bloomSize, filter.cardinality(), hashCount);
273-
}
274-
275-
/**
276-
* Given a set size and the number of set bits, produces an estimate of the number of unique
277-
* values recorded (assuming a single hash function is used)
278-
*/
279-
public static int getEstimatedNumberUniqueValuesAllowingForCollisions(
280-
int setSize, int numRecordedBits) {
281-
return getEstimatedNumberUniqueValuesAllowingForCollisions(setSize, numRecordedBits, 1);
282-
}
283-
284-
/**
285-
* Given a set size, the number of set bits and hash function count, produces an estimate of the
286-
* number of unique values recorded
287-
*/
288-
public static int getEstimatedNumberUniqueValuesAllowingForCollisions(
289-
int setSize, int numRecordedBits, int hashCount) {
290-
double setSizeAsDouble = setSize;
291-
double numRecordedBitsAsDouble = numRecordedBits;
292-
double hashCountAsDouble = hashCount;
236+
double setSizeAsDouble = bloomSize;
237+
double numRecordedBitsAsDouble = filter.cardinality();
238+
double hashCountAsDouble = hashCount;
293239
double saturation = numRecordedBitsAsDouble / setSizeAsDouble;
294240
double logInverseSaturation = Math.log(1 - saturation) * -1;
295241
return (int) (setSizeAsDouble * logInverseSaturation / hashCountAsDouble);

0 commit comments

Comments
 (0)