@@ -78,7 +78,7 @@ public enum ContainsResult {
78
78
* Rounds down required maxNumberOfBits to the nearest number that is made up of all ones as a
79
79
* binary number. Use this method where controlling memory use is paramount.
80
80
*/
81
- public static int getNearestSetSize (int maxNumberOfBits ) {
81
+ private static int getNearestSetSize (int maxNumberOfBits ) {
82
82
int result = usableBitSetSizes [0 ];
83
83
for (int i = 0 ; i < usableBitSetSizes .length ; i ++) {
84
84
if (usableBitSetSizes [i ] <= maxNumberOfBits ) {
@@ -88,41 +88,11 @@ public static int getNearestSetSize(int maxNumberOfBits) {
88
88
return result ;
89
89
}
90
90
91
- /**
92
- * Use this method to choose a set size where accuracy (low content saturation) is more important
93
- * than deciding how much memory to throw at the problem.
94
- *
95
- * @param desiredSaturation A number between 0 and 1 expressing the % of bits set once all values
96
- * have been recorded
97
- * @return The size of the set nearest to the required size
98
- */
99
- public static int getNearestSetSize (int maxNumberOfValuesExpected , float desiredSaturation ) {
100
- // Iterate around the various scales of bitset from smallest to largest looking for the first
101
- // that
102
- // satisfies value volumes at the chosen saturation level
103
- for (int i = 0 ; i < usableBitSetSizes .length ; i ++) {
104
- int numSetBitsAtDesiredSaturation = (int ) (usableBitSetSizes [i ] * desiredSaturation );
105
- int estimatedNumUniqueValues =
106
- getEstimatedNumberUniqueValuesAllowingForCollisions (
107
- usableBitSetSizes [i ], numSetBitsAtDesiredSaturation );
108
- if (estimatedNumUniqueValues > maxNumberOfValuesExpected ) {
109
- return usableBitSetSizes [i ];
110
- }
111
- }
112
- return -1 ;
113
- }
114
-
115
91
/**
 * Creates a {@link FuzzySet} whose size is chosen from a memory budget.
 *
 * <p>The budget is handed to {@code getNearestSetSize}, which rounds it down to a usable size made
 * up of all ones in binary. The backing {@link FixedBitSet} is constructed with {@code setSize +
 * 1} and the set is created with a hash count of 1.
 *
 * <p>NOTE(review): {@code maxNumBytes} is forwarded unchanged as the {@code maxNumberOfBits}
 * argument of {@code getNearestSetSize}, so the value appears to be treated as a bit count rather
 * than a byte count - confirm the intended unit.
 *
 * @param maxNumBytes the memory budget used to pick the set size
 * @return a new set sized according to the budget
 */
public static FuzzySet createSetBasedOnMaxMemory (int maxNumBytes ) {
116
92
int setSize = getNearestSetSize (maxNumBytes );
117
93
return new FuzzySet (new FixedBitSet (setSize + 1 ), setSize , 1 );
118
94
}
119
95
120
- public static FuzzySet createSetBasedOnQuality (
121
- int maxNumUniqueValues , float desiredMaxSaturation , int version ) {
122
- int setSize = getNearestSetSize (maxNumUniqueValues , desiredMaxSaturation );
123
- return new FuzzySet (new FixedBitSet (setSize + 1 ), setSize , 1 );
124
- }
125
-
126
96
public static FuzzySet createOptimalSet (int maxNumUniqueValues , float targetMaxFpp ) {
127
97
int setSize =
128
98
(int )
@@ -154,8 +124,9 @@ public ContainsResult contains(BytesRef value) {
154
124
long msb = hash [0 ];
155
125
long lsb = hash [1 ];
156
126
for (int i = 0 ; i < hashCount ; i ++) {
127
+ // Bloom sizes are always one less than a power of two (all ones in binary), so they can be ANDed for a fast modulo
157
128
int bloomPos = ((int ) (lsb + i * msb )) & bloomSize ;
158
- if (! mayContainValue (bloomPos )) {
129
+ if (filter . get (bloomPos ) == false ) {
159
130
return ContainsResult .NO ;
160
131
}
161
132
}
@@ -201,12 +172,6 @@ public static FuzzySet deserialize(DataInput in) throws IOException {
201
172
return new FuzzySet (bits , bloomSize , hashCount );
202
173
}
203
174
204
- private boolean mayContainValue (int aHash ) {
205
- // Bloom sizes are always base 2 and so can be ANDed for a fast modulo
206
- int pos = aHash & bloomSize ;
207
- return filter .get (pos );
208
- }
209
-
210
175
/**
211
176
* Records a value in the set. The referenced bytes are hashed. From the 64-bit generated hash,
212
177
* two 32-bit hashes are derived from the msb and lsb which can be used to derive more hashes (see
@@ -268,28 +233,9 @@ public FuzzySet downsize(float targetMaxSaturation) {
268
233
}
269
234
270
235
public int getEstimatedUniqueValues () {
271
- return getEstimatedNumberUniqueValuesAllowingForCollisions (
272
- bloomSize , filter .cardinality (), hashCount );
273
- }
274
-
275
- /**
276
- * Given a set size and the number of set bits, produces an estimate of the number of unique
277
- * values recorded (assuming a single hash function is used)
278
- */
279
- public static int getEstimatedNumberUniqueValuesAllowingForCollisions (
280
- int setSize , int numRecordedBits ) {
281
- return getEstimatedNumberUniqueValuesAllowingForCollisions (setSize , numRecordedBits , 1 );
282
- }
283
-
284
- /**
285
- * Given a set size, the number of set bits and hash function count, produces an estimate of the
286
- * number of unique values recorded
287
- */
288
- public static int getEstimatedNumberUniqueValuesAllowingForCollisions (
289
- int setSize , int numRecordedBits , int hashCount ) {
290
- double setSizeAsDouble = setSize ;
291
- double numRecordedBitsAsDouble = numRecordedBits ;
292
- double hashCountAsDouble = hashCount ;
236
+ double setSizeAsDouble = bloomSize ;
237
+ double numRecordedBitsAsDouble = filter .cardinality ();
238
+ // Number of hash positions derived per value; the estimate is divided by this count.
+ double hashCountAsDouble = hashCount ;
293
239
double saturation = numRecordedBitsAsDouble / setSizeAsDouble ;
294
240
double logInverseSaturation = Math .log (1 - saturation ) * -1 ;
295
241
return (int ) (setSizeAsDouble * logInverseSaturation / hashCountAsDouble );
0 commit comments