Skip to content

Commit 086ee67

Browse files
committed
Switch to a more efficient rolling Bloom filter
For each 'bit' in the filter we really maintain 2 bits, which store either:
- 0: not set
- 1-3: set in generation N

After (nElements / 2) insertions, we switch to a new generation and wipe the entries that already carried the new generation number, effectively switching from the last 1.5 * nElements set to the last 1.0 * nElements set.

This is 25% more space efficient than the previous implementation, and can (at peak) store 1.5 times the requested amount of history (though only 1.0 times the requested history is guaranteed).

The existing unit tests should be sufficient.
1 parent 92aa731 commit 086ee67

File tree

3 files changed

+75
-30
lines changed

3 files changed

+75
-30
lines changed

src/bloom.cpp

Lines changed: 53 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -216,30 +216,54 @@ void CBloomFilter::UpdateEmptyFull()
216216
isEmpty = empty;
217217
}
218218

219-
CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate) :
220-
b1(nElements * 2, fpRate, 0), b2(nElements * 2, fpRate, 0)
219+
CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate)
221220
{
222-
// Implemented using two bloom filters of 2 * nElements each.
223-
// We fill them up, and clear them, staggered, every nElements
224-
// inserted, so at least one always contains the last nElements
225-
// inserted.
226-
nInsertions = 0;
227-
nBloomSize = nElements * 2;
228-
221+
double logFpRate = log(fpRate);
222+
/* The optimal number of hash functions is log(fpRate) / log(0.5), but
223+
* restrict it to the range 1-50. */
224+
nHashFuncs = std::max(1, std::min((int)round(logFpRate / log(0.5)), 50));
225+
/* In this rolling bloom filter, we'll store between 2 and 3 generations of nElements / 2 entries. */
226+
nEntriesPerGeneration = (nElements + 1) / 2;
227+
uint32_t nMaxElements = nEntriesPerGeneration * 3;
228+
/* The maximum fpRate = pow(1.0 - exp(-nHashFuncs * nMaxElements / nFilterBits), nHashFuncs)
229+
* => pow(fpRate, 1.0 / nHashFuncs) = 1.0 - exp(-nHashFuncs * nMaxElements / nFilterBits)
230+
* => 1.0 - pow(fpRate, 1.0 / nHashFuncs) = exp(-nHashFuncs * nMaxElements / nFilterBits)
231+
* => log(1.0 - pow(fpRate, 1.0 / nHashFuncs)) = -nHashFuncs * nMaxElements / nFilterBits
232+
* => nFilterBits = -nHashFuncs * nMaxElements / log(1.0 - pow(fpRate, 1.0 / nHashFuncs))
233+
* => nFilterBits = -nHashFuncs * nMaxElements / log(1.0 - exp(logFpRate / nHashFuncs))
234+
*/
235+
uint32_t nFilterBits = (uint32_t)ceil(-1.0 * nHashFuncs * nMaxElements / log(1.0 - exp(logFpRate / nHashFuncs)));
236+
data.clear();
237+
/* We store up to 16 'bits' per data element. */
238+
data.resize((nFilterBits + 15) / 16);
229239
reset();
230240
}
231241

242+
/* Similar to CBloomFilter::Hash */
243+
inline unsigned int CRollingBloomFilter::Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const {
244+
return MurmurHash3(nHashNum * 0xFBA4C795 + nTweak, vDataToHash) % (data.size() * 16);
245+
}
246+
232247
void CRollingBloomFilter::insert(const std::vector<unsigned char>& vKey)
233248
{
234-
if (nInsertions == 0) {
235-
b1.clear();
236-
} else if (nInsertions == nBloomSize / 2) {
237-
b2.clear();
249+
if (nEntriesThisGeneration == nEntriesPerGeneration) {
250+
nEntriesThisGeneration = 0;
251+
nGeneration++;
252+
if (nGeneration == 4) {
253+
nGeneration = 1;
254+
}
255+
/* Wipe old entries that used this generation number. */
256+
for (uint32_t p = 0; p < data.size() * 16; p++) {
257+
if (get(p) == nGeneration) {
258+
put(p, 0);
259+
}
260+
}
238261
}
239-
b1.insert(vKey);
240-
b2.insert(vKey);
241-
if (++nInsertions == nBloomSize) {
242-
nInsertions = 0;
262+
nEntriesThisGeneration++;
263+
264+
for (int n = 0; n < nHashFuncs; n++) {
265+
uint32_t h = Hash(n, vKey);
266+
put(h, nGeneration);
243267
}
244268
}
245269

@@ -251,10 +275,13 @@ void CRollingBloomFilter::insert(const uint256& hash)
251275

252276
bool CRollingBloomFilter::contains(const std::vector<unsigned char>& vKey) const
253277
{
254-
if (nInsertions < nBloomSize / 2) {
255-
return b2.contains(vKey);
278+
for (int n = 0; n < nHashFuncs; n++) {
279+
uint32_t h = Hash(n, vKey);
280+
if (get(h) == 0) {
281+
return false;
282+
}
256283
}
257-
return b1.contains(vKey);
284+
return true;
258285
}
259286

260287
bool CRollingBloomFilter::contains(const uint256& hash) const
@@ -265,8 +292,10 @@ bool CRollingBloomFilter::contains(const uint256& hash) const
265292

266293
void CRollingBloomFilter::reset()
267294
{
268-
unsigned int nNewTweak = GetRand(std::numeric_limits<unsigned int>::max());
269-
b1.reset(nNewTweak);
270-
b2.reset(nNewTweak);
271-
nInsertions = 0;
295+
nTweak = GetRand(std::numeric_limits<unsigned int>::max());
296+
nEntriesThisGeneration = 0;
297+
nGeneration = 1;
298+
for (std::vector<uint32_t>::iterator it = data.begin(); it != data.end(); it++) {
299+
*it = 0;
300+
}
272301
}

src/bloom.h

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,11 @@ class CBloomFilter
110110
* reset() is provided, which also changes nTweak to decrease the impact of
111111
* false-positives.
112112
*
113-
* contains(item) will always return true if item was one of the last N things
113+
* contains(item) will always return true if item was one of the last N to 1.5*N
114114
* insert()'ed ... but may also return true for items that were not inserted.
115+
*
116+
* It needs around 1.8 bytes per element per factor 0.1 of false positive rate.
117+
* (More accurately: 3/(log(256)*log(2)) * log(1/fpRate) * nElements bytes)
115118
*/
116119
class CRollingBloomFilter
117120
{
@@ -129,10 +132,23 @@ class CRollingBloomFilter
129132
void reset();
130133

131134
private:
132-
unsigned int nBloomSize;
133-
unsigned int nInsertions;
134-
CBloomFilter b1, b2;
135-
};
135+
int nEntriesPerGeneration;
136+
int nEntriesThisGeneration;
137+
int nGeneration;
138+
std::vector<uint32_t> data;
139+
unsigned int nTweak;
140+
int nHashFuncs;
141+
142+
unsigned int Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const;
136143

144+
inline int get(uint32_t position) const {
145+
return (data[(position >> 4) % data.size()] >> (2 * (position & 0xF))) & 0x3;
146+
}
147+
148+
inline void put(uint32_t position, uint32_t val) {
149+
uint32_t& cell = data[(position >> 4) % data.size()];
150+
cell = (cell & ~(((uint32_t)3) << (2 * (position & 0xF)))) | (val << (2 * (position & 0xF)));
151+
}
152+
};
137153

138154
#endif // BITCOIN_BLOOM_H

src/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ namespace {
180180
* million to make it highly unlikely for users to have issues with this
181181
* filter.
182182
*
183-
* Memory used: 1.7MB
183+
* Memory used: 1.3 MB
184184
*/
185185
boost::scoped_ptr<CRollingBloomFilter> recentRejects;
186186
uint256 hashRecentRejectsChainTip;

0 commit comments

Comments
 (0)