mfbt/BloomFilter.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * A counting Bloom filter implementation.  This allows consumers to
 * do fast probabilistic "is item X in set Y?" testing which will
 * never answer "no" when the correct answer is "yes" (but might
 * incorrectly answer "yes" when the correct answer is "no").
 */

#ifndef mozilla_BloomFilter_h
#define mozilla_BloomFilter_h

#include "mozilla/Assertions.h"
#include "mozilla/Likely.h"

#include <stdint.h>
#include <string.h>

namespace mozilla {

/*
 * This class implements a classic Bloom filter as described at
 * <http://en.wikipedia.org/wiki/Bloom_filter>.  This allows quick
 * probabilistic answers to the question "is object X in set Y?" where the
 * contents of Y might not be time-invariant.  The probabilistic nature of the
 * test means that sometimes the answer will be "yes" when it should be "no".
 * If the answer is "no", then X is guaranteed not to be in Y.
 *
 * The filter is parametrized on KeySize, which is the size of the key
 * generated by each of hash functions used by the filter, in bits,
 * and the type of object T being added and removed.  T must implement
 * a |uint32_t hash() const| method which returns a uint32_t hash key
 * that will be used to generate the two separate hash functions for
 * the Bloom filter.  This hash key MUST be well-distributed for good
 * results!  KeySize is not allowed to be larger than 16.
 *
 * The filter uses exactly 2**KeySize bit (2**(KeySize-3) bytes) of memory.
 * From now on we will refer to the memory used by the filter as M.
 *
 * The expected rate of incorrect "yes" answers depends on M and on
 * the number N of objects in set Y.  As long as N is small compared
 * to M, the rate of such answers is expected to be approximately
 * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
 * elements then using a KeySize of 12 gives a reasonably low
 * incorrect answer rate.  A KeySize of 12 has the additional benefit
 * of using exactly one page for the filter in typical hardware
 * configurations.
 */
template <unsigned KeySize, class T>
class BitBloomFilter {
  /*
   * A counting Bloom filter with 8-bit counters.  For now we assume
   * that having two hash functions is enough, but we may revisit that
   * decision later.
   *
   * The filter uses an array with 2**KeySize entries.
   *
   * Assuming a well-distributed hash function, a Bloom filter with
   * array size M containing N elements and
   * using k hash function has expected false positive rate exactly
   *
   * $  (1 - (1 - 1/M)^{kN})^k  $
   *
   * because each array slot has a
   *
   * $  (1 - 1/M)^{kN}  $
   *
   * chance of being 0, and the expected false positive rate is the
   * probability that all of the k hash functions will hit a nonzero
   * slot.
   *
   * For reasonable assumptions (M large, kN large, which should both
   * hold if we're worried about false positives) about M and kN this
   * becomes approximately
   *
   * $$  (1 - \exp(-kN/M))^k   $$
   *
   * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
   * or in other words
   *
   * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
   *
   * where r is the false positive rate.  This can be used to compute
   * the desired KeySize for a given load N and false positive rate r.
   *
   * If N/M is assumed small, then the false positive rate can
   * further be approximated as 4*N^2/M^2.  So increasing KeySize by
   * 1, which doubles M, reduces the false positive rate by about a
   * factor of 4, and a false positive rate of 1% corresponds to
   * about M/N == 20.
   *
   * What this means in practice is that for a few hundred keys using a
   * KeySize of 12 gives false positive rates on the order of 0.25-4%.
   *
   * Similarly, using a KeySize of 10 would lead to a 4% false
   * positive rate for N == 100 and to quite bad false positive
   * rates for larger N.
   */
 public:
  BitBloomFilter() {
    static_assert(KeySize >= 3, "KeySize too small");
    static_assert(KeySize <= kKeyShift, "KeySize too big");

    // XXX: Should we have a custom operator new using calloc instead and
    // require that we're allocated via the operator?
    clear();
  }

  /*
   * Clear the filter.  This should be done before reusing it.
   */
  void clear();

  /*
   * Add an item to the filter.
   */
  void add(const T* aValue);

  /*
   * Check whether the filter might contain an item.  This can
   * sometimes return true even if the item is not in the filter,
   * but will never return false for items that are actually in the
   * filter.
   */
  bool mightContain(const T* aValue) const;

  /*
   * Methods for add/contain when we already have a hash computed
   */
  void add(uint32_t aHash);
  bool mightContain(uint32_t aHash) const;

 private:
  static const size_t kArraySize = (1 << (KeySize - 3));
  static const uint32_t kKeyMask = (1 << KeySize) - 1;
  static const uint32_t kKeyShift = 16;

  static uint32_t hash1(uint32_t aHash) { return aHash & kKeyMask; }
  static uint32_t hash2(uint32_t aHash) {
    return (aHash >> kKeyShift) & kKeyMask;
  }

  bool getSlot(uint32_t aHash) const {
    uint32_t index = aHash / 8;
    uint8_t shift = aHash % 8;
    uint8_t mask = 1 << shift;
    return !!(mBits[index] & mask);
  }

  void setSlot(uint32_t aHash) {
    uint32_t index = aHash / 8;
    uint8_t shift = aHash % 8;
    uint8_t bit = 1 << shift;
    mBits[index] |= bit;
  }

  bool getFirstSlot(uint32_t aHash) const { return getSlot(hash1(aHash)); }
  bool getSecondSlot(uint32_t aHash) const { return getSlot(hash2(aHash)); }

  void setFirstSlot(uint32_t aHash) { setSlot(hash1(aHash)); }
  void setSecondSlot(uint32_t aHash) { setSlot(hash2(aHash)); }

  uint8_t mBits[kArraySize];
};

template <unsigned KeySize, class T>
inline void BitBloomFilter<KeySize, T>::clear() {
  memset(mBits, 0, kArraySize);
}

template <unsigned KeySize, class T>
inline void BitBloomFilter<KeySize, T>::add(uint32_t aHash) {
  setFirstSlot(aHash);
  setSecondSlot(aHash);
}

template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE void BitBloomFilter<KeySize, T>::add(const T* aValue) {
  uint32_t hash = aValue->hash();
  return add(hash);
}

template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool BitBloomFilter<KeySize, T>::mightContain(
    uint32_t aHash) const {
  // Check that all the slots for this hash contain something
  return getFirstSlot(aHash) && getSecondSlot(aHash);
}

template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool BitBloomFilter<KeySize, T>::mightContain(
    const T* aValue) const {
  uint32_t hash = aValue->hash();
  return mightContain(hash);
}

/*
 * This class implements a counting Bloom filter as described at
 * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
 * 8-bit counters.
 *
 * Compared to `BitBloomFilter`, this class supports 'remove' operation.
 *
 * The filter uses exactly 2**KeySize bytes of memory.
 *
 * Other characteristics are the same as BitBloomFilter.
 */
template <unsigned KeySize, class T>
class CountingBloomFilter {
 public:
  CountingBloomFilter() {
    static_assert(KeySize <= kKeyShift, "KeySize too big");

    clear();
  }

  /*
   * Clear the filter.  This should be done before reusing it, because
   * just removing all items doesn't clear counters that hit the upper
   * bound.
   */
  void clear();

  /*
   * Add an item to the filter.
   */
  void add(const T* aValue);

  /*
   * Remove an item from the filter.
   */
  void remove(const T* aValue);

  /*
   * Check whether the filter might contain an item.  This can
   * sometimes return true even if the item is not in the filter,
   * but will never return false for items that are actually in the
   * filter.
   */
  bool mightContain(const T* aValue) const;

  /*
   * Methods for add/remove/contain when we already have a hash computed
   */
  void add(uint32_t aHash);
  void remove(uint32_t aHash);
  bool mightContain(uint32_t aHash) const;

 private:
  static const size_t kArraySize = (1 << KeySize);
  static const uint32_t kKeyMask = (1 << KeySize) - 1;
  static const uint32_t kKeyShift = 16;

  static uint32_t hash1(uint32_t aHash) { return aHash & kKeyMask; }
  static uint32_t hash2(uint32_t aHash) {
    return (aHash >> kKeyShift) & kKeyMask;
  }

  uint8_t& firstSlot(uint32_t aHash) { return mCounters[hash1(aHash)]; }
  uint8_t& secondSlot(uint32_t aHash) { return mCounters[hash2(aHash)]; }

  const uint8_t& firstSlot(uint32_t aHash) const {
    return mCounters[hash1(aHash)];
  }
  const uint8_t& secondSlot(uint32_t aHash) const {
    return mCounters[hash2(aHash)];
  }

  static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }

  uint8_t mCounters[kArraySize];
};

template <unsigned KeySize, class T>
inline void CountingBloomFilter<KeySize, T>::clear() {
  memset(mCounters, 0, kArraySize);
}

template <unsigned KeySize, class T>
inline void CountingBloomFilter<KeySize, T>::add(uint32_t aHash) {
  uint8_t& slot1 = firstSlot(aHash);
  if (MOZ_LIKELY(!full(slot1))) {
    ++slot1;
  }
  uint8_t& slot2 = secondSlot(aHash);
  if (MOZ_LIKELY(!full(slot2))) {
    ++slot2;
  }
}

template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE void CountingBloomFilter<KeySize, T>::add(const T* aValue) {
  uint32_t hash = aValue->hash();
  return add(hash);
}

template <unsigned KeySize, class T>
inline void CountingBloomFilter<KeySize, T>::remove(uint32_t aHash) {
  // If the slots are full, we don't know whether we bumped them to be
  // there when we added or not, so just leave them full.
  uint8_t& slot1 = firstSlot(aHash);
  if (MOZ_LIKELY(!full(slot1))) {
    --slot1;
  }
  uint8_t& slot2 = secondSlot(aHash);
  if (MOZ_LIKELY(!full(slot2))) {
    --slot2;
  }
}

template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE void CountingBloomFilter<KeySize, T>::remove(
    const T* aValue) {
  uint32_t hash = aValue->hash();
  remove(hash);
}

template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool CountingBloomFilter<KeySize, T>::mightContain(
    uint32_t aHash) const {
  // Check that all the slots for this hash contain something
  return firstSlot(aHash) && secondSlot(aHash);
}

template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool CountingBloomFilter<KeySize, T>::mightContain(
    const T* aValue) const {
  uint32_t hash = aValue->hash();
  return mightContain(hash);
}

}  // namespace mozilla

#endif /* mozilla_BloomFilter_h */