/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ //! Counting and non-counting Bloom filters tuned for use as ancestor filters //! for selector matching. use std::fmt::{self, Debug}; // The top 8 bits of the 32-bit hash value are not used by the bloom filter. // Consumers may rely on this to pack hashes more efficiently. pub const BLOOM_HASH_MASK: u32 = 0x00ffffff; const KEY_SIZE: usize = 12; const ARRAY_SIZE: usize = 1 << KEY_SIZE; const KEY_MASK: u32 = (1 << KEY_SIZE) - 1; /// A counting Bloom filter with 8-bit counters. pub type BloomFilter = CountingBloomFilter; /// A counting Bloom filter with parameterized storage to handle /// counters of different sizes. For now we assume that having two hash /// functions is enough, but we may revisit that decision later. /// /// The filter uses an array with 2**KeySize entries. /// /// Assuming a well-distributed hash function, a Bloom filter with /// array size M containing N elements and /// using k hash function has expected false positive rate exactly /// /// $ (1 - (1 - 1/M)^{kN})^k $ /// /// because each array slot has a /// /// $ (1 - 1/M)^{kN} $ /// /// chance of being 0, and the expected false positive rate is the /// probability that all of the k hash functions will hit a nonzero /// slot. /// /// For reasonable assumptions (M large, kN large, which should both /// hold if we're worried about false positives) about M and kN this /// becomes approximately /// /// $$ (1 - \exp(-kN/M))^k $$ /// /// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$, /// or in other words /// /// $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$ /// /// where r is the false positive rate. This can be used to compute /// the desired KeySize for a given load N and false positive rate r. /// /// If N/M is assumed small, then the false positive rate can /// further be approximated as 4*N^2/M^2. So increasing KeySize by /// 1, which doubles M, reduces the false positive rate by about a /// factor of 4, and a false positive rate of 1% corresponds to /// about M/N == 20. /// /// What this means in practice is that for a few hundred keys using a /// KeySize of 12 gives false positive rates on the order of 0.25-4%. /// /// Similarly, using a KeySize of 10 would lead to a 4% false /// positive rate for N == 100 and to quite bad false positive /// rates for larger N. #[derive(Clone, Default)] pub struct CountingBloomFilter where S: BloomStorage, { storage: S, } impl CountingBloomFilter where S: BloomStorage, { /// Creates a new bloom filter. #[inline] pub fn new() -> Self { Default::default() } #[inline] pub fn clear(&mut self) { self.storage = Default::default(); } // Slow linear accessor to make sure the bloom filter is zeroed. This should // never be used in release builds. #[cfg(debug_assertions)] pub fn is_zeroed(&self) -> bool { self.storage.is_zeroed() } #[cfg(not(debug_assertions))] pub fn is_zeroed(&self) -> bool { unreachable!() } /// Inserts an item with a particular hash into the bloom filter. #[inline] pub fn insert_hash(&mut self, hash: u32) { self.storage.adjust_first_slot(hash, true); self.storage.adjust_second_slot(hash, true); } /// Removes an item with a particular hash from the bloom filter. #[inline] pub fn remove_hash(&mut self, hash: u32) { self.storage.adjust_first_slot(hash, false); self.storage.adjust_second_slot(hash, false); } /// Check whether the filter might contain an item with the given hash. /// This can sometimes return true even if the item is not in the filter, /// but will never return false for items that are actually in the filter. #[inline] pub fn might_contain_hash(&self, hash: u32) -> bool { !self.storage.first_slot_is_empty(hash) && !self.storage.second_slot_is_empty(hash) } } impl Debug for CountingBloomFilter where S: BloomStorage, { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut slots_used = 0; for i in 0..ARRAY_SIZE { if !self.storage.slot_is_empty(i) { slots_used += 1; } } write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE) } } pub trait BloomStorage: Clone + Default { fn slot_is_empty(&self, index: usize) -> bool; fn adjust_slot(&mut self, index: usize, increment: bool); fn is_zeroed(&self) -> bool; #[inline] fn first_slot_is_empty(&self, hash: u32) -> bool { self.slot_is_empty(Self::first_slot_index(hash)) } #[inline] fn second_slot_is_empty(&self, hash: u32) -> bool { self.slot_is_empty(Self::second_slot_index(hash)) } #[inline] fn adjust_first_slot(&mut self, hash: u32, increment: bool) { self.adjust_slot(Self::first_slot_index(hash), increment) } #[inline] fn adjust_second_slot(&mut self, hash: u32, increment: bool) { self.adjust_slot(Self::second_slot_index(hash), increment) } #[inline] fn first_slot_index(hash: u32) -> usize { hash1(hash) as usize } #[inline] fn second_slot_index(hash: u32) -> usize { hash2(hash) as usize } } /// Storage class for a CountingBloomFilter that has 8-bit counters. pub struct BloomStorageU8 { counters: [u8; ARRAY_SIZE], } impl BloomStorage for BloomStorageU8 { #[inline] fn adjust_slot(&mut self, index: usize, increment: bool) { let slot = &mut self.counters[index]; if *slot != 0xff { // full if increment { *slot += 1; } else { *slot -= 1; } } } #[inline] fn slot_is_empty(&self, index: usize) -> bool { self.counters[index] == 0 } #[inline] fn is_zeroed(&self) -> bool { self.counters.iter().all(|x| *x == 0) } } impl Default for BloomStorageU8 { fn default() -> Self { BloomStorageU8 { counters: [0; ARRAY_SIZE], } } } impl Clone for BloomStorageU8 { fn clone(&self) -> Self { BloomStorageU8 { counters: self.counters, } } } /// Storage class for a CountingBloomFilter that has 1-bit counters. pub struct BloomStorageBool { counters: [u8; ARRAY_SIZE / 8], } impl BloomStorage for BloomStorageBool { #[inline] fn adjust_slot(&mut self, index: usize, increment: bool) { let bit = 1 << (index % 8); let byte = &mut self.counters[index / 8]; // Since we have only one bit for storage, decrementing it // should never do anything. Assert against an accidental // decrementing of a bit that was never set. assert!( increment || (*byte & bit) != 0, "should not decrement if slot is already false" ); if increment { *byte |= bit; } } #[inline] fn slot_is_empty(&self, index: usize) -> bool { let bit = 1 << (index % 8); (self.counters[index / 8] & bit) == 0 } #[inline] fn is_zeroed(&self) -> bool { self.counters.iter().all(|x| *x == 0) } } impl Default for BloomStorageBool { fn default() -> Self { BloomStorageBool { counters: [0; ARRAY_SIZE / 8], } } } impl Clone for BloomStorageBool { fn clone(&self) -> Self { BloomStorageBool { counters: self.counters, } } } #[inline] fn hash1(hash: u32) -> u32 { hash & KEY_MASK } #[inline] fn hash2(hash: u32) -> u32 { (hash >> KEY_SIZE) & KEY_MASK } #[test] fn create_and_insert_some_stuff() { use fxhash::FxHasher; use std::hash::{Hash, Hasher}; use std::mem::transmute; fn hash_as_str(i: usize) -> u32 { let mut hasher = FxHasher::default(); let s = i.to_string(); s.hash(&mut hasher); let hash: u64 = hasher.finish(); (hash >> 32) as u32 ^ (hash as u32) } let mut bf = BloomFilter::new(); // Statically assert that ARRAY_SIZE is a multiple of 8, which // BloomStorageBool relies on. unsafe { transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]); } for i in 0_usize..1000 { bf.insert_hash(hash_as_str(i)); } for i in 0_usize..1000 { assert!(bf.might_contain_hash(hash_as_str(i))); } let false_positives = (1001_usize..2000) .filter(|i| bf.might_contain_hash(hash_as_str(*i))) .count(); assert!(false_positives < 190, "{} is not < 190", false_positives); // 19%. for i in 0_usize..100 { bf.remove_hash(hash_as_str(i)); } for i in 100_usize..1000 { assert!(bf.might_contain_hash(hash_as_str(i))); } let false_positives = (0_usize..100) .filter(|i| bf.might_contain_hash(hash_as_str(*i))) .count(); assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%. bf.clear(); for i in 0_usize..2000 { assert!(!bf.might_contain_hash(hash_as_str(i))); } } #[cfg(feature = "bench")] #[cfg(test)] mod bench { extern crate test; use super::BloomFilter; #[derive(Default)] struct HashGenerator(u32); impl HashGenerator { fn next(&mut self) -> u32 { // Each hash is split into two twelve-bit segments, which are used // as an index into an array. We increment each by 64 so that we // hit the next cache line, and then another 1 so that our wrapping // behavior leads us to different entries each time. // // Trying to simulate cold caches is rather difficult with the cargo // benchmarking setup, so it may all be moot depending on the number // of iterations that end up being run. But we might as well. self.0 += (65) + (65 << super::KEY_SIZE); self.0 } } #[bench] fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) { b.iter(|| { let mut gen1 = HashGenerator::default(); let mut gen2 = HashGenerator::default(); let mut bf = BloomFilter::new(); for _ in 0_usize..1000 { bf.insert_hash(gen1.next()); } for _ in 0_usize..100 { bf.remove_hash(gen2.next()); } for _ in 100_usize..200 { test::black_box(bf.might_contain_hash(gen2.next())); } }); } #[bench] fn might_contain_10(b: &mut test::Bencher) { let bf = BloomFilter::new(); let mut gen = HashGenerator::default(); b.iter(|| { for _ in 0..10 { test::black_box(bf.might_contain_hash(gen.next())); } }); } #[bench] fn clear(b: &mut test::Bencher) { let mut bf = Box::new(BloomFilter::new()); b.iter(|| test::black_box(&mut bf).clear()); } #[bench] fn insert_10(b: &mut test::Bencher) { let mut bf = BloomFilter::new(); let mut gen = HashGenerator::default(); b.iter(|| { for _ in 0..10 { test::black_box(bf.insert_hash(gen.next())); } }); } #[bench] fn remove_10(b: &mut test::Bencher) { let mut bf = BloomFilter::new(); let mut gen = HashGenerator::default(); // Note: this will underflow, and that's ok. b.iter(|| { for _ in 0..10 { bf.remove_hash(gen.next()) } }); } }