summaryrefslogtreecommitdiffstats
path: root/servo/components/selectors/bloom.rs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 17:32:43 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 17:32:43 +0000
commit6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch)
treea68f146d7fa01f0134297619fbe7e33db084e0aa /servo/components/selectors/bloom.rs
parentInitial commit. (diff)
downloadthunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz
thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'servo/components/selectors/bloom.rs')
-rw-r--r--servo/components/selectors/bloom.rs422
1 files changed, 422 insertions, 0 deletions
diff --git a/servo/components/selectors/bloom.rs b/servo/components/selectors/bloom.rs
new file mode 100644
index 0000000000..98461d1ba2
--- /dev/null
+++ b/servo/components/selectors/bloom.rs
@@ -0,0 +1,422 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
+
+//! Counting and non-counting Bloom filters tuned for use as ancestor filters
+//! for selector matching.
+
+use std::fmt::{self, Debug};
+
+// The top 8 bits of the 32-bit hash value are not used by the bloom filter.
+// Consumers may rely on this to pack hashes more efficiently.
+pub const BLOOM_HASH_MASK: u32 = 0x00ffffff;
+const KEY_SIZE: usize = 12;
+
+const ARRAY_SIZE: usize = 1 << KEY_SIZE;
+const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;
+
+/// A counting Bloom filter with 8-bit counters.
+pub type BloomFilter = CountingBloomFilter<BloomStorageU8>;
+
+/// A counting Bloom filter with parameterized storage to handle
+/// counters of different sizes. For now we assume that having two hash
+/// functions is enough, but we may revisit that decision later.
+///
+/// The filter uses an array with 2**KeySize entries.
+///
+/// Assuming a well-distributed hash function, a Bloom filter with
+/// array size M containing N elements and
+/// using k hash function has expected false positive rate exactly
+///
+/// $ (1 - (1 - 1/M)^{kN})^k $
+///
+/// because each array slot has a
+///
+/// $ (1 - 1/M)^{kN} $
+///
+/// chance of being 0, and the expected false positive rate is the
+/// probability that all of the k hash functions will hit a nonzero
+/// slot.
+///
+/// For reasonable assumptions (M large, kN large, which should both
+/// hold if we're worried about false positives) about M and kN this
+/// becomes approximately
+///
+/// $$ (1 - \exp(-kN/M))^k $$
+///
+/// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
+/// or in other words
+///
+/// $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
+///
+/// where r is the false positive rate. This can be used to compute
+/// the desired KeySize for a given load N and false positive rate r.
+///
+/// If N/M is assumed small, then the false positive rate can
+/// further be approximated as 4*N^2/M^2. So increasing KeySize by
+/// 1, which doubles M, reduces the false positive rate by about a
+/// factor of 4, and a false positive rate of 1% corresponds to
+/// about M/N == 20.
+///
+/// What this means in practice is that for a few hundred keys using a
+/// KeySize of 12 gives false positive rates on the order of 0.25-4%.
+///
+/// Similarly, using a KeySize of 10 would lead to a 4% false
+/// positive rate for N == 100 and to quite bad false positive
+/// rates for larger N.
+#[derive(Clone, Default)]
+pub struct CountingBloomFilter<S>
+where
+ S: BloomStorage,
+{
+ storage: S,
+}
+
+impl<S> CountingBloomFilter<S>
+where
+ S: BloomStorage,
+{
+ /// Creates a new bloom filter.
+ #[inline]
+ pub fn new() -> Self {
+ Default::default()
+ }
+
+ #[inline]
+ pub fn clear(&mut self) {
+ self.storage = Default::default();
+ }
+
+ // Slow linear accessor to make sure the bloom filter is zeroed. This should
+ // never be used in release builds.
+ #[cfg(debug_assertions)]
+ pub fn is_zeroed(&self) -> bool {
+ self.storage.is_zeroed()
+ }
+
+ #[cfg(not(debug_assertions))]
+ pub fn is_zeroed(&self) -> bool {
+ unreachable!()
+ }
+
+ /// Inserts an item with a particular hash into the bloom filter.
+ #[inline]
+ pub fn insert_hash(&mut self, hash: u32) {
+ self.storage.adjust_first_slot(hash, true);
+ self.storage.adjust_second_slot(hash, true);
+ }
+
+ /// Removes an item with a particular hash from the bloom filter.
+ #[inline]
+ pub fn remove_hash(&mut self, hash: u32) {
+ self.storage.adjust_first_slot(hash, false);
+ self.storage.adjust_second_slot(hash, false);
+ }
+
+ /// Check whether the filter might contain an item with the given hash.
+ /// This can sometimes return true even if the item is not in the filter,
+ /// but will never return false for items that are actually in the filter.
+ #[inline]
+ pub fn might_contain_hash(&self, hash: u32) -> bool {
+ !self.storage.first_slot_is_empty(hash) && !self.storage.second_slot_is_empty(hash)
+ }
+}
+
+impl<S> Debug for CountingBloomFilter<S>
+where
+ S: BloomStorage,
+{
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut slots_used = 0;
+ for i in 0..ARRAY_SIZE {
+ if !self.storage.slot_is_empty(i) {
+ slots_used += 1;
+ }
+ }
+ write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE)
+ }
+}
+
+pub trait BloomStorage: Clone + Default {
+ fn slot_is_empty(&self, index: usize) -> bool;
+ fn adjust_slot(&mut self, index: usize, increment: bool);
+ fn is_zeroed(&self) -> bool;
+
+ #[inline]
+ fn first_slot_is_empty(&self, hash: u32) -> bool {
+ self.slot_is_empty(Self::first_slot_index(hash))
+ }
+
+ #[inline]
+ fn second_slot_is_empty(&self, hash: u32) -> bool {
+ self.slot_is_empty(Self::second_slot_index(hash))
+ }
+
+ #[inline]
+ fn adjust_first_slot(&mut self, hash: u32, increment: bool) {
+ self.adjust_slot(Self::first_slot_index(hash), increment)
+ }
+
+ #[inline]
+ fn adjust_second_slot(&mut self, hash: u32, increment: bool) {
+ self.adjust_slot(Self::second_slot_index(hash), increment)
+ }
+
+ #[inline]
+ fn first_slot_index(hash: u32) -> usize {
+ hash1(hash) as usize
+ }
+
+ #[inline]
+ fn second_slot_index(hash: u32) -> usize {
+ hash2(hash) as usize
+ }
+}
+
+/// Storage class for a CountingBloomFilter that has 8-bit counters.
+pub struct BloomStorageU8 {
+ counters: [u8; ARRAY_SIZE],
+}
+
+impl BloomStorage for BloomStorageU8 {
+ #[inline]
+ fn adjust_slot(&mut self, index: usize, increment: bool) {
+ let slot = &mut self.counters[index];
+ if *slot != 0xff {
+ // full
+ if increment {
+ *slot += 1;
+ } else {
+ *slot -= 1;
+ }
+ }
+ }
+
+ #[inline]
+ fn slot_is_empty(&self, index: usize) -> bool {
+ self.counters[index] == 0
+ }
+
+ #[inline]
+ fn is_zeroed(&self) -> bool {
+ self.counters.iter().all(|x| *x == 0)
+ }
+}
+
+impl Default for BloomStorageU8 {
+ fn default() -> Self {
+ BloomStorageU8 {
+ counters: [0; ARRAY_SIZE],
+ }
+ }
+}
+
+impl Clone for BloomStorageU8 {
+ fn clone(&self) -> Self {
+ BloomStorageU8 {
+ counters: self.counters,
+ }
+ }
+}
+
+/// Storage class for a CountingBloomFilter that has 1-bit counters.
+pub struct BloomStorageBool {
+ counters: [u8; ARRAY_SIZE / 8],
+}
+
+impl BloomStorage for BloomStorageBool {
+ #[inline]
+ fn adjust_slot(&mut self, index: usize, increment: bool) {
+ let bit = 1 << (index % 8);
+ let byte = &mut self.counters[index / 8];
+
+ // Since we have only one bit for storage, decrementing it
+ // should never do anything. Assert against an accidental
+ // decrementing of a bit that was never set.
+ assert!(
+ increment || (*byte & bit) != 0,
+ "should not decrement if slot is already false"
+ );
+
+ if increment {
+ *byte |= bit;
+ }
+ }
+
+ #[inline]
+ fn slot_is_empty(&self, index: usize) -> bool {
+ let bit = 1 << (index % 8);
+ (self.counters[index / 8] & bit) == 0
+ }
+
+ #[inline]
+ fn is_zeroed(&self) -> bool {
+ self.counters.iter().all(|x| *x == 0)
+ }
+}
+
+impl Default for BloomStorageBool {
+ fn default() -> Self {
+ BloomStorageBool {
+ counters: [0; ARRAY_SIZE / 8],
+ }
+ }
+}
+
+impl Clone for BloomStorageBool {
+ fn clone(&self) -> Self {
+ BloomStorageBool {
+ counters: self.counters,
+ }
+ }
+}
+
+#[inline]
+fn hash1(hash: u32) -> u32 {
+ hash & KEY_MASK
+}
+
+#[inline]
+fn hash2(hash: u32) -> u32 {
+ (hash >> KEY_SIZE) & KEY_MASK
+}
+
+#[test]
+fn create_and_insert_some_stuff() {
+ use fxhash::FxHasher;
+ use std::hash::{Hash, Hasher};
+ use std::mem::transmute;
+
+ fn hash_as_str(i: usize) -> u32 {
+ let mut hasher = FxHasher::default();
+ let s = i.to_string();
+ s.hash(&mut hasher);
+ let hash: u64 = hasher.finish();
+ (hash >> 32) as u32 ^ (hash as u32)
+ }
+
+ let mut bf = BloomFilter::new();
+
+ // Statically assert that ARRAY_SIZE is a multiple of 8, which
+ // BloomStorageBool relies on.
+ unsafe {
+ transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]);
+ }
+
+ for i in 0_usize..1000 {
+ bf.insert_hash(hash_as_str(i));
+ }
+
+ for i in 0_usize..1000 {
+ assert!(bf.might_contain_hash(hash_as_str(i)));
+ }
+
+ let false_positives = (1001_usize..2000)
+ .filter(|i| bf.might_contain_hash(hash_as_str(*i)))
+ .count();
+
+ assert!(false_positives < 190, "{} is not < 190", false_positives); // 19%.
+
+ for i in 0_usize..100 {
+ bf.remove_hash(hash_as_str(i));
+ }
+
+ for i in 100_usize..1000 {
+ assert!(bf.might_contain_hash(hash_as_str(i)));
+ }
+
+ let false_positives = (0_usize..100)
+ .filter(|i| bf.might_contain_hash(hash_as_str(*i)))
+ .count();
+
+ assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%.
+
+ bf.clear();
+
+ for i in 0_usize..2000 {
+ assert!(!bf.might_contain_hash(hash_as_str(i)));
+ }
+}
+
+#[cfg(feature = "bench")]
+#[cfg(test)]
+mod bench {
+ extern crate test;
+ use super::BloomFilter;
+
+ #[derive(Default)]
+ struct HashGenerator(u32);
+
+ impl HashGenerator {
+ fn next(&mut self) -> u32 {
+ // Each hash is split into two twelve-bit segments, which are used
+ // as an index into an array. We increment each by 64 so that we
+ // hit the next cache line, and then another 1 so that our wrapping
+ // behavior leads us to different entries each time.
+ //
+ // Trying to simulate cold caches is rather difficult with the cargo
+ // benchmarking setup, so it may all be moot depending on the number
+ // of iterations that end up being run. But we might as well.
+ self.0 += (65) + (65 << super::KEY_SIZE);
+ self.0
+ }
+ }
+
+ #[bench]
+ fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {
+ b.iter(|| {
+ let mut gen1 = HashGenerator::default();
+ let mut gen2 = HashGenerator::default();
+ let mut bf = BloomFilter::new();
+ for _ in 0_usize..1000 {
+ bf.insert_hash(gen1.next());
+ }
+ for _ in 0_usize..100 {
+ bf.remove_hash(gen2.next());
+ }
+ for _ in 100_usize..200 {
+ test::black_box(bf.might_contain_hash(gen2.next()));
+ }
+ });
+ }
+
+ #[bench]
+ fn might_contain_10(b: &mut test::Bencher) {
+ let bf = BloomFilter::new();
+ let mut gen = HashGenerator::default();
+ b.iter(|| {
+ for _ in 0..10 {
+ test::black_box(bf.might_contain_hash(gen.next()));
+ }
+ });
+ }
+
+ #[bench]
+ fn clear(b: &mut test::Bencher) {
+ let mut bf = Box::new(BloomFilter::new());
+ b.iter(|| test::black_box(&mut bf).clear());
+ }
+
+ #[bench]
+ fn insert_10(b: &mut test::Bencher) {
+ let mut bf = BloomFilter::new();
+ let mut gen = HashGenerator::default();
+ b.iter(|| {
+ for _ in 0..10 {
+ test::black_box(bf.insert_hash(gen.next()));
+ }
+ });
+ }
+
+ #[bench]
+ fn remove_10(b: &mut test::Bencher) {
+ let mut bf = BloomFilter::new();
+ let mut gen = HashGenerator::default();
+ // Note: this will underflow, and that's ok.
+ b.iter(|| {
+ for _ in 0..10 {
+ bf.remove_hash(gen.next())
+ }
+ });
+ }
+}