From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 02:47:55 +0200 Subject: Adding upstream version 124.0.1. Signed-off-by: Daniel Baumann --- .../rust/phf_generator/src/bin/gen_hash_test.rs | 20 ++++ third_party/rust/phf_generator/src/lib.rs | 109 +++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 third_party/rust/phf_generator/src/bin/gen_hash_test.rs create mode 100644 third_party/rust/phf_generator/src/lib.rs (limited to 'third_party/rust/phf_generator/src') diff --git a/third_party/rust/phf_generator/src/bin/gen_hash_test.rs b/third_party/rust/phf_generator/src/bin/gen_hash_test.rs new file mode 100644 index 0000000000..2e1fbec458 --- /dev/null +++ b/third_party/rust/phf_generator/src/bin/gen_hash_test.rs @@ -0,0 +1,20 @@ +use criterion::*; + +use rand::distributions::Alphanumeric; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; + +use phf_generator::generate_hash; + +fn gen_vec(len: usize) -> Vec { + let mut rng = SmallRng::seed_from_u64(0xAAAAAAAAAAAAAAAA).sample_iter(Alphanumeric); + + (0..len) + .map(move |_| rng.by_ref().take(64).collect::()) + .collect() +} + +fn main() { + let data = black_box(gen_vec(1_000_000)); + black_box(generate_hash(&data)); +} diff --git a/third_party/rust/phf_generator/src/lib.rs b/third_party/rust/phf_generator/src/lib.rs new file mode 100644 index 0000000000..4f918896aa --- /dev/null +++ b/third_party/rust/phf_generator/src/lib.rs @@ -0,0 +1,109 @@ +//! See [the `phf` crate's documentation][phf] for details. +//! +//! [phf]: https://docs.rs/phf + +#![doc(html_root_url = "https://docs.rs/phf_generator/0.11")] +use phf_shared::{HashKey, PhfHash}; +use rand::distributions::Standard; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; + +const DEFAULT_LAMBDA: usize = 5; + +const FIXED_SEED: u64 = 1234567890; + +pub struct HashState { + pub key: HashKey, + pub disps: Vec<(u32, u32)>, + pub map: Vec, +} + +pub fn generate_hash(entries: &[H]) -> HashState { + SmallRng::seed_from_u64(FIXED_SEED) + .sample_iter(Standard) + .find_map(|key| try_generate_hash(entries, key)) + .expect("failed to solve PHF") +} + +fn try_generate_hash(entries: &[H], key: HashKey) -> Option { + struct Bucket { + idx: usize, + keys: Vec, + } + + let hashes: Vec<_> = entries + .iter() + .map(|entry| phf_shared::hash(entry, &key)) + .collect(); + + let buckets_len = (hashes.len() + DEFAULT_LAMBDA - 1) / DEFAULT_LAMBDA; + let mut buckets = (0..buckets_len) + .map(|i| Bucket { + idx: i, + keys: vec![], + }) + .collect::>(); + + for (i, hash) in hashes.iter().enumerate() { + buckets[(hash.g % (buckets_len as u32)) as usize] + .keys + .push(i); + } + + // Sort descending + buckets.sort_by(|a, b| a.keys.len().cmp(&b.keys.len()).reverse()); + + let table_len = hashes.len(); + let mut map = vec![None; table_len]; + let mut disps = vec![(0u32, 0u32); buckets_len]; + + // store whether an element from the bucket being placed is + // located at a certain position, to allow for efficient overlap + // checks. It works by storing the generation in each cell and + // each new placement-attempt is a new generation, so you can tell + // if this is legitimately full by checking that the generations + // are equal. (A u64 is far too large to overflow in a reasonable + // time for current hardware.) + let mut try_map = vec![0u64; table_len]; + let mut generation = 0u64; + + // the actual values corresponding to the markers above, as + // (index, key) pairs, for adding to the main map once we've + // chosen the right disps. + let mut values_to_add = vec![]; + + 'buckets: for bucket in &buckets { + for d1 in 0..(table_len as u32) { + 'disps: for d2 in 0..(table_len as u32) { + values_to_add.clear(); + generation += 1; + + for &key in &bucket.keys { + let idx = (phf_shared::displace(hashes[key].f1, hashes[key].f2, d1, d2) + % (table_len as u32)) as usize; + if map[idx].is_some() || try_map[idx] == generation { + continue 'disps; + } + try_map[idx] = generation; + values_to_add.push((idx, key)); + } + + // We've picked a good set of disps + disps[bucket.idx] = (d1, d2); + for &(idx, key) in &values_to_add { + map[idx] = Some(key); + } + continue 'buckets; + } + } + + // Unable to find displacements for a bucket + return None; + } + + Some(HashState { + key, + disps, + map: map.into_iter().map(|i| i.unwrap()).collect(), + }) +} -- cgit v1.2.3