From 64d98f8ee037282c35007b64c2649055c56af1db Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:19:03 +0200 Subject: Merging upstream version 1.68.2+dfsg1. Signed-off-by: Daniel Baumann --- vendor/twox-hash/src/thirty_two.rs | 416 +++++++++++++++++++++++++++++++++++++ 1 file changed, 416 insertions(+) create mode 100644 vendor/twox-hash/src/thirty_two.rs (limited to 'vendor/twox-hash/src/thirty_two.rs') diff --git a/vendor/twox-hash/src/thirty_two.rs b/vendor/twox-hash/src/thirty_two.rs new file mode 100644 index 000000000..cfa44cdbc --- /dev/null +++ b/vendor/twox-hash/src/thirty_two.rs @@ -0,0 +1,416 @@ +use crate::UnalignedBuffer; +use core::{cmp, hash::Hasher}; + +#[cfg(feature = "serialize")] +use serde::{Deserialize, Serialize}; + +const CHUNK_SIZE: usize = 16; + +pub const PRIME_1: u32 = 2_654_435_761; +pub const PRIME_2: u32 = 2_246_822_519; +pub const PRIME_3: u32 = 3_266_489_917; +pub const PRIME_4: u32 = 668_265_263; +pub const PRIME_5: u32 = 374_761_393; + +#[cfg_attr(feature = "serialize", derive(Deserialize, Serialize))] +#[derive(Copy, Clone, PartialEq)] +struct XxCore { + v1: u32, + v2: u32, + v3: u32, + v4: u32, +} + +/// Calculates the 32-bit hash. Care should be taken when using this +/// hash. +/// +/// Although this struct implements `Hasher`, it only calculates a +/// 32-bit number, leaving the upper bits as 0. This means it is +/// unlikely to be correct to use this in places like a `HashMap`. +#[cfg_attr(feature = "serialize", derive(Deserialize, Serialize))] +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct XxHash32 { + total_len: u64, + seed: u32, + core: XxCore, + #[cfg_attr(feature = "serialize", serde(flatten))] + buffer: Buffer, +} + +impl XxCore { + fn with_seed(seed: u32) -> XxCore { + XxCore { + v1: seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2), + v2: seed.wrapping_add(PRIME_2), + v3: seed, + v4: seed.wrapping_sub(PRIME_1), + } + } + + #[inline(always)] + fn ingest_chunks(&mut self, values: I) + where + I: IntoIterator, + { + #[inline(always)] + fn ingest_one_number(mut current_value: u32, mut value: u32) -> u32 { + value = value.wrapping_mul(PRIME_2); + current_value = current_value.wrapping_add(value); + current_value = current_value.rotate_left(13); + current_value.wrapping_mul(PRIME_1) + } + + // By drawing these out, we can avoid going back and forth to + // memory. It only really helps for large files, when we need + // to iterate multiple times here. + + let mut v1 = self.v1; + let mut v2 = self.v2; + let mut v3 = self.v3; + let mut v4 = self.v4; + + for [n1, n2, n3, n4] in values { + v1 = ingest_one_number(v1, n1.to_le()); + v2 = ingest_one_number(v2, n2.to_le()); + v3 = ingest_one_number(v3, n3.to_le()); + v4 = ingest_one_number(v4, n4.to_le()); + } + + self.v1 = v1; + self.v2 = v2; + self.v3 = v3; + self.v4 = v4; + } + + #[inline(always)] + fn finish(&self) -> u32 { + // The original code pulls out local vars for v[1234] + // here. Performance tests did not show that to be effective + // here, presumably because this method is not called in a + // tight loop. + + #[allow(unknown_lints, clippy::needless_late_init)] // keeping things parallel + let mut hash; + + hash = self.v1.rotate_left(1); + hash = hash.wrapping_add(self.v2.rotate_left(7)); + hash = hash.wrapping_add(self.v3.rotate_left(12)); + hash = hash.wrapping_add(self.v4.rotate_left(18)); + + hash + } +} + +impl core::fmt::Debug for XxCore { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { + write!( + f, + "XxCore {{ {:016x} {:016x} {:016x} {:016x} }}", + self.v1, self.v2, self.v3, self.v4 + ) + } +} + +#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))] +#[derive(Debug, Copy, Clone, Default, PartialEq)] +#[repr(align(4))] +#[cfg_attr(feature = "serialize", serde(transparent))] +struct AlignToU32(T); + +#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))] +#[derive(Debug, Copy, Clone, Default, PartialEq)] +struct Buffer { + #[cfg_attr(feature = "serialize", serde(rename = "buffer"))] + data: AlignToU32<[u8; CHUNK_SIZE]>, + #[cfg_attr(feature = "serialize", serde(rename = "buffer_usage"))] + len: usize, +} + +impl Buffer { + fn data(&self) -> &[u8] { + &self.data.0[..self.len] + } + + /// Consumes as much of the parameter as it can, returning the unused part. + fn consume<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { + let to_use = cmp::min(self.available(), data.len()); + let (data, remaining) = data.split_at(to_use); + self.data.0[self.len..][..to_use].copy_from_slice(data); + self.len += to_use; + remaining + } + + fn set_data(&mut self, data: &[u8]) { + debug_assert!(self.is_empty()); + debug_assert!(data.len() < CHUNK_SIZE); + self.data.0[..data.len()].copy_from_slice(data); + self.len = data.len(); + } + + fn available(&self) -> usize { + CHUNK_SIZE - self.len + } + + fn is_empty(&self) -> bool { + self.len == 0 + } + + fn is_full(&self) -> bool { + self.len == CHUNK_SIZE + } +} + +impl XxHash32 { + /// Constructs the hash with an initial seed + pub fn with_seed(seed: u32) -> XxHash32 { + XxHash32 { + total_len: 0, + seed, + core: XxCore::with_seed(seed), + buffer: Buffer::default(), + } + } + + pub(crate) fn write(&mut self, bytes: &[u8]) { + let remaining = self.maybe_consume_bytes(bytes); + if !remaining.is_empty() { + let mut remaining = UnalignedBuffer::new(remaining); + self.core.ingest_chunks(&mut remaining); + self.buffer.set_data(remaining.remaining()); + } + self.total_len += bytes.len() as u64; + } + + // Consume bytes and try to make `self.buffer` empty. + // If there are not enough bytes, `self.buffer` can be non-empty, and this + // function returns an empty slice. + fn maybe_consume_bytes<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { + if self.buffer.is_empty() { + data + } else { + let data = self.buffer.consume(data); + if self.buffer.is_full() { + let mut u32s = UnalignedBuffer::new(self.buffer.data()); + self.core.ingest_chunks(&mut u32s); + debug_assert!(u32s.remaining().is_empty()); + self.buffer.len = 0; + } + data + } + } + + pub(crate) fn finish(&self) -> u32 { + let mut hash = if self.total_len >= CHUNK_SIZE as u64 { + // We have processed at least one full chunk + self.core.finish() + } else { + self.seed.wrapping_add(PRIME_5) + }; + + hash = hash.wrapping_add(self.total_len as u32); + + let mut buffered_u32s = UnalignedBuffer::::new(self.buffer.data()); + for buffered_u32 in &mut buffered_u32s { + let k1 = buffered_u32.to_le().wrapping_mul(PRIME_3); + hash = hash.wrapping_add(k1); + hash = hash.rotate_left(17); + hash = hash.wrapping_mul(PRIME_4); + } + + let buffered_u8s = buffered_u32s.remaining(); + for &buffered_u8 in buffered_u8s { + let k1 = u32::from(buffered_u8).wrapping_mul(PRIME_5); + hash = hash.wrapping_add(k1); + hash = hash.rotate_left(11); + hash = hash.wrapping_mul(PRIME_1); + } + + // The final intermixing + hash ^= hash >> 15; + hash = hash.wrapping_mul(PRIME_2); + hash ^= hash >> 13; + hash = hash.wrapping_mul(PRIME_3); + hash ^= hash >> 16; + + hash + } + + pub fn seed(&self) -> u32 { + self.seed + } + + /// Get the total number of bytes hashed, truncated to 32 bits. + /// For the full 64-bit byte count, use `total_len_64` + pub fn total_len(&self) -> u32 { + self.total_len as u32 + } + + /// Get the total number of bytes hashed. + pub fn total_len_64(&self) -> u64 { + self.total_len + } +} + +impl Default for XxHash32 { + fn default() -> XxHash32 { + XxHash32::with_seed(0) + } +} + +impl Hasher for XxHash32 { + fn finish(&self) -> u64 { + u64::from(XxHash32::finish(self)) + } + + fn write(&mut self, bytes: &[u8]) { + XxHash32::write(self, bytes) + } +} + +#[cfg(feature = "std")] +pub use crate::std_support::thirty_two::RandomXxHashBuilder32; + +#[cfg(test)] +mod test { + use super::{RandomXxHashBuilder32, XxHash32}; + use std::collections::HashMap; + use std::hash::BuildHasherDefault; + use std::prelude::v1::*; + + #[test] + fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() { + let bytes: Vec<_> = (0..32).map(|_| 0).collect(); + + let mut byte_by_byte = XxHash32::with_seed(0); + for byte in bytes.chunks(1) { + byte_by_byte.write(byte); + } + + let mut one_chunk = XxHash32::with_seed(0); + one_chunk.write(&bytes); + + assert_eq!(byte_by_byte.core, one_chunk.core); + } + + #[test] + fn hash_of_nothing_matches_c_implementation() { + let mut hasher = XxHash32::with_seed(0); + hasher.write(&[]); + assert_eq!(hasher.finish(), 0x02cc_5d05); + } + + #[test] + fn hash_of_single_byte_matches_c_implementation() { + let mut hasher = XxHash32::with_seed(0); + hasher.write(&[42]); + assert_eq!(hasher.finish(), 0xe0fe_705f); + } + + #[test] + fn hash_of_multiple_bytes_matches_c_implementation() { + let mut hasher = XxHash32::with_seed(0); + hasher.write(b"Hello, world!\0"); + assert_eq!(hasher.finish(), 0x9e5e_7e93); + } + + #[test] + fn hash_of_multiple_chunks_matches_c_implementation() { + let bytes: Vec<_> = (0..100).collect(); + let mut hasher = XxHash32::with_seed(0); + hasher.write(&bytes); + assert_eq!(hasher.finish(), 0x7f89_ba44); + } + + #[test] + fn hash_with_different_seed_matches_c_implementation() { + let mut hasher = XxHash32::with_seed(0x42c9_1977); + hasher.write(&[]); + assert_eq!(hasher.finish(), 0xd6bf_8459); + } + + #[test] + fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() { + let bytes: Vec<_> = (0..100).collect(); + let mut hasher = XxHash32::with_seed(0x42c9_1977); + hasher.write(&bytes); + assert_eq!(hasher.finish(), 0x6d2f_6c17); + } + + #[test] + fn can_be_used_in_a_hashmap_with_a_default_seed() { + let mut hash: HashMap<_, _, BuildHasherDefault> = Default::default(); + hash.insert(42, "the answer"); + assert_eq!(hash.get(&42), Some(&"the answer")); + } + + #[test] + fn can_be_used_in_a_hashmap_with_a_random_seed() { + let mut hash: HashMap<_, _, RandomXxHashBuilder32> = Default::default(); + hash.insert(42, "the answer"); + assert_eq!(hash.get(&42), Some(&"the answer")); + } + + #[cfg(feature = "serialize")] + type TestResult = Result>; + + #[cfg(feature = "serialize")] + #[test] + fn test_serialization_cycle() -> TestResult { + let mut hasher = XxHash32::with_seed(0); + hasher.write(b"Hello, world!\0"); + hasher.finish(); + + let serialized = serde_json::to_string(&hasher)?; + let unserialized: XxHash32 = serde_json::from_str(&serialized)?; + assert_eq!(hasher, unserialized); + Ok(()) + } + + #[cfg(feature = "serialize")] + #[test] + fn test_serialization_stability() -> TestResult { + let mut hasher = XxHash32::with_seed(0); + hasher.write(b"Hello, world!\0"); + hasher.finish(); + + let serialized = r#"{ + "total_len": 14, + "seed": 0, + "core": { + "v1": 606290984, + "v2": 2246822519, + "v3": 0, + "v4": 1640531535 + }, + "buffer": [ + 72, 101, 108, 108, 111, 44, 32, 119, + 111, 114, 108, 100, 33, 0, 0, 0 + ], + "buffer_usage": 14 + }"#; + + let unserialized: XxHash32 = serde_json::from_str(serialized).unwrap(); + assert_eq!(hasher, unserialized); + Ok(()) + } + + // This test validates wraparound/truncation behavior for very large inputs + // of a 32-bit hash, but runs very slowly in the normal "cargo test" + // build config since it hashes 4.3GB of data. It runs reasonably quick + // under "cargo test --release". + /* + #[test] + fn len_overflow_32bit() { + // Hash 4.3 billion (4_300_000_000) bytes, which overflows a u32. + let bytes200: Vec = (0..200).collect(); + let mut hasher = XxHash32::with_seed(0); + for _ in 0..(4_300_000_000u64 / 200u64) { + hasher.write(&bytes200); + } + assert_eq!(hasher.total_len_64(), 0x0000_0001_004c_cb00); + assert_eq!(hasher.total_len(), 0x004c_cb00); + // retult is tested against the C implementation + assert_eq!(hasher.finish(), 0x1522_4ca7); + } + */ +} -- cgit v1.2.3