use crate::UnalignedBuffer; use core::{cmp, hash::Hasher}; #[cfg(feature = "serialize")] use serde::{Deserialize, Serialize}; const CHUNK_SIZE: usize = 16; pub const PRIME_1: u32 = 2_654_435_761; pub const PRIME_2: u32 = 2_246_822_519; pub const PRIME_3: u32 = 3_266_489_917; pub const PRIME_4: u32 = 668_265_263; pub const PRIME_5: u32 = 374_761_393; #[cfg_attr(feature = "serialize", derive(Deserialize, Serialize))] #[derive(Copy, Clone, PartialEq)] struct XxCore { v1: u32, v2: u32, v3: u32, v4: u32, } /// Calculates the 32-bit hash. Care should be taken when using this /// hash. /// /// Although this struct implements `Hasher`, it only calculates a /// 32-bit number, leaving the upper bits as 0. This means it is /// unlikely to be correct to use this in places like a `HashMap`. #[cfg_attr(feature = "serialize", derive(Deserialize, Serialize))] #[derive(Debug, Copy, Clone, PartialEq)] pub struct XxHash32 { total_len: u64, seed: u32, core: XxCore, #[cfg_attr(feature = "serialize", serde(flatten))] buffer: Buffer, } impl XxCore { fn with_seed(seed: u32) -> XxCore { XxCore { v1: seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2), v2: seed.wrapping_add(PRIME_2), v3: seed, v4: seed.wrapping_sub(PRIME_1), } } #[inline(always)] fn ingest_chunks(&mut self, values: I) where I: IntoIterator, { #[inline(always)] fn ingest_one_number(mut current_value: u32, mut value: u32) -> u32 { value = value.wrapping_mul(PRIME_2); current_value = current_value.wrapping_add(value); current_value = current_value.rotate_left(13); current_value.wrapping_mul(PRIME_1) } // By drawing these out, we can avoid going back and forth to // memory. It only really helps for large files, when we need // to iterate multiple times here. let mut v1 = self.v1; let mut v2 = self.v2; let mut v3 = self.v3; let mut v4 = self.v4; for [n1, n2, n3, n4] in values { v1 = ingest_one_number(v1, n1.to_le()); v2 = ingest_one_number(v2, n2.to_le()); v3 = ingest_one_number(v3, n3.to_le()); v4 = ingest_one_number(v4, n4.to_le()); } self.v1 = v1; self.v2 = v2; self.v3 = v3; self.v4 = v4; } #[inline(always)] fn finish(&self) -> u32 { // The original code pulls out local vars for v[1234] // here. Performance tests did not show that to be effective // here, presumably because this method is not called in a // tight loop. #[allow(unknown_lints, clippy::needless_late_init)] // keeping things parallel let mut hash; hash = self.v1.rotate_left(1); hash = hash.wrapping_add(self.v2.rotate_left(7)); hash = hash.wrapping_add(self.v3.rotate_left(12)); hash = hash.wrapping_add(self.v4.rotate_left(18)); hash } } impl core::fmt::Debug for XxCore { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { write!( f, "XxCore {{ {:016x} {:016x} {:016x} {:016x} }}", self.v1, self.v2, self.v3, self.v4 ) } } #[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))] #[derive(Debug, Copy, Clone, Default, PartialEq)] #[repr(align(4))] #[cfg_attr(feature = "serialize", serde(transparent))] struct AlignToU32(T); #[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))] #[derive(Debug, Copy, Clone, Default, PartialEq)] struct Buffer { #[cfg_attr(feature = "serialize", serde(rename = "buffer"))] data: AlignToU32<[u8; CHUNK_SIZE]>, #[cfg_attr(feature = "serialize", serde(rename = "buffer_usage"))] len: usize, } impl Buffer { fn data(&self) -> &[u8] { &self.data.0[..self.len] } /// Consumes as much of the parameter as it can, returning the unused part. fn consume<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { let to_use = cmp::min(self.available(), data.len()); let (data, remaining) = data.split_at(to_use); self.data.0[self.len..][..to_use].copy_from_slice(data); self.len += to_use; remaining } fn set_data(&mut self, data: &[u8]) { debug_assert!(self.is_empty()); debug_assert!(data.len() < CHUNK_SIZE); self.data.0[..data.len()].copy_from_slice(data); self.len = data.len(); } fn available(&self) -> usize { CHUNK_SIZE - self.len } fn is_empty(&self) -> bool { self.len == 0 } fn is_full(&self) -> bool { self.len == CHUNK_SIZE } } impl XxHash32 { /// Constructs the hash with an initial seed pub fn with_seed(seed: u32) -> XxHash32 { XxHash32 { total_len: 0, seed, core: XxCore::with_seed(seed), buffer: Buffer::default(), } } pub(crate) fn write(&mut self, bytes: &[u8]) { let remaining = self.maybe_consume_bytes(bytes); if !remaining.is_empty() { let mut remaining = UnalignedBuffer::new(remaining); self.core.ingest_chunks(&mut remaining); self.buffer.set_data(remaining.remaining()); } self.total_len += bytes.len() as u64; } // Consume bytes and try to make `self.buffer` empty. // If there are not enough bytes, `self.buffer` can be non-empty, and this // function returns an empty slice. fn maybe_consume_bytes<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { if self.buffer.is_empty() { data } else { let data = self.buffer.consume(data); if self.buffer.is_full() { let mut u32s = UnalignedBuffer::new(self.buffer.data()); self.core.ingest_chunks(&mut u32s); debug_assert!(u32s.remaining().is_empty()); self.buffer.len = 0; } data } } pub(crate) fn finish(&self) -> u32 { let mut hash = if self.total_len >= CHUNK_SIZE as u64 { // We have processed at least one full chunk self.core.finish() } else { self.seed.wrapping_add(PRIME_5) }; hash = hash.wrapping_add(self.total_len as u32); let mut buffered_u32s = UnalignedBuffer::::new(self.buffer.data()); for buffered_u32 in &mut buffered_u32s { let k1 = buffered_u32.to_le().wrapping_mul(PRIME_3); hash = hash.wrapping_add(k1); hash = hash.rotate_left(17); hash = hash.wrapping_mul(PRIME_4); } let buffered_u8s = buffered_u32s.remaining(); for &buffered_u8 in buffered_u8s { let k1 = u32::from(buffered_u8).wrapping_mul(PRIME_5); hash = hash.wrapping_add(k1); hash = hash.rotate_left(11); hash = hash.wrapping_mul(PRIME_1); } // The final intermixing hash ^= hash >> 15; hash = hash.wrapping_mul(PRIME_2); hash ^= hash >> 13; hash = hash.wrapping_mul(PRIME_3); hash ^= hash >> 16; hash } pub fn seed(&self) -> u32 { self.seed } /// Get the total number of bytes hashed, truncated to 32 bits. /// For the full 64-bit byte count, use `total_len_64` pub fn total_len(&self) -> u32 { self.total_len as u32 } /// Get the total number of bytes hashed. pub fn total_len_64(&self) -> u64 { self.total_len } } impl Default for XxHash32 { fn default() -> XxHash32 { XxHash32::with_seed(0) } } impl Hasher for XxHash32 { fn finish(&self) -> u64 { u64::from(XxHash32::finish(self)) } fn write(&mut self, bytes: &[u8]) { XxHash32::write(self, bytes) } } #[cfg(feature = "std")] pub use crate::std_support::thirty_two::RandomXxHashBuilder32; #[cfg(test)] mod test { use super::{RandomXxHashBuilder32, XxHash32}; use std::collections::HashMap; use std::hash::BuildHasherDefault; use std::prelude::v1::*; #[test] fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() { let bytes: Vec<_> = (0..32).map(|_| 0).collect(); let mut byte_by_byte = XxHash32::with_seed(0); for byte in bytes.chunks(1) { byte_by_byte.write(byte); } let mut one_chunk = XxHash32::with_seed(0); one_chunk.write(&bytes); assert_eq!(byte_by_byte.core, one_chunk.core); } #[test] fn hash_of_nothing_matches_c_implementation() { let mut hasher = XxHash32::with_seed(0); hasher.write(&[]); assert_eq!(hasher.finish(), 0x02cc_5d05); } #[test] fn hash_of_single_byte_matches_c_implementation() { let mut hasher = XxHash32::with_seed(0); hasher.write(&[42]); assert_eq!(hasher.finish(), 0xe0fe_705f); } #[test] fn hash_of_multiple_bytes_matches_c_implementation() { let mut hasher = XxHash32::with_seed(0); hasher.write(b"Hello, world!\0"); assert_eq!(hasher.finish(), 0x9e5e_7e93); } #[test] fn hash_of_multiple_chunks_matches_c_implementation() { let bytes: Vec<_> = (0..100).collect(); let mut hasher = XxHash32::with_seed(0); hasher.write(&bytes); assert_eq!(hasher.finish(), 0x7f89_ba44); } #[test] fn hash_with_different_seed_matches_c_implementation() { let mut hasher = XxHash32::with_seed(0x42c9_1977); hasher.write(&[]); assert_eq!(hasher.finish(), 0xd6bf_8459); } #[test] fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() { let bytes: Vec<_> = (0..100).collect(); let mut hasher = XxHash32::with_seed(0x42c9_1977); hasher.write(&bytes); assert_eq!(hasher.finish(), 0x6d2f_6c17); } #[test] fn can_be_used_in_a_hashmap_with_a_default_seed() { let mut hash: HashMap<_, _, BuildHasherDefault> = Default::default(); hash.insert(42, "the answer"); assert_eq!(hash.get(&42), Some(&"the answer")); } #[test] fn can_be_used_in_a_hashmap_with_a_random_seed() { let mut hash: HashMap<_, _, RandomXxHashBuilder32> = Default::default(); hash.insert(42, "the answer"); assert_eq!(hash.get(&42), Some(&"the answer")); } #[cfg(feature = "serialize")] type TestResult = Result>; #[cfg(feature = "serialize")] #[test] fn test_serialization_cycle() -> TestResult { let mut hasher = XxHash32::with_seed(0); hasher.write(b"Hello, world!\0"); hasher.finish(); let serialized = serde_json::to_string(&hasher)?; let unserialized: XxHash32 = serde_json::from_str(&serialized)?; assert_eq!(hasher, unserialized); Ok(()) } #[cfg(feature = "serialize")] #[test] fn test_serialization_stability() -> TestResult { let mut hasher = XxHash32::with_seed(0); hasher.write(b"Hello, world!\0"); hasher.finish(); let serialized = r#"{ "total_len": 14, "seed": 0, "core": { "v1": 606290984, "v2": 2246822519, "v3": 0, "v4": 1640531535 }, "buffer": [ 72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 33, 0, 0, 0 ], "buffer_usage": 14 }"#; let unserialized: XxHash32 = serde_json::from_str(serialized).unwrap(); assert_eq!(hasher, unserialized); Ok(()) } // This test validates wraparound/truncation behavior for very large inputs // of a 32-bit hash, but runs very slowly in the normal "cargo test" // build config since it hashes 4.3GB of data. It runs reasonably quick // under "cargo test --release". /* #[test] fn len_overflow_32bit() { // Hash 4.3 billion (4_300_000_000) bytes, which overflows a u32. let bytes200: Vec = (0..200).collect(); let mut hasher = XxHash32::with_seed(0); for _ in 0..(4_300_000_000u64 / 200u64) { hasher.write(&bytes200); } assert_eq!(hasher.total_len_64(), 0x0000_0001_004c_cb00); assert_eq!(hasher.total_len(), 0x004c_cb00); // retult is tested against the C implementation assert_eq!(hasher.finish(), 0x1522_4ca7); } */ }