author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
commit     2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree       b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/blake2b_simd/src/guts.rs
parent     Initial commit. (diff)
download   firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.tar.xz
           firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.zip
Adding upstream version 86.0.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/blake2b_simd/src/guts.rs')
-rw-r--r--  third_party/rust/blake2b_simd/src/guts.rs  565
1 file changed, 565 insertions, 0 deletions
diff --git a/third_party/rust/blake2b_simd/src/guts.rs b/third_party/rust/blake2b_simd/src/guts.rs
new file mode 100644
index 0000000000..9fcacf319c
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/guts.rs
@@ -0,0 +1,565 @@
+use crate::*;
+use arrayref::array_ref;
+use core::cmp;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub const MAX_DEGREE: usize = 4;
+
+#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+pub const MAX_DEGREE: usize = 1;
+
+// Variants other than Portable are unreachable in no_std, unless CPU features
+// are explicitly enabled for the build with e.g. RUSTFLAGS="-C target-feature=avx2".
+// This might change in the future if is_x86_feature_detected moves into libcore.
+#[allow(dead_code)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum Platform {
+    Portable,
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    SSE41,
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    AVX2,
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct Implementation(Platform);
+
+impl Implementation {
+    pub fn detect() -> Self {
+        // Try the different implementations in order of how fast/modern they
+        // are. Currently on non-x86, everything just uses portable.
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        {
+            if let Some(avx2_impl) = Self::avx2_if_supported() {
+                return avx2_impl;
+            }
+        }
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        {
+            if let Some(sse41_impl) = Self::sse41_if_supported() {
+                return sse41_impl;
+            }
+        }
+        Self::portable()
+    }
+
+    pub fn portable() -> Self {
+        Implementation(Platform::Portable)
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[allow(unreachable_code)]
+    pub fn sse41_if_supported() -> Option<Self> {
+        // Check whether SSE4.1 support is assumed by the build.
+        #[cfg(target_feature = "sse4.1")]
+        {
+            return Some(Implementation(Platform::SSE41));
+        }
+        // Otherwise dynamically check for support if we can.
+        #[cfg(feature = "std")]
+        {
+            if is_x86_feature_detected!("sse4.1") {
+                return Some(Implementation(Platform::SSE41));
+            }
+        }
+        None
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[allow(unreachable_code)]
+    pub fn avx2_if_supported() -> Option<Self> {
+        // Check whether AVX2 support is assumed by the build.
+        #[cfg(target_feature = "avx2")]
+        {
+            return Some(Implementation(Platform::AVX2));
+        }
+        // Otherwise dynamically check for support if we can.
+        #[cfg(feature = "std")]
+        {
+            if is_x86_feature_detected!("avx2") {
+                return Some(Implementation(Platform::AVX2));
+            }
+        }
+        None
+    }
+
+    pub fn degree(&self) -> usize {
+        match self.0 {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Platform::AVX2 => avx2::DEGREE,
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Platform::SSE41 => sse41::DEGREE,
+            Platform::Portable => 1,
+        }
+    }
+
+    pub fn compress1_loop(
+        &self,
+        input: &[u8],
+        words: &mut [Word; 8],
+        count: Count,
+        last_node: LastNode,
+        finalize: Finalize,
+        stride: Stride,
+    ) {
+        match self.0 {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Platform::AVX2 => unsafe {
+                avx2::compress1_loop(input, words, count, last_node, finalize, stride);
+            },
+            // Note that there's an SSE version of compress1 in the official C
+            // implementation, but I haven't ported it yet.
+            _ => {
+                portable::compress1_loop(input, words, count, last_node, finalize, stride);
+            }
+        }
+    }
+
+    pub fn compress2_loop(&self, jobs: &mut [Job; 2], finalize: Finalize, stride: Stride) {
+        match self.0 {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Platform::AVX2 | Platform::SSE41 => unsafe {
+                sse41::compress2_loop(jobs, finalize, stride)
+            },
+            _ => panic!("unsupported"),
+        }
+    }
+
+    pub fn compress4_loop(&self, jobs: &mut [Job; 4], finalize: Finalize, stride: Stride) {
+        match self.0 {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Platform::AVX2 => unsafe { avx2::compress4_loop(jobs, finalize, stride) },
+            _ => panic!("unsupported"),
+        }
+    }
+}
+
+pub struct Job<'a, 'b> {
+    pub input: &'a [u8],
+    pub words: &'b mut [Word; 8],
+    pub count: Count,
+    pub last_node: LastNode,
+}
+
+impl<'a, 'b> core::fmt::Debug for Job<'a, 'b> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // NB: Don't print the words. Leaking them would allow length extension.
+        write!(
+            f,
+            "Job {{ input_len: {}, count: {}, last_node: {} }}",
+            self.input.len(),
+            self.count,
+            self.last_node.yes(),
+        )
+    }
+}
+
+// Finalize could just be a bool, but this is easier to read at callsites.
+#[derive(Clone, Copy, Debug)]
+pub enum Finalize {
+    Yes,
+    No,
+}
+
+impl Finalize {
+    pub fn yes(&self) -> bool {
+        match self {
+            Finalize::Yes => true,
+            Finalize::No => false,
+        }
+    }
+}
+
+// Like Finalize, this is easier to read at callsites.
+#[derive(Clone, Copy, Debug)]
+pub enum LastNode {
+    Yes,
+    No,
+}
+
+impl LastNode {
+    pub fn yes(&self) -> bool {
+        match self {
+            LastNode::Yes => true,
+            LastNode::No => false,
+        }
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum Stride {
+    Serial,   // BLAKE2b/BLAKE2s
+    Parallel, // BLAKE2bp/BLAKE2sp
+}
+
+impl Stride {
+    pub fn padded_blockbytes(&self) -> usize {
+        match self {
+            Stride::Serial => BLOCKBYTES,
+            Stride::Parallel => blake2bp::DEGREE * BLOCKBYTES,
+        }
+    }
+}
+
+pub(crate) fn count_low(count: Count) -> Word {
+    count as Word
+}
+
+pub(crate) fn count_high(count: Count) -> Word {
+    (count >> 8 * size_of::<Word>()) as Word
+}
+
+pub(crate) fn assemble_count(low: Word, high: Word) -> Count {
+    low as Count + ((high as Count) << 8 * size_of::<Word>())
+}
+
+pub(crate) fn flag_word(flag: bool) -> Word {
+    if flag {
+        !0
+    } else {
+        0
+    }
+}
+
+// Pull an array reference at the given offset straight from the input, if
+// there's a full block of input available. If there's only a partial block,
+// copy it into the provided buffer, and return an array reference to that.
+// Along with the array, return the number of bytes of real input, and whether
+// the input can be finalized (i.e. whether there aren't any more bytes after
+// this block). Note that this is written so that the optimizer can elide
+// bounds checks, see: https://godbolt.org/z/0hH2bC
+pub fn final_block<'a>(
+    input: &'a [u8],
+    offset: usize,
+    buffer: &'a mut [u8; BLOCKBYTES],
+    stride: Stride,
+) -> (&'a [u8; BLOCKBYTES], usize, bool) {
+    let capped_offset = cmp::min(offset, input.len());
+    let offset_slice = &input[capped_offset..];
+    if offset_slice.len() >= BLOCKBYTES {
+        let block = array_ref!(offset_slice, 0, BLOCKBYTES);
+        let should_finalize = offset_slice.len() <= stride.padded_blockbytes();
+        (block, BLOCKBYTES, should_finalize)
+    } else {
+        // Copy the final block to the front of the block buffer. The rest of
+        // the buffer is assumed to be initialized to zero.
+        buffer[..offset_slice.len()].copy_from_slice(offset_slice);
+        (buffer, offset_slice.len(), true)
+    }
+}
+
+pub fn input_debug_asserts(input: &[u8], finalize: Finalize) {
+    // If we're not finalizing, the input must not be empty, and it must be an
+    // even multiple of the block size.
+    if !finalize.yes() {
+        debug_assert!(!input.is_empty());
+        debug_assert_eq!(0, input.len() % BLOCKBYTES);
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use arrayvec::ArrayVec;
+    use core::mem::size_of;
+
+    #[test]
+    fn test_detection() {
+        assert_eq!(Platform::Portable, Implementation::portable().0);
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        #[cfg(feature = "std")]
+        {
+            if is_x86_feature_detected!("avx2") {
+                assert_eq!(Platform::AVX2, Implementation::detect().0);
+                assert_eq!(
+                    Platform::AVX2,
+                    Implementation::avx2_if_supported().unwrap().0
+                );
+                assert_eq!(
+                    Platform::SSE41,
+                    Implementation::sse41_if_supported().unwrap().0
+                );
+            } else if is_x86_feature_detected!("sse4.1") {
+                assert_eq!(Platform::SSE41, Implementation::detect().0);
+                assert!(Implementation::avx2_if_supported().is_none());
+                assert_eq!(
+                    Platform::SSE41,
+                    Implementation::sse41_if_supported().unwrap().0
+                );
+            } else {
+                assert_eq!(Platform::Portable, Implementation::detect().0);
+                assert!(Implementation::avx2_if_supported().is_none());
+                assert!(Implementation::sse41_if_supported().is_none());
+            }
+        }
+    }
+
+    // TODO: Move all of these case tests into the implementation files.
+    fn exercise_cases<F>(mut f: F)
+    where
+        F: FnMut(Stride, usize, LastNode, Finalize, Count),
+    {
+        // Choose counts to hit the relevant overflow cases.
+        let counts = &[
+            (0 as Count),
+            ((1 as Count) << (8 * size_of::<Word>())) - BLOCKBYTES as Count,
+            (0 as Count).wrapping_sub(BLOCKBYTES as Count),
+        ];
+        for &stride in &[Stride::Serial, Stride::Parallel] {
+            let lengths = [
+                0,
+                1,
+                BLOCKBYTES - 1,
+                BLOCKBYTES,
+                BLOCKBYTES + 1,
+                2 * BLOCKBYTES - 1,
+                2 * BLOCKBYTES,
+                2 * BLOCKBYTES + 1,
+                stride.padded_blockbytes() - 1,
+                stride.padded_blockbytes(),
+                stride.padded_blockbytes() + 1,
+                2 * stride.padded_blockbytes() - 1,
+                2 * stride.padded_blockbytes(),
+                2 * stride.padded_blockbytes() + 1,
+            ];
+            for &length in &lengths {
+                for &last_node in &[LastNode::No, LastNode::Yes] {
+                    for &finalize in &[Finalize::No, Finalize::Yes] {
+                        if !finalize.yes() && (length == 0 || length % BLOCKBYTES != 0) {
+                            // Skip these cases, they're invalid.
+                            continue;
+                        }
+                        for &count in counts {
+                            // eprintln!("\ncase -----");
+                            // dbg!(stride);
+                            // dbg!(length);
+                            // dbg!(last_node);
+                            // dbg!(finalize);
+                            // dbg!(count);
+
+                            f(stride, length, last_node, finalize, count);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    fn initial_test_words(input_index: usize) -> [Word; 8] {
+        crate::Params::new()
+            .node_offset(input_index as u64)
+            .to_words()
+    }
+
+    // Use the portable implementation, one block at a time, to compute the
+    // final state words expected for a given test case.
+    fn reference_compression(
+        input: &[u8],
+        stride: Stride,
+        last_node: LastNode,
+        finalize: Finalize,
+        mut count: Count,
+        input_index: usize,
+    ) -> [Word; 8] {
+        let mut words = initial_test_words(input_index);
+        let mut offset = 0;
+        while offset == 0 || offset < input.len() {
+            let block_size = cmp::min(BLOCKBYTES, input.len() - offset);
+            let maybe_finalize = if offset + stride.padded_blockbytes() < input.len() {
+                Finalize::No
+            } else {
+                finalize
+            };
+            portable::compress1_loop(
+                &input[offset..][..block_size],
+                &mut words,
+                count,
+                last_node,
+                maybe_finalize,
+                Stride::Serial,
+            );
+            offset += stride.padded_blockbytes();
+            count = count.wrapping_add(BLOCKBYTES as Count);
+        }
+        words
+    }
+
+    // For various loop lengths and finalization parameters, make sure that the
+    // implementation gives the same answer as the portable implementation does
+    // when invoked one block at a time. (So even the portable implementation
+    // itself is being tested here, to make sure its loop is correct.) Note
+    // that this doesn't include any fixed test vectors; those are taken from
+    // the blake2-kat.json file (copied from upstream) and tested elsewhere.
+    fn exercise_compress1_loop(implementation: Implementation) {
+        let mut input = [0; 100 * BLOCKBYTES];
+        paint_test_input(&mut input);
+
+        exercise_cases(|stride, length, last_node, finalize, count| {
+            let reference_words =
+                reference_compression(&input[..length], stride, last_node, finalize, count, 0);
+
+            let mut test_words = initial_test_words(0);
+            implementation.compress1_loop(
+                &input[..length],
+                &mut test_words,
+                count,
+                last_node,
+                finalize,
+                stride,
+            );
+            assert_eq!(reference_words, test_words);
+        });
+    }
+
+    #[test]
+    fn test_compress1_loop_portable() {
+        exercise_compress1_loop(Implementation::portable());
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress1_loop_sse41() {
+        // Currently this just falls back to portable, but we test it anyway.
+        if let Some(imp) = Implementation::sse41_if_supported() {
+            exercise_compress1_loop(imp);
+        }
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress1_loop_avx2() {
+        if let Some(imp) = Implementation::avx2_if_supported() {
+            exercise_compress1_loop(imp);
+        }
+    }
+
+    // I use ArrayVec everywhere in here because currently these tests pass
+    // under no_std. I might decide that's not worth maintaining at some point,
+    // since really all we care about with no_std is that the library builds,
+    // but for now it's here. Everything is keyed off of this N constant so
+    // that it's easy to copy the code to exercise_compress4_loop.
+    fn exercise_compress2_loop(implementation: Implementation) {
+        const N: usize = 2;
+
+        let mut input_buffer = [0; 100 * BLOCKBYTES];
+        paint_test_input(&mut input_buffer);
+        let mut inputs = ArrayVec::<[_; N]>::new();
+        for i in 0..N {
+            inputs.push(&input_buffer[i..]);
+        }
+
+        exercise_cases(|stride, length, last_node, finalize, count| {
+            let mut reference_words = ArrayVec::<[_; N]>::new();
+            for i in 0..N {
+                let words = reference_compression(
+                    &inputs[i][..length],
+                    stride,
+                    last_node,
+                    finalize,
+                    count.wrapping_add((i * BLOCKBYTES) as Count),
+                    i,
+                );
+                reference_words.push(words);
+            }
+
+            let mut test_words = ArrayVec::<[_; N]>::new();
+            for i in 0..N {
+                test_words.push(initial_test_words(i));
+            }
+            let mut jobs = ArrayVec::<[_; N]>::new();
+            for (i, words) in test_words.iter_mut().enumerate() {
+                jobs.push(Job {
+                    input: &inputs[i][..length],
+                    words,
+                    count: count.wrapping_add((i * BLOCKBYTES) as Count),
+                    last_node,
+                });
+            }
+            let mut jobs = jobs.into_inner().expect("full");
+            implementation.compress2_loop(&mut jobs, finalize, stride);
+
+            for i in 0..N {
+                assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
+            }
+        });
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress2_loop_sse41() {
+        if let Some(imp) = Implementation::sse41_if_supported() {
+            exercise_compress2_loop(imp);
+        }
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress2_loop_avx2() {
+        // Currently this just falls back to SSE4.1, but we test it anyway.
+        if let Some(imp) = Implementation::avx2_if_supported() {
+            exercise_compress2_loop(imp);
+        }
+    }
+
+    // Copied from exercise_compress2_loop, with a different value of N and an
+    // interior call to compress4_loop.
+    fn exercise_compress4_loop(implementation: Implementation) {
+        const N: usize = 4;
+
+        let mut input_buffer = [0; 100 * BLOCKBYTES];
+        paint_test_input(&mut input_buffer);
+        let mut inputs = ArrayVec::<[_; N]>::new();
+        for i in 0..N {
+            inputs.push(&input_buffer[i..]);
+        }
+
+        exercise_cases(|stride, length, last_node, finalize, count| {
+            let mut reference_words = ArrayVec::<[_; N]>::new();
+            for i in 0..N {
+                let words = reference_compression(
+                    &inputs[i][..length],
+                    stride,
+                    last_node,
+                    finalize,
+                    count.wrapping_add((i * BLOCKBYTES) as Count),
+                    i,
+                );
+                reference_words.push(words);
+            }
+
+            let mut test_words = ArrayVec::<[_; N]>::new();
+            for i in 0..N {
+                test_words.push(initial_test_words(i));
+            }
+            let mut jobs = ArrayVec::<[_; N]>::new();
+            for (i, words) in test_words.iter_mut().enumerate() {
+                jobs.push(Job {
+                    input: &inputs[i][..length],
+                    words,
+                    count: count.wrapping_add((i * BLOCKBYTES) as Count),
+                    last_node,
+                });
+            }
+            let mut jobs = jobs.into_inner().expect("full");
+            implementation.compress4_loop(&mut jobs, finalize, stride);
+
+            for i in 0..N {
+                assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
+            }
+        });
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress4_loop_avx2() {
+        if let Some(imp) = Implementation::avx2_if_supported() {
+            exercise_compress4_loop(imp);
+        }
+    }
+
+    #[test]
+    fn sanity_check_count_size() {
+        assert_eq!(size_of::<Count>(), 2 * size_of::<Word>());
+    }
+}
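
A few illustrative sketches follow. First, Implementation::detect prefers the widest SIMD backend available at runtime: AVX2, then SSE4.1, then the portable fallback. Below is a minimal standalone sketch of that same detection order, using std's is_x86_feature_detected! macro; the Platform enum and detect function here are illustrative stand-ins, not the crate's internals:

```rust
// A minimal sketch of the detection order used above: prefer AVX2, then
// SSE4.1, then a portable fallback. Names are illustrative only.
#[allow(dead_code)] // Sse41/Avx2 are unused on non-x86 targets.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Platform {
    Portable,
    Sse41,
    Avx2,
}

fn detect() -> Platform {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        // Runtime checks require std; a no_std build would fall through to
        // Portable unless target features are enabled at compile time.
        if is_x86_feature_detected!("avx2") {
            return Platform::Avx2;
        }
        if is_x86_feature_detected!("sse4.1") {
            return Platform::Sse41;
        }
    }
    Platform::Portable
}

fn main() {
    println!("detected platform: {:?}", detect());
}
```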
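
The count_low, count_high, and assemble_count helpers split the byte counter across two words, matching the two counter inputs of the BLAKE2 compression function. A self-contained sketch of the same arithmetic, assuming Word = u64 and Count = u128 (the BLAKE2b sizes, consistent with the sanity_check_count_size test above):

```rust
use core::mem::size_of;

type Word = u64;
type Count = u128;

fn count_low(count: Count) -> Word {
    count as Word // truncating cast keeps the low 64 bits
}

fn count_high(count: Count) -> Word {
    (count >> (8 * size_of::<Word>())) as Word // shift down the high 64 bits
}

fn assemble_count(low: Word, high: Word) -> Count {
    low as Count + ((high as Count) << (8 * size_of::<Word>()))
}

fn main() {
    let count: Count = (1 << 70) + 42; // high word = 64, low word = 42
    assert_eq!(count_low(count), 42);
    assert_eq!(count_high(count), 64);
    assert_eq!(assemble_count(count_low(count), count_high(count)), count);
    println!("counter round-trip ok");
}
```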
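
final_block borrows a full block straight from the input when one is available and copies only when the last block is partial, arranged so the optimizer can elide bounds checks. A simplified sketch of that buffering idea, with a hypothetical 16-byte block size and the stride/finalize bookkeeping dropped:

```rust
use core::convert::TryInto;

const BLOCKBYTES: usize = 16; // illustrative block size, not BLAKE2b's 128

// Return a full block borrowed from `input` at `offset` when possible;
// otherwise copy the partial tail into the zeroed `buffer` and return that.
fn final_block<'a>(
    input: &'a [u8],
    offset: usize,
    buffer: &'a mut [u8; BLOCKBYTES],
) -> (&'a [u8; BLOCKBYTES], usize) {
    let tail = &input[offset.min(input.len())..];
    if tail.len() >= BLOCKBYTES {
        // Full block available: borrow it directly, no copy.
        let block: &[u8; BLOCKBYTES] = (&tail[..BLOCKBYTES]).try_into().unwrap();
        (block, BLOCKBYTES)
    } else {
        // Partial block: copy to the front of the zero-initialized buffer.
        buffer[..tail.len()].copy_from_slice(tail);
        (buffer, tail.len())
    }
}

fn main() {
    let input = b"abcdefghijklmnopqrs"; // 19 bytes: one full block plus 3
    let mut buffer = [0u8; BLOCKBYTES];
    let (block, len) = final_block(input, 16, &mut buffer);
    assert_eq!(len, 3);
    assert_eq!(&block[..3], b"qrs");
    assert_eq!(&block[3..], &[0u8; 13]); // zero padding came from the buffer
}
```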
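
guts.rs itself is internal; callers reach these compression loops through the crate's public hashing API. A hedged usage sketch, assuming the published blake2b_simd crate as a dependency (its blake2b convenience function and Params builder):

```rust
use blake2b_simd::{blake2b, Params};

fn main() {
    // One-shot hashing via the convenience function.
    let hash = blake2b(b"hello world");
    println!("blake2b: {}", hash.to_hex());

    // Keyed, truncated hashing via the Params builder.
    let keyed = Params::new()
        .hash_length(32)
        .key(b"my secret key")
        .hash(b"hello world");
    println!("keyed:   {}", keyed.to_hex());
}
```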
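
Finally, compress2_loop and compress4_loop exist so that several independent inputs can be hashed together across SIMD lanes. If I recall the published crate's API correctly, this is exposed through its many module; a sketch under that assumption:

```rust
use blake2b_simd::many::{hash_many, HashManyJob};
use blake2b_simd::Params;

fn main() {
    let params = Params::new();
    let inputs: [&[u8]; 4] = [b"foo", b"bar", b"baz", b"qux"];

    // One job per input; each job borrows the shared params.
    let mut jobs: Vec<HashManyJob> = inputs
        .iter()
        .map(|input| HashManyJob::new(&params, input))
        .collect();

    // Hashes all jobs together, using as many SIMD lanes as are available.
    hash_many(jobs.iter_mut());

    for (input, job) in inputs.iter().zip(jobs.iter()) {
        println!(
            "{} => {}",
            String::from_utf8_lossy(input),
            job.to_hash().to_hex()
        );
    }
}
```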