use crate::*;
use arrayref::array_ref;
use core::cmp;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub const MAX_DEGREE: usize = 4;
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
pub const MAX_DEGREE: usize = 1;

// Variants other than Portable are unreachable in no_std, unless CPU features
// are explicitly enabled for the build with e.g. RUSTFLAGS="-C
// target-feature=+avx2". This might change in the future if
// is_x86_feature_detected moves into libcore.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Platform {
    Portable,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    SSE41,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    AVX2,
}

#[derive(Clone, Copy, Debug)]
pub struct Implementation(Platform);

impl Implementation {
    pub fn detect() -> Self {
        // Try the different implementations in order of how fast/modern they
        // are. Currently on non-x86, everything just uses portable.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if let Some(avx2_impl) = Self::avx2_if_supported() {
                return avx2_impl;
            }
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if let Some(sse41_impl) = Self::sse41_if_supported() {
                return sse41_impl;
            }
        }
        Self::portable()
    }

    pub fn portable() -> Self {
        Implementation(Platform::Portable)
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[allow(unreachable_code)]
    pub fn sse41_if_supported() -> Option<Self> {
        // Check whether SSE4.1 support is assumed by the build.
        #[cfg(target_feature = "sse4.1")]
        {
            return Some(Implementation(Platform::SSE41));
        }
        // Otherwise dynamically check for support if we can.
        #[cfg(feature = "std")]
        {
            if is_x86_feature_detected!("sse4.1") {
                return Some(Implementation(Platform::SSE41));
            }
        }
        None
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[allow(unreachable_code)]
    pub fn avx2_if_supported() -> Option<Self> {
        // Check whether AVX2 support is assumed by the build.
        #[cfg(target_feature = "avx2")]
        {
            return Some(Implementation(Platform::AVX2));
        }
        // Otherwise dynamically check for support if we can.
        #[cfg(feature = "std")]
        {
            if is_x86_feature_detected!("avx2") {
                return Some(Implementation(Platform::AVX2));
            }
        }
        None
    }

    pub fn degree(&self) -> usize {
        match self.0 {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => avx2::DEGREE,
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 => sse41::DEGREE,
            Platform::Portable => 1,
        }
    }

    pub fn compress1_loop(
        &self,
        input: &[u8],
        words: &mut [Word; 8],
        count: Count,
        last_node: LastNode,
        finalize: Finalize,
        stride: Stride,
    ) {
        match self.0 {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => unsafe {
                avx2::compress1_loop(input, words, count, last_node, finalize, stride);
            },
            // Note that there's an SSE version of compress1 in the official C
            // implementation, but I haven't ported it yet.
            _ => {
                portable::compress1_loop(input, words, count, last_node, finalize, stride);
            }
        }
    }

    pub fn compress2_loop(&self, jobs: &mut [Job; 2], finalize: Finalize, stride: Stride) {
        match self.0 {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 | Platform::SSE41 => unsafe {
                sse41::compress2_loop(jobs, finalize, stride)
            },
            _ => panic!("unsupported"),
        }
    }

    pub fn compress4_loop(&self, jobs: &mut [Job; 4], finalize: Finalize, stride: Stride) {
        match self.0 {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => unsafe { avx2::compress4_loop(jobs, finalize, stride) },
            _ => panic!("unsupported"),
        }
    }
}
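// A typical caller of this module detects an implementation once and then
// batches work according to its degree. This is an illustrative sketch, not
// code from this crate; `four_jobs`, `two_jobs`, and the surrounding values
// are hypothetical:
//
//     let imp = Implementation::detect();
//     match imp.degree() {
//         4 => imp.compress4_loop(&mut four_jobs, finalize, stride),
//         2 => imp.compress2_loop(&mut two_jobs, finalize, stride),
//         _ => imp.compress1_loop(input, &mut words, count, last_node, finalize, stride),
//     }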
pub struct Job<'a, 'b> {
    pub input: &'a [u8],
    pub words: &'b mut [Word; 8],
    pub count: Count,
    pub last_node: LastNode,
}

impl<'a, 'b> core::fmt::Debug for Job<'a, 'b> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // NB: Don't print the words. Leaking them would allow length extension.
        write!(
            f,
            "Job {{ input_len: {}, count: {}, last_node: {} }}",
            self.input.len(),
            self.count,
            self.last_node.yes(),
        )
    }
}

// Finalize could just be a bool, but this is easier to read at callsites.
#[derive(Clone, Copy, Debug)]
pub enum Finalize {
    Yes,
    No,
}

impl Finalize {
    pub fn yes(&self) -> bool {
        match self {
            Finalize::Yes => true,
            Finalize::No => false,
        }
    }
}

// Like Finalize, this is easier to read at callsites.
#[derive(Clone, Copy, Debug)]
pub enum LastNode {
    Yes,
    No,
}

impl LastNode {
    pub fn yes(&self) -> bool {
        match self {
            LastNode::Yes => true,
            LastNode::No => false,
        }
    }
}

#[derive(Clone, Copy, Debug)]
pub enum Stride {
    Serial,   // BLAKE2b/BLAKE2s
    Parallel, // BLAKE2bp/BLAKE2sp
}

impl Stride {
    pub fn padded_blockbytes(&self) -> usize {
        match self {
            Stride::Serial => BLOCKBYTES,
            Stride::Parallel => blake2bp::DEGREE * BLOCKBYTES,
        }
    }
}

pub(crate) fn count_low(count: Count) -> Word {
    count as Word
}

pub(crate) fn count_high(count: Count) -> Word {
    (count >> 8 * size_of::<Word>()) as Word
}

pub(crate) fn assemble_count(low: Word, high: Word) -> Count {
    low as Count + ((high as Count) << 8 * size_of::<Word>())
}

pub(crate) fn flag_word(flag: bool) -> Word {
    if flag {
        !0
    } else {
        0
    }
}

// Pull an array reference at the given offset straight from the input, if
// there's a full block of input available. If there's only a partial block,
// copy it into the provided buffer, and return an array reference to that
// buffer. Along with the array, return the number of bytes of real input, and
// whether the input can be finalized (i.e. whether there aren't any more
// bytes after this block). Note that this is written so that the optimizer
// can elide bounds checks, see: https://godbolt.org/z/0hH2bC
pub fn final_block<'a>(
    input: &'a [u8],
    offset: usize,
    buffer: &'a mut [u8; BLOCKBYTES],
    stride: Stride,
) -> (&'a [u8; BLOCKBYTES], usize, bool) {
    let capped_offset = cmp::min(offset, input.len());
    let offset_slice = &input[capped_offset..];
    if offset_slice.len() >= BLOCKBYTES {
        let block = array_ref!(offset_slice, 0, BLOCKBYTES);
        let should_finalize = offset_slice.len() <= stride.padded_blockbytes();
        (block, BLOCKBYTES, should_finalize)
    } else {
        // Copy the final block to the front of the block buffer. The rest of
        // the buffer is assumed to be initialized to zero.
        buffer[..offset_slice.len()].copy_from_slice(offset_slice);
        (buffer, offset_slice.len(), true)
    }
}

pub fn input_debug_asserts(input: &[u8], finalize: Finalize) {
    // If we're not finalizing, the input must not be empty, and it must be an
    // even multiple of the block size.
    if !finalize.yes() {
        debug_assert!(!input.is_empty());
        debug_assert_eq!(0, input.len() % BLOCKBYTES);
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use arrayvec::ArrayVec;
    use core::mem::size_of;

    #[test]
    fn test_detection() {
        assert_eq!(Platform::Portable, Implementation::portable().0);

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "std")]
        {
            if is_x86_feature_detected!("avx2") {
                assert_eq!(Platform::AVX2, Implementation::detect().0);
                assert_eq!(
                    Platform::AVX2,
                    Implementation::avx2_if_supported().unwrap().0
                );
                assert_eq!(
                    Platform::SSE41,
                    Implementation::sse41_if_supported().unwrap().0
                );
            } else if is_x86_feature_detected!("sse4.1") {
                assert_eq!(Platform::SSE41, Implementation::detect().0);
                assert!(Implementation::avx2_if_supported().is_none());
                assert_eq!(
                    Platform::SSE41,
                    Implementation::sse41_if_supported().unwrap().0
                );
            } else {
                assert_eq!(Platform::Portable, Implementation::detect().0);
                assert!(Implementation::avx2_if_supported().is_none());
                assert!(Implementation::sse41_if_supported().is_none());
            }
        }
    }

    // TODO: Move all of these case tests into the implementation files.
    fn exercise_cases<F>(mut f: F)
    where
        F: FnMut(Stride, usize, LastNode, Finalize, Count),
    {
        // Choose counts to hit the relevant overflow cases.
        let counts = &[
            (0 as Count),
            ((1 as Count) << (8 * size_of::<Word>())) - BLOCKBYTES as Count,
            (0 as Count).wrapping_sub(BLOCKBYTES as Count),
        ];
        for &stride in &[Stride::Serial, Stride::Parallel] {
            let lengths = [
                0,
                1,
                BLOCKBYTES - 1,
                BLOCKBYTES,
                BLOCKBYTES + 1,
                2 * BLOCKBYTES - 1,
                2 * BLOCKBYTES,
                2 * BLOCKBYTES + 1,
                stride.padded_blockbytes() - 1,
                stride.padded_blockbytes(),
                stride.padded_blockbytes() + 1,
                2 * stride.padded_blockbytes() - 1,
                2 * stride.padded_blockbytes(),
                2 * stride.padded_blockbytes() + 1,
            ];
            for &length in &lengths {
                for &last_node in &[LastNode::No, LastNode::Yes] {
                    for &finalize in &[Finalize::No, Finalize::Yes] {
                        if !finalize.yes() && (length == 0 || length % BLOCKBYTES != 0) {
                            // Skip these cases, they're invalid.
                            continue;
                        }
                        for &count in counts {
                            // eprintln!("\ncase -----");
                            // dbg!(stride);
                            // dbg!(length);
                            // dbg!(last_node);
                            // dbg!(finalize);
                            // dbg!(count);
                            f(stride, length, last_node, finalize, count);
                        }
                    }
                }
            }
        }
    }

    fn initial_test_words(input_index: usize) -> [Word; 8] {
        crate::Params::new()
            .node_offset(input_index as u64)
            .to_words()
    }

    // Use the portable implementation, one block at a time, to compute the
    // final state words expected for a given test case.
    fn reference_compression(
        input: &[u8],
        stride: Stride,
        last_node: LastNode,
        finalize: Finalize,
        mut count: Count,
        input_index: usize,
    ) -> [Word; 8] {
        let mut words = initial_test_words(input_index);
        let mut offset = 0;
        while offset == 0 || offset < input.len() {
            let block_size = cmp::min(BLOCKBYTES, input.len() - offset);
            let maybe_finalize = if offset + stride.padded_blockbytes() < input.len() {
                Finalize::No
            } else {
                finalize
            };
            portable::compress1_loop(
                &input[offset..][..block_size],
                &mut words,
                count,
                last_node,
                maybe_finalize,
                Stride::Serial,
            );
            offset += stride.padded_blockbytes();
            count = count.wrapping_add(BLOCKBYTES as Count);
        }
        words
    }
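    // An illustrative sanity check (a sketch, not from the upstream test
    // suite): splitting the double-width count into low/high words and
    // reassembling it should round-trip, including across the low-word
    // overflow boundary.
    #[test]
    fn test_count_split_round_trip() {
        let cases: &[Count] = &[
            0,
            1,
            ((1 as Count) << (8 * size_of::<Word>())) - 1,
            (1 as Count) << (8 * size_of::<Word>()),
            !(0 as Count),
        ];
        for &count in cases {
            assert_eq!(count, assemble_count(count_low(count), count_high(count)));
        }
    }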
    // For various loop lengths and finalization parameters, make sure that
    // the implementation gives the same answer as the portable implementation
    // does when invoked one block at a time. (So even the portable
    // implementation itself is being tested here, to make sure its loop is
    // correct.) Note that this doesn't include any fixed test vectors; those
    // are taken from the blake2-kat.json file (copied from upstream) and
    // tested elsewhere.
    fn exercise_compress1_loop(implementation: Implementation) {
        let mut input = [0; 100 * BLOCKBYTES];
        paint_test_input(&mut input);
        exercise_cases(|stride, length, last_node, finalize, count| {
            let reference_words =
                reference_compression(&input[..length], stride, last_node, finalize, count, 0);
            let mut test_words = initial_test_words(0);
            implementation.compress1_loop(
                &input[..length],
                &mut test_words,
                count,
                last_node,
                finalize,
                stride,
            );
            assert_eq!(reference_words, test_words);
        });
    }

    #[test]
    fn test_compress1_loop_portable() {
        exercise_compress1_loop(Implementation::portable());
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn test_compress1_loop_sse41() {
        // Currently this just falls back to portable, but we test it anyway.
        if let Some(imp) = Implementation::sse41_if_supported() {
            exercise_compress1_loop(imp);
        }
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn test_compress1_loop_avx2() {
        if let Some(imp) = Implementation::avx2_if_supported() {
            exercise_compress1_loop(imp);
        }
    }

    // I use ArrayVec everywhere in here because currently these tests pass
    // under no_std. I might decide that's not worth maintaining at some point,
    // since really all we care about with no_std is that the library builds,
    // but for now it's here. Everything is keyed off of this N constant so
    // that it's easy to copy the code to exercise_compress4_loop.
    fn exercise_compress2_loop(implementation: Implementation) {
        const N: usize = 2;
        let mut input_buffer = [0; 100 * BLOCKBYTES];
        paint_test_input(&mut input_buffer);
        let mut inputs = ArrayVec::<[_; N]>::new();
        for i in 0..N {
            inputs.push(&input_buffer[i..]);
        }
        exercise_cases(|stride, length, last_node, finalize, count| {
            let mut reference_words = ArrayVec::<[_; N]>::new();
            for i in 0..N {
                let words = reference_compression(
                    &inputs[i][..length],
                    stride,
                    last_node,
                    finalize,
                    count.wrapping_add((i * BLOCKBYTES) as Count),
                    i,
                );
                reference_words.push(words);
            }
            let mut test_words = ArrayVec::<[_; N]>::new();
            for i in 0..N {
                test_words.push(initial_test_words(i));
            }
            let mut jobs = ArrayVec::<[_; N]>::new();
            for (i, words) in test_words.iter_mut().enumerate() {
                jobs.push(Job {
                    input: &inputs[i][..length],
                    words,
                    count: count.wrapping_add((i * BLOCKBYTES) as Count),
                    last_node,
                });
            }
            let mut jobs = jobs.into_inner().expect("full");
            implementation.compress2_loop(&mut jobs, finalize, stride);
            for i in 0..N {
                assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
            }
        });
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn test_compress2_loop_sse41() {
        if let Some(imp) = Implementation::sse41_if_supported() {
            exercise_compress2_loop(imp);
        }
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn test_compress2_loop_avx2() {
        // Currently this just falls back to SSE4.1, but we test it anyway.
        if let Some(imp) = Implementation::avx2_if_supported() {
            exercise_compress2_loop(imp);
        }
    }
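    // An illustrative sanity check for final_block (a sketch, not from the
    // upstream test suite): a full block is returned borrowed straight from
    // the input, while a trailing partial block is copied to the front of
    // the caller's zeroed buffer.
    #[test]
    fn test_final_block_cases() {
        let mut input = [0; 2 * BLOCKBYTES];
        paint_test_input(&mut input);

        // A full block at offset 0, with another full block after it. More
        // input remains past this block's stride, so it can't be finalized.
        let mut buffer = [0; BLOCKBYTES];
        let (block, len, can_finalize) = final_block(&input, 0, &mut buffer, Stride::Serial);
        assert_eq!(&input[..BLOCKBYTES], &block[..]);
        assert_eq!(BLOCKBYTES, len);
        assert!(!can_finalize);

        // A 1-byte partial block at the very end of the input. It gets copied
        // to the front of the buffer, the rest of which stays zero.
        let mut buffer = [0; BLOCKBYTES];
        let (block, len, can_finalize) =
            final_block(&input, 2 * BLOCKBYTES - 1, &mut buffer, Stride::Serial);
        assert_eq!(input[2 * BLOCKBYTES - 1], block[0]);
        assert!(block[1..].iter().all(|&b| b == 0));
        assert_eq!(1, len);
        assert!(can_finalize);
    }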
    // Copied from exercise_compress2_loop, with a different value of N and an
    // interior call to compress4_loop.
    fn exercise_compress4_loop(implementation: Implementation) {
        const N: usize = 4;
        let mut input_buffer = [0; 100 * BLOCKBYTES];
        paint_test_input(&mut input_buffer);
        let mut inputs = ArrayVec::<[_; N]>::new();
        for i in 0..N {
            inputs.push(&input_buffer[i..]);
        }
        exercise_cases(|stride, length, last_node, finalize, count| {
            let mut reference_words = ArrayVec::<[_; N]>::new();
            for i in 0..N {
                let words = reference_compression(
                    &inputs[i][..length],
                    stride,
                    last_node,
                    finalize,
                    count.wrapping_add((i * BLOCKBYTES) as Count),
                    i,
                );
                reference_words.push(words);
            }
            let mut test_words = ArrayVec::<[_; N]>::new();
            for i in 0..N {
                test_words.push(initial_test_words(i));
            }
            let mut jobs = ArrayVec::<[_; N]>::new();
            for (i, words) in test_words.iter_mut().enumerate() {
                jobs.push(Job {
                    input: &inputs[i][..length],
                    words,
                    count: count.wrapping_add((i * BLOCKBYTES) as Count),
                    last_node,
                });
            }
            let mut jobs = jobs.into_inner().expect("full");
            implementation.compress4_loop(&mut jobs, finalize, stride);
            for i in 0..N {
                assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
            }
        });
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn test_compress4_loop_avx2() {
        if let Some(imp) = Implementation::avx2_if_supported() {
            exercise_compress4_loop(imp);
        }
    }

    #[test]
    fn sanity_check_count_size() {
        assert_eq!(size_of::<Count>(), 2 * size_of::<Word>());
    }
}