Diffstat (limited to 'third_party/rust/blake2b_simd')
-rw-r--r-- | third_party/rust/blake2b_simd/.cargo-checksum.json | 1
-rw-r--r-- | third_party/rust/blake2b_simd/Cargo.toml | 35
-rw-r--r-- | third_party/rust/blake2b_simd/README.md | 42
-rw-r--r-- | third_party/rust/blake2b_simd/src/avx2.rs | 928
-rw-r--r-- | third_party/rust/blake2b_simd/src/blake2bp.rs | 570
-rw-r--r-- | third_party/rust/blake2b_simd/src/guts.rs | 565
-rw-r--r-- | third_party/rust/blake2b_simd/src/lib.rs | 674
-rw-r--r-- | third_party/rust/blake2b_simd/src/many.rs | 529
-rw-r--r-- | third_party/rust/blake2b_simd/src/portable.rs | 166
-rw-r--r-- | third_party/rust/blake2b_simd/src/sse41.rs | 454
-rw-r--r-- | third_party/rust/blake2b_simd/src/test.rs | 201 |
11 files changed, 4165 insertions, 0 deletions
diff --git a/third_party/rust/blake2b_simd/.cargo-checksum.json b/third_party/rust/blake2b_simd/.cargo-checksum.json new file mode 100644 index 0000000000..bbeeada8d6 --- /dev/null +++ b/third_party/rust/blake2b_simd/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.toml":"648c10063fa1a16a961df45f194f50982bdf3d41d04586a48d2cc6d69e0252c1","README.md":"2253eba78d5af06642073c5dfd41253fb8be73d3a0e823bc3d7642c9d0ad0c6c","src/avx2.rs":"a97ec761e4e7f70ff6311f4c1e67cb5136ac66cfc51bc49525b81f9e23814d81","src/blake2bp.rs":"83577d4a22db3b92030d9bd4563aa9ad440f23c64a6ad5f10a9d709f22d50589","src/guts.rs":"1189cab87b18eaaf2abd5bcb3d7d799c75401a312cee6f1f65fdaad30203eb6f","src/lib.rs":"67723a3abc30dc7f3d488f434ced884b5ce962a807991c8f1cc9940df869c342","src/many.rs":"60d07e4d7ad63949fb5432ad05f7c6a525a3eee39d325f7d4e65e901b466be95","src/portable.rs":"c47baa15b311bc95d49f3d189111fe45756fb7d623a1f48f0050ae591817aedf","src/sse41.rs":"7a644b1056b804ada9ddc7586552a4a5c769e576d610ffe7ec74065f7eaff491","src/test.rs":"1685eec6fedc30fca1332cbb78c85e6c9b56eca962b6c6343c91ba69eefac754"},"package":"b83b7baab1e671718d78204225800d6b170e648188ac7dc992e9d6bddf87d0c0"}
\ No newline at end of file diff --git a/third_party/rust/blake2b_simd/Cargo.toml b/third_party/rust/blake2b_simd/Cargo.toml new file mode 100644 index 0000000000..ca92a4cc2d --- /dev/null +++ b/third_party/rust/blake2b_simd/Cargo.toml @@ -0,0 +1,35 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +edition = "2018" +name = "blake2b_simd" +version = "0.5.9" +authors = ["Jack O'Connor"] +description = "a pure Rust BLAKE2b implementation with dynamic SIMD" +documentation = "https://docs.rs/blake2b_simd" +readme = "README.md" +license = "MIT" +repository = "https://github.com/oconnor663/blake2_simd" +[dependencies.arrayref] +version = "0.3.5" + +[dependencies.arrayvec] +version = "0.5.0" +default-features = false + +[dependencies.constant_time_eq] +version = "0.1.3" + +[features] +default = ["std"] +std = [] diff --git a/third_party/rust/blake2b_simd/README.md b/third_party/rust/blake2b_simd/README.md new file mode 100644 index 0000000000..54caef739d --- /dev/null +++ b/third_party/rust/blake2b_simd/README.md @@ -0,0 +1,42 @@ +# blake2b_simd [![GitHub](https://img.shields.io/github/tag/oconnor663/blake2_simd.svg?label=GitHub)](https://github.com/oconnor663/blake2_simd) [![crates.io](https://img.shields.io/crates/v/blake2b_simd.svg)](https://crates.io/crates/blake2b_simd) [![Build Status](https://travis-ci.org/oconnor663/blake2_simd.svg?branch=master)](https://travis-ci.org/oconnor663/blake2_simd) + +An implementation of the BLAKE2b and BLAKE2bp hash functions. See also +[`blake2s_simd`](../blake2s). + +This crate includes: + +- 100% stable Rust. +- SIMD implementations based on Samuel Neves' [`blake2-avx2`](https://github.com/sneves/blake2-avx2). + These are very fast. For benchmarks, see [the Performance section of the + README](https://github.com/oconnor663/blake2_simd#performance). +- Portable, safe implementations for other platforms. +- Dynamic CPU feature detection. Binaries include multiple implementations by default and + choose the fastest one the processor supports at runtime. +- All the features from [the BLAKE2 spec](https://blake2.net/blake2.pdf), like adjustable + length, keying, and associated data for tree hashing. +- `no_std` support. The `std` Cargo feature is on by default, for CPU feature detection and + for implementing `std::io::Write`. +- Support for computing multiple BLAKE2b hashes in parallel, matching the efficiency of + BLAKE2bp. See the [`many`](many/index.html) module. + +# Example + +``` +use blake2b_simd::{blake2b, Params}; + +let expected = "ca002330e69d3e6b84a46a56a6533fd79d51d97a3bb7cad6c2ff43b354185d6d\ + c1e723fb3db4ae0737e120378424c714bb982d9dc5bbd7a0ab318240ddd18f8d"; +let hash = blake2b(b"foo"); +assert_eq!(expected, &hash.to_hex()); + +let hash = Params::new() + .hash_length(16) + .key(b"The Magic Words are Squeamish Ossifrage") + .personal(b"L. P. 
Waterhouse") + .to_state() + .update(b"foo") + .update(b"bar") + .update(b"baz") + .finalize(); +assert_eq!("ee8ff4e9be887297cf79348dc35dab56", &hash.to_hex()); +``` diff --git a/third_party/rust/blake2b_simd/src/avx2.rs b/third_party/rust/blake2b_simd/src/avx2.rs new file mode 100644 index 0000000000..268dd82d36 --- /dev/null +++ b/third_party/rust/blake2b_simd/src/avx2.rs @@ -0,0 +1,928 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::guts::{ + assemble_count, count_high, count_low, final_block, flag_word, input_debug_asserts, Finalize, + Job, LastNode, Stride, +}; +use crate::{Count, Word, BLOCKBYTES, IV, SIGMA}; +use arrayref::{array_refs, mut_array_refs}; +use core::cmp; +use core::mem; + +pub const DEGREE: usize = 4; + +#[inline(always)] +unsafe fn loadu(src: *const [Word; DEGREE]) -> __m256i { + // This is an unaligned load, so the pointer cast is allowed. + _mm256_loadu_si256(src as *const __m256i) +} + +#[inline(always)] +unsafe fn storeu(src: __m256i, dest: *mut [Word; DEGREE]) { + // This is an unaligned store, so the pointer cast is allowed. + _mm256_storeu_si256(dest as *mut __m256i, src) +} + +#[inline(always)] +unsafe fn loadu_128(mem_addr: &[u8; 16]) -> __m128i { + _mm_loadu_si128(mem_addr.as_ptr() as *const __m128i) +} + +#[inline(always)] +unsafe fn add(a: __m256i, b: __m256i) -> __m256i { + _mm256_add_epi64(a, b) +} + +#[inline(always)] +unsafe fn eq(a: __m256i, b: __m256i) -> __m256i { + _mm256_cmpeq_epi64(a, b) +} + +#[inline(always)] +unsafe fn and(a: __m256i, b: __m256i) -> __m256i { + _mm256_and_si256(a, b) +} + +#[inline(always)] +unsafe fn negate_and(a: __m256i, b: __m256i) -> __m256i { + // Note that "and not" implies the reverse of the actual arg order. + _mm256_andnot_si256(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { + _mm256_xor_si256(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u64) -> __m256i { + _mm256_set1_epi64x(x as i64) +} + +#[inline(always)] +unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i { + _mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64) +} + +// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. +macro_rules! 
_MM_SHUFFLE { + ($z:expr, $y:expr, $x:expr, $w:expr) => { + ($z << 6) | ($y << 4) | ($x << 2) | $w + }; +} + +#[inline(always)] +unsafe fn rot32(x: __m256i) -> __m256i { + _mm256_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1)) +} + +#[inline(always)] +unsafe fn rot24(x: __m256i) -> __m256i { + let rotate24 = _mm256_setr_epi8( + 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, + 14, 15, 8, 9, 10, + ); + _mm256_shuffle_epi8(x, rotate24) +} + +#[inline(always)] +unsafe fn rot16(x: __m256i) -> __m256i { + let rotate16 = _mm256_setr_epi8( + 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, + 13, 14, 15, 8, 9, + ); + _mm256_shuffle_epi8(x, rotate16) +} + +#[inline(always)] +unsafe fn rot63(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi64(x, 63), add(x, x)) +} + +#[inline(always)] +unsafe fn g1(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) { + *a = add(*a, *m); + *a = add(*a, *b); + *d = xor(*d, *a); + *d = rot32(*d); + *c = add(*c, *d); + *b = xor(*b, *c); + *b = rot24(*b); +} + +#[inline(always)] +unsafe fn g2(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) { + *a = add(*a, *m); + *a = add(*a, *b); + *d = xor(*d, *a); + *d = rot16(*d); + *c = add(*c, *d); + *b = xor(*b, *c); + *b = rot63(*b); +} + +// Note the optimization here of leaving b as the unrotated row, rather than a. +// All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) { + *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3)); + *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2)); + *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1)); +} + +#[inline(always)] +unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) { + *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1)); + *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2)); + *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3)); +} + +#[inline(always)] +unsafe fn compress_block( + block: &[u8; BLOCKBYTES], + words: &mut [Word; 8], + count: Count, + last_block: Word, + last_node: Word, +) { + let (words_low, words_high) = mut_array_refs!(words, DEGREE, DEGREE); + let (iv_low, iv_high) = array_refs!(&IV, DEGREE, DEGREE); + let mut a = loadu(words_low); + let mut b = loadu(words_high); + let mut c = loadu(iv_low); + let flags = set4(count_low(count), count_high(count), last_block, last_node); + let mut d = xor(loadu(iv_high), flags); + + let msg_chunks = array_refs!(block, 16, 16, 16, 16, 16, 16, 16, 16); + let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0)); + let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1)); + let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2)); + let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3)); + let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4)); + let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5)); + let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6)); + let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7)); + + let iv0 = a; + let iv1 = b; + let mut t0; + let mut t1; + let mut b0; + + // round 1 + t0 = _mm256_unpacklo_epi64(m0, m1); + t1 = _mm256_unpacklo_epi64(m2, m3); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, 
&mut b0); + t0 = _mm256_unpackhi_epi64(m0, m1); + t1 = _mm256_unpackhi_epi64(m2, m3); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpacklo_epi64(m7, m4); + t1 = _mm256_unpacklo_epi64(m5, m6); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m7, m4); + t1 = _mm256_unpackhi_epi64(m5, m6); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + // round 2 + t0 = _mm256_unpacklo_epi64(m7, m2); + t1 = _mm256_unpackhi_epi64(m4, m6); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m5, m4); + t1 = _mm256_alignr_epi8(m3, m7, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpackhi_epi64(m2, m0); + t1 = _mm256_blend_epi32(m5, m0, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_alignr_epi8(m6, m1, 8); + t1 = _mm256_blend_epi32(m3, m1, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + // round 3 + t0 = _mm256_alignr_epi8(m6, m5, 8); + t1 = _mm256_unpackhi_epi64(m2, m7); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m4, m0); + t1 = _mm256_blend_epi32(m6, m1, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_alignr_epi8(m5, m4, 8); + t1 = _mm256_unpackhi_epi64(m1, m3); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m2, m7); + t1 = _mm256_blend_epi32(m0, m3, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + // round 4 + t0 = _mm256_unpackhi_epi64(m3, m1); + t1 = _mm256_unpackhi_epi64(m6, m5); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m4, m0); + t1 = _mm256_unpacklo_epi64(m6, m7); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_alignr_epi8(m1, m7, 8); + t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2)); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m4, m3); + t1 = _mm256_unpacklo_epi64(m5, m0); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + // round 5 + t0 = _mm256_unpackhi_epi64(m4, m2); + t1 = _mm256_unpacklo_epi64(m1, m5); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_blend_epi32(m3, m0, 0x33); + t1 = _mm256_blend_epi32(m7, m2, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_alignr_epi8(m7, m1, 8); + t1 = _mm256_alignr_epi8(m3, m5, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m6, m0); + t1 = 
_mm256_unpacklo_epi64(m6, m4); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + // round 6 + t0 = _mm256_unpacklo_epi64(m1, m3); + t1 = _mm256_unpacklo_epi64(m0, m4); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m6, m5); + t1 = _mm256_unpackhi_epi64(m5, m1); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_alignr_epi8(m2, m0, 8); + t1 = _mm256_unpackhi_epi64(m3, m7); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m4, m6); + t1 = _mm256_alignr_epi8(m7, m2, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + // round 7 + t0 = _mm256_blend_epi32(m0, m6, 0x33); + t1 = _mm256_unpacklo_epi64(m7, m2); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m2, m7); + t1 = _mm256_alignr_epi8(m5, m6, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpacklo_epi64(m4, m0); + t1 = _mm256_blend_epi32(m4, m3, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m5, m3); + t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2)); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + // round 8 + t0 = _mm256_unpackhi_epi64(m6, m3); + t1 = _mm256_blend_epi32(m1, m6, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_alignr_epi8(m7, m5, 8); + t1 = _mm256_unpackhi_epi64(m0, m4); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_blend_epi32(m2, m1, 0x33); + t1 = _mm256_alignr_epi8(m4, m7, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m5, m0); + t1 = _mm256_unpacklo_epi64(m2, m3); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + // round 9 + t0 = _mm256_unpacklo_epi64(m3, m7); + t1 = _mm256_alignr_epi8(m0, m5, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m7, m4); + t1 = _mm256_alignr_epi8(m4, m1, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpacklo_epi64(m5, m6); + t1 = _mm256_unpackhi_epi64(m6, m0); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_alignr_epi8(m1, m2, 8); + t1 = _mm256_alignr_epi8(m2, m3, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + // round 10 + t0 = _mm256_unpacklo_epi64(m5, m4); + t1 = _mm256_unpackhi_epi64(m3, m0); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m1, m2); + t1 = _mm256_blend_epi32(m2, m3, 0x33); + b0 = 
_mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpackhi_epi64(m6, m7); + t1 = _mm256_unpackhi_epi64(m4, m1); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_blend_epi32(m5, m0, 0x33); + t1 = _mm256_unpacklo_epi64(m7, m6); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + // round 11 + t0 = _mm256_unpacklo_epi64(m0, m1); + t1 = _mm256_unpacklo_epi64(m2, m3); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m0, m1); + t1 = _mm256_unpackhi_epi64(m2, m3); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpacklo_epi64(m7, m4); + t1 = _mm256_unpacklo_epi64(m5, m6); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m7, m4); + t1 = _mm256_unpackhi_epi64(m5, m6); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + // round 12 + t0 = _mm256_unpacklo_epi64(m7, m2); + t1 = _mm256_unpackhi_epi64(m4, m6); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m5, m4); + t1 = _mm256_alignr_epi8(m3, m7, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpackhi_epi64(m2, m0); + t1 = _mm256_blend_epi32(m5, m0, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_alignr_epi8(m6, m1, 8); + t1 = _mm256_blend_epi32(m3, m1, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + + a = xor(a, c); + b = xor(b, d); + a = xor(a, iv0); + b = xor(b, iv1); + + storeu(a, words_low); + storeu(b, words_high); +} + +#[target_feature(enable = "avx2")] +pub unsafe fn compress1_loop( + input: &[u8], + words: &mut [Word; 8], + mut count: Count, + last_node: LastNode, + finalize: Finalize, + stride: Stride, +) { + input_debug_asserts(input, finalize); + + let mut local_words = *words; + + let mut fin_offset = input.len().saturating_sub(1); + fin_offset -= fin_offset % stride.padded_blockbytes(); + let mut buf = [0; BLOCKBYTES]; + let (fin_block, fin_len, _) = final_block(input, fin_offset, &mut buf, stride); + let fin_last_block = flag_word(finalize.yes()); + let fin_last_node = flag_word(finalize.yes() && last_node.yes()); + + let mut offset = 0; + loop { + let block; + let count_delta; + let last_block; + let last_node; + if offset == fin_offset { + block = fin_block; + count_delta = fin_len; + last_block = fin_last_block; + last_node = fin_last_node; + } else { + // This unsafe cast avoids bounds checks. There's guaranteed to be + // enough input because `offset < fin_offset`. + block = &*(input.as_ptr().add(offset) as *const [u8; BLOCKBYTES]); + count_delta = BLOCKBYTES; + last_block = flag_word(false); + last_node = flag_word(false); + }; + + count = count.wrapping_add(count_delta as Count); + compress_block(block, &mut local_words, count, last_block, last_node); + + // Check for termination before bumping the offset, to avoid overflow. 
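+ // (Concretely: `fin_offset` can be as large as `input.len() - 1`, so adding
+ // `stride.padded_blockbytes()` before the equality test could wrap `offset`
+ // past `usize::MAX` on a maximal-length input.)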
+ if offset == fin_offset { + break; + } + + offset += stride.padded_blockbytes(); + } + + *words = local_words; +} + +// Performance note: Factoring out a G function here doesn't hurt performance, +// unlike in the case of BLAKE2s where it hurts substantially. In fact, on my +// machine, it helps a tiny bit. But the difference is tiny, so I'm going to +// stick to the approach used by https://github.com/sneves/blake2-avx2 +// until/unless I can be sure the (tiny) improvement is consistent across +// different Intel microarchitectures. Smaller code size is nice, but a +// divergence between the BLAKE2b and BLAKE2s implementations is less nice. +#[inline(always)] +unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { + v[0] = add(v[0], m[SIGMA[r][0] as usize]); + v[1] = add(v[1], m[SIGMA[r][2] as usize]); + v[2] = add(v[2], m[SIGMA[r][4] as usize]); + v[3] = add(v[3], m[SIGMA[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot32(v[12]); + v[13] = rot32(v[13]); + v[14] = rot32(v[14]); + v[15] = rot32(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot24(v[4]); + v[5] = rot24(v[5]); + v[6] = rot24(v[6]); + v[7] = rot24(v[7]); + v[0] = add(v[0], m[SIGMA[r][1] as usize]); + v[1] = add(v[1], m[SIGMA[r][3] as usize]); + v[2] = add(v[2], m[SIGMA[r][5] as usize]); + v[3] = add(v[3], m[SIGMA[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot63(v[4]); + v[5] = rot63(v[5]); + v[6] = rot63(v[6]); + v[7] = rot63(v[7]); + + v[0] = add(v[0], m[SIGMA[r][8] as usize]); + v[1] = add(v[1], m[SIGMA[r][10] as usize]); + v[2] = add(v[2], m[SIGMA[r][12] as usize]); + v[3] = add(v[3], m[SIGMA[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot32(v[15]); + v[12] = rot32(v[12]); + v[13] = rot32(v[13]); + v[14] = rot32(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot24(v[5]); + v[6] = rot24(v[6]); + v[7] = rot24(v[7]); + v[4] = rot24(v[4]); + v[0] = add(v[0], m[SIGMA[r][9] as usize]); + v[1] = add(v[1], m[SIGMA[r][11] as usize]); + v[2] = add(v[2], m[SIGMA[r][13] as usize]); + v[3] = add(v[3], m[SIGMA[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = 
rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot63(v[5]); + v[6] = rot63(v[6]); + v[7] = rot63(v[7]); + v[4] = rot63(v[4]); +} + +// We'd rather make this a regular function with #[inline(always)], but for +// some reason that blows up compile times by about 10 seconds, at least in +// some cases (BLAKE2b avx2.rs). This macro seems to get the same performance +// result, without the compile time issue. +macro_rules! compress4_transposed { + ( + $h_vecs:expr, + $msg_vecs:expr, + $count_low:expr, + $count_high:expr, + $lastblock:expr, + $lastnode:expr, + ) => { + let h_vecs: &mut [__m256i; 8] = $h_vecs; + let msg_vecs: &[__m256i; 16] = $msg_vecs; + let count_low: __m256i = $count_low; + let count_high: __m256i = $count_high; + let lastblock: __m256i = $lastblock; + let lastnode: __m256i = $lastnode; + + let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + xor(set1(IV[4]), count_low), + xor(set1(IV[5]), count_high), + xor(set1(IV[6]), lastblock), + xor(set1(IV[7]), lastnode), + ]; + + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + round(&mut v, &msg_vecs, 7); + round(&mut v, &msg_vecs, 8); + round(&mut v, &msg_vecs, 9); + round(&mut v, &msg_vecs, 10); + round(&mut v, &msg_vecs, 11); + + h_vecs[0] = xor(xor(h_vecs[0], v[0]), v[8]); + h_vecs[1] = xor(xor(h_vecs[1], v[1]), v[9]); + h_vecs[2] = xor(xor(h_vecs[2], v[2]), v[10]); + h_vecs[3] = xor(xor(h_vecs[3], v[3]), v[11]); + h_vecs[4] = xor(xor(h_vecs[4], v[4]), v[12]); + h_vecs[5] = xor(xor(h_vecs[5], v[5]), v[13]); + h_vecs[6] = xor(xor(h_vecs[6], v[6]), v[14]); + h_vecs[7] = xor(xor(h_vecs[7], v[7]), v[15]); + }; +} + +#[inline(always)] +unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) { + ( + _mm256_permute2x128_si256(a, b, 0x20), + _mm256_permute2x128_si256(a, b, 0x31), + ) +} + +// There are several ways to do a transposition. We could do it naively, with 8 separate +// _mm256_set_epi64x instructions, referencing each of the 64 words explicitly. Or we could copy +// the vecs into contiguous storage and then use gather instructions. A third approach is to use +// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the +// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_4` in the +// https://github.com/oconnor663/bao_experiments repo. +#[inline(always)] +unsafe fn transpose_vecs( + vec_a: __m256i, + vec_b: __m256i, + vec_c: __m256i, + vec_d: __m256i, +) -> [__m256i; DEGREE] { + // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is 11/33. + let ab_02 = _mm256_unpacklo_epi64(vec_a, vec_b); + let ab_13 = _mm256_unpackhi_epi64(vec_a, vec_b); + let cd_02 = _mm256_unpacklo_epi64(vec_c, vec_d); + let cd_13 = _mm256_unpackhi_epi64(vec_c, vec_d); + + // Interleave 128-bit lanes. 
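+ // For example, with rows a = [a0 a1 a2 a3] through d = [d0 d1 d2 d3], the
+ // unpacks above give ab_02 = [a0 b0 a2 b2] and cd_02 = [c0 d0 c2 d2], and the
+ // 128-bit interleave below yields abcd_0 = [a0 b0 c0 d0] and so on: a full
+ // 4x4 transpose.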
+ let (abcd_0, abcd_2) = interleave128(ab_02, cd_02); + let (abcd_1, abcd_3) = interleave128(ab_13, cd_13); + + [abcd_0, abcd_1, abcd_2, abcd_3] +} + +#[inline(always)] +unsafe fn transpose_state_vecs(jobs: &[Job; DEGREE]) -> [__m256i; 8] { + // Load all the state words into transposed vectors, where the first vector + // has the first word of each state, etc. Transposing once at the beginning + // and once at the end is more efficient than repeating it for each block. + let words0 = array_refs!(&jobs[0].words, DEGREE, DEGREE); + let words1 = array_refs!(&jobs[1].words, DEGREE, DEGREE); + let words2 = array_refs!(&jobs[2].words, DEGREE, DEGREE); + let words3 = array_refs!(&jobs[3].words, DEGREE, DEGREE); + let [h0, h1, h2, h3] = transpose_vecs( + loadu(words0.0), + loadu(words1.0), + loadu(words2.0), + loadu(words3.0), + ); + let [h4, h5, h6, h7] = transpose_vecs( + loadu(words0.1), + loadu(words1.1), + loadu(words2.1), + loadu(words3.1), + ); + [h0, h1, h2, h3, h4, h5, h6, h7] +} + +#[inline(always)] +unsafe fn untranspose_state_vecs(h_vecs: &[__m256i; 8], jobs: &mut [Job; DEGREE]) { + // Un-transpose the updated state vectors back into the caller's arrays. + let [job0, job1, job2, job3] = jobs; + let words0 = mut_array_refs!(&mut job0.words, DEGREE, DEGREE); + let words1 = mut_array_refs!(&mut job1.words, DEGREE, DEGREE); + let words2 = mut_array_refs!(&mut job2.words, DEGREE, DEGREE); + let words3 = mut_array_refs!(&mut job3.words, DEGREE, DEGREE); + let out = transpose_vecs(h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3]); + storeu(out[0], words0.0); + storeu(out[1], words1.0); + storeu(out[2], words2.0); + storeu(out[3], words3.0); + let out = transpose_vecs(h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7]); + storeu(out[0], words0.1); + storeu(out[1], words1.1); + storeu(out[2], words2.1); + storeu(out[3], words3.1); +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(blocks: [*const [u8; BLOCKBYTES]; DEGREE]) -> [__m256i; 16] { + // These input arrays have no particular alignment, so we use unaligned + // loads to read from them. 
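+ // Each 128-byte block is viewed as four consecutive [Word; DEGREE] quarters,
+ // so `blockN.add(j)` below addresses the j-th 32-byte quarter of message
+ // block N.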
+ let block0 = blocks[0] as *const [Word; DEGREE]; + let block1 = blocks[1] as *const [Word; DEGREE]; + let block2 = blocks[2] as *const [Word; DEGREE]; + let block3 = blocks[3] as *const [Word; DEGREE]; + let [m0, m1, m2, m3] = transpose_vecs( + loadu(block0.add(0)), + loadu(block1.add(0)), + loadu(block2.add(0)), + loadu(block3.add(0)), + ); + let [m4, m5, m6, m7] = transpose_vecs( + loadu(block0.add(1)), + loadu(block1.add(1)), + loadu(block2.add(1)), + loadu(block3.add(1)), + ); + let [m8, m9, m10, m11] = transpose_vecs( + loadu(block0.add(2)), + loadu(block1.add(2)), + loadu(block2.add(2)), + loadu(block3.add(2)), + ); + let [m12, m13, m14, m15] = transpose_vecs( + loadu(block0.add(3)), + loadu(block1.add(3)), + loadu(block2.add(3)), + loadu(block3.add(3)), + ); + [ + m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, + ] +} + +#[inline(always)] +unsafe fn load_counts(jobs: &[Job; DEGREE]) -> (__m256i, __m256i) { + ( + set4( + count_low(jobs[0].count), + count_low(jobs[1].count), + count_low(jobs[2].count), + count_low(jobs[3].count), + ), + set4( + count_high(jobs[0].count), + count_high(jobs[1].count), + count_high(jobs[2].count), + count_high(jobs[3].count), + ), + ) +} + +#[inline(always)] +unsafe fn store_counts(jobs: &mut [Job; DEGREE], low: __m256i, high: __m256i) { + let low_ints: [Word; DEGREE] = mem::transmute(low); + let high_ints: [Word; DEGREE] = mem::transmute(high); + for i in 0..DEGREE { + jobs[i].count = assemble_count(low_ints[i], high_ints[i]); + } +} + +#[inline(always)] +unsafe fn add_to_counts(lo: &mut __m256i, hi: &mut __m256i, delta: __m256i) { + // If the low counts reach zero, that means they wrapped, unless the delta + // was also zero. + *lo = add(*lo, delta); + let lo_reached_zero = eq(*lo, set1(0)); + let delta_was_zero = eq(delta, set1(0)); + let hi_inc = and(set1(1), negate_and(delta_was_zero, lo_reached_zero)); + *hi = add(*hi, hi_inc); +} + +#[inline(always)] +unsafe fn flags_vec(flags: [bool; DEGREE]) -> __m256i { + set4( + flag_word(flags[0]), + flag_word(flags[1]), + flag_word(flags[2]), + flag_word(flags[3]), + ) +} + +#[target_feature(enable = "avx2")] +pub unsafe fn compress4_loop(jobs: &mut [Job; DEGREE], finalize: Finalize, stride: Stride) { + // If we're not finalizing, there can't be a partial block at the end. + for job in jobs.iter() { + input_debug_asserts(job.input, finalize); + } + + let msg_ptrs = [ + jobs[0].input.as_ptr(), + jobs[1].input.as_ptr(), + jobs[2].input.as_ptr(), + jobs[3].input.as_ptr(), + ]; + let mut h_vecs = transpose_state_vecs(&jobs); + let (mut counts_lo, mut counts_hi) = load_counts(&jobs); + + // Prepare the final blocks (note: these could be empty, if the input is + // empty). Do all this before entering the main loop. + let min_len = jobs.iter().map(|job| job.input.len()).min().unwrap(); + let mut fin_offset = min_len.saturating_sub(1); + fin_offset -= fin_offset % stride.padded_blockbytes(); + // Performance note: making these buffers mem::uninitialized() seems to + // cause problems in the optimizer. 
+ let mut buf0: [u8; BLOCKBYTES] = [0; BLOCKBYTES]; + let mut buf1: [u8; BLOCKBYTES] = [0; BLOCKBYTES]; + let mut buf2: [u8; BLOCKBYTES] = [0; BLOCKBYTES]; + let mut buf3: [u8; BLOCKBYTES] = [0; BLOCKBYTES]; + let (block0, len0, finalize0) = final_block(jobs[0].input, fin_offset, &mut buf0, stride); + let (block1, len1, finalize1) = final_block(jobs[1].input, fin_offset, &mut buf1, stride); + let (block2, len2, finalize2) = final_block(jobs[2].input, fin_offset, &mut buf2, stride); + let (block3, len3, finalize3) = final_block(jobs[3].input, fin_offset, &mut buf3, stride); + let fin_blocks: [*const [u8; BLOCKBYTES]; DEGREE] = [block0, block1, block2, block3]; + let fin_counts_delta = set4(len0 as Word, len1 as Word, len2 as Word, len3 as Word); + let fin_last_block; + let fin_last_node; + if finalize.yes() { + fin_last_block = flags_vec([finalize0, finalize1, finalize2, finalize3]); + fin_last_node = flags_vec([ + finalize0 && jobs[0].last_node.yes(), + finalize1 && jobs[1].last_node.yes(), + finalize2 && jobs[2].last_node.yes(), + finalize3 && jobs[3].last_node.yes(), + ]); + } else { + fin_last_block = set1(0); + fin_last_node = set1(0); + } + + // The main loop. + let mut offset = 0; + loop { + let blocks; + let counts_delta; + let last_block; + let last_node; + if offset == fin_offset { + blocks = fin_blocks; + counts_delta = fin_counts_delta; + last_block = fin_last_block; + last_node = fin_last_node; + } else { + blocks = [ + msg_ptrs[0].add(offset) as *const [u8; BLOCKBYTES], + msg_ptrs[1].add(offset) as *const [u8; BLOCKBYTES], + msg_ptrs[2].add(offset) as *const [u8; BLOCKBYTES], + msg_ptrs[3].add(offset) as *const [u8; BLOCKBYTES], + ]; + counts_delta = set1(BLOCKBYTES as Word); + last_block = set1(0); + last_node = set1(0); + }; + + let m_vecs = transpose_msg_vecs(blocks); + add_to_counts(&mut counts_lo, &mut counts_hi, counts_delta); + compress4_transposed!( + &mut h_vecs, + &m_vecs, + counts_lo, + counts_hi, + last_block, + last_node, + ); + + // Check for termination before bumping the offset, to avoid overflow. + if offset == fin_offset { + break; + } + + offset += stride.padded_blockbytes(); + } + + // Write out the results. + untranspose_state_vecs(&h_vecs, &mut *jobs); + store_counts(&mut *jobs, counts_lo, counts_hi); + let max_consumed = offset.saturating_add(stride.padded_blockbytes()); + for job in jobs.iter_mut() { + let consumed = cmp::min(max_consumed, job.input.len()); + job.input = &job.input[consumed..]; + } +} diff --git a/third_party/rust/blake2b_simd/src/blake2bp.rs b/third_party/rust/blake2b_simd/src/blake2bp.rs new file mode 100644 index 0000000000..7bdfad64f6 --- /dev/null +++ b/third_party/rust/blake2b_simd/src/blake2bp.rs @@ -0,0 +1,570 @@ +//! BLAKE2bp, a variant of BLAKE2b that uses SIMD more efficiently. +//! +//! The AVX2 implementation of BLAKE2bp is about twice as fast as that of BLAKE2b. +//! However, note that it's a different hash function, and it gives a different +//! hash from BLAKE2b for the same input. +//! +//! # Example +//! +//! ``` +//! use blake2b_simd::blake2bp; +//! +//! let hash = blake2bp::Params::new() +//! .hash_length(16) +//! .key(b"The Magic Words are Squeamish Ossifrage") +//! .to_state() +//! .update(b"foo") +//! .update(b"bar") +//! .update(b"baz") +//! .finalize(); +//! assert_eq!("e69c7d2c42a5ac14948772231c68c552", &hash.to_hex()); +//! 
``` + +use crate::guts::{Finalize, Implementation, Job, LastNode, Stride}; +use crate::many; +use crate::Count; +use crate::Hash; +use crate::Word; +use crate::BLOCKBYTES; +use crate::KEYBYTES; +use crate::OUTBYTES; +use core::cmp; +use core::fmt; +use core::mem::size_of; + +#[cfg(feature = "std")] +use std; + +pub(crate) const DEGREE: usize = 4; + +/// Compute the BLAKE2bp hash of a slice of bytes all at once, using default +/// parameters. +/// +/// # Example +/// +/// ``` +/// # use blake2b_simd::blake2bp::blake2bp; +/// let expected = "8ca9ccee7946afcb686fe7556628b5ba1bf9a691da37ca58cd049354d99f3704\ +/// 2c007427e5f219b9ab5063707ec6823872dee413ee014b4d02f2ebb6abb5f643"; +/// let hash = blake2bp(b"foo"); +/// assert_eq!(expected, &hash.to_hex()); +/// ``` +pub fn blake2bp(input: &[u8]) -> Hash { + Params::new().hash(input) +} + +/// A parameter builder for BLAKE2bp, just like the [`Params`](../struct.Params.html) type for +/// BLAKE2b. +/// +/// This builder only supports configuring the hash length and a secret key. This matches the +/// options provided by the [reference +/// implementation](https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2.h#L162-L165). +/// +/// # Example +/// +/// ``` +/// use blake2b_simd::blake2bp; +/// let mut state = blake2bp::Params::new().hash_length(32).to_state(); +/// ``` +#[derive(Clone)] +pub struct Params { + hash_length: u8, + key_length: u8, + key: [u8; KEYBYTES], + implementation: Implementation, +} + +impl Params { + /// Equivalent to `Params::default()`. + pub fn new() -> Self { + Self { + hash_length: OUTBYTES as u8, + key_length: 0, + key: [0; KEYBYTES], + implementation: Implementation::detect(), + } + } + + fn to_words(&self) -> ([[Word; 8]; DEGREE], [Word; 8]) { + let mut base_params = crate::Params::new(); + base_params + .hash_length(self.hash_length as usize) + .key(&self.key[..self.key_length as usize]) + .fanout(DEGREE as u8) + .max_depth(2) + .max_leaf_length(0) + // Note that inner_hash_length is always OUTBYTES, regardless of the hash_length + // parameter. This isn't documented in the spec, but it matches the behavior of the + // reference implementation: https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2bp-ref.c#L55 + .inner_hash_length(OUTBYTES); + let leaf_words = |worker_index| { + base_params + .clone() + .node_offset(worker_index) + .node_depth(0) + // Note that setting the last_node flag here has no effect, + // because it isn't included in the state words. + .to_words() + }; + let leaf_words = [leaf_words(0), leaf_words(1), leaf_words(2), leaf_words(3)]; + let root_words = base_params + .clone() + .node_offset(0) + .node_depth(1) + // Note that setting the last_node flag here has no effect, because + // it isn't included in the state words. Also note that because + // we're only preserving its state words, the root node won't hash + // any key bytes. + .to_words(); + (leaf_words, root_words) + } + + /// Hash an input all at once with these parameters. + pub fn hash(&self, input: &[u8]) -> Hash { + // If there's a key, just fall back to using the State. + if self.key_length > 0 { + return self.to_state().update(input).finalize(); + } + let (mut leaf_words, mut root_words) = self.to_words(); + // Hash each leaf in parallel. 
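+ // With Stride::Parallel, leaf i consumes the blocks at byte offsets
+ // i * BLOCKBYTES, i * BLOCKBYTES + DEGREE * BLOCKBYTES, and so on: the
+ // round-robin block distribution that BLAKE2bp specifies.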
+ let jobs = leaf_words.iter_mut().enumerate().map(|(i, words)| { + let input_start = cmp::min(input.len(), i * BLOCKBYTES); + Job { + input: &input[input_start..], + words, + count: 0, + last_node: if i == DEGREE - 1 { + LastNode::Yes + } else { + LastNode::No + }, + } + }); + many::compress_many(jobs, self.implementation, Finalize::Yes, Stride::Parallel); + // Hash each leaf into the root. + finalize_root_words( + &leaf_words, + &mut root_words, + self.hash_length, + self.implementation, + ) + } + + /// Construct a BLAKE2bp `State` object based on these parameters. + pub fn to_state(&self) -> State { + State::with_params(self) + } + + /// Set the length of the final hash, from 1 to `OUTBYTES` (64). Apart from controlling the + /// length of the final `Hash`, this is also associated data, and changing it will result in a + /// totally different hash. + pub fn hash_length(&mut self, length: usize) -> &mut Self { + assert!( + 1 <= length && length <= OUTBYTES, + "Bad hash length: {}", + length + ); + self.hash_length = length as u8; + self + } + + /// Use a secret key, so that BLAKE2bp acts as a MAC. The maximum key length is `KEYBYTES` + /// (64). An empty key is equivalent to having no key at all. + pub fn key(&mut self, key: &[u8]) -> &mut Self { + assert!(key.len() <= KEYBYTES, "Bad key length: {}", key.len()); + self.key_length = key.len() as u8; + self.key = [0; KEYBYTES]; + self.key[..key.len()].copy_from_slice(key); + self + } +} + +impl Default for Params { + fn default() -> Self { + Self::new() + } +} + +impl fmt::Debug for Params { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "Params {{ hash_length: {}, key_length: {} }}", + self.hash_length, + // NB: Don't print the key itself. Debug shouldn't leak secrets. + self.key_length, + ) + } +} + +/// An incremental hasher for BLAKE2bp, just like the [`State`](../struct.State.html) type for +/// BLAKE2b. +/// +/// # Example +/// +/// ``` +/// use blake2b_simd::blake2bp; +/// +/// let mut state = blake2bp::State::new(); +/// state.update(b"foo"); +/// state.update(b"bar"); +/// let hash = state.finalize(); +/// +/// let expected = "e654427b6ef02949471712263e59071abbb6aa94855674c1daeed6cfaf127c33\ +/// dfa3205f7f7f71e4f0673d25fa82a368488911f446bccd323af3ab03f53e56e5"; +/// assert_eq!(expected, &hash.to_hex()); +/// ``` +#[derive(Clone)] +pub struct State { + leaf_words: [[Word; 8]; DEGREE], + root_words: [Word; 8], + // Note that this buffer is twice as large as what compress4 needs. That guarantees that we + // have enough input when we compress to know we don't need to finalize any of the leaves. + buf: [u8; 2 * DEGREE * BLOCKBYTES], + buf_len: u16, + // Note that this is the *per-leaf* count. + count: Count, + hash_length: u8, + implementation: Implementation, + is_keyed: bool, +} + +impl State { + /// Equivalent to `State::default()` or `Params::default().to_state()`. + pub fn new() -> Self { + Self::with_params(&Params::default()) + } + + fn with_params(params: &Params) -> Self { + let (leaf_words, root_words) = params.to_words(); + + // If a key is set, initialize the buffer to contain the key bytes. Note + // that only the leaves hash key bytes. The root doesn't, even though + // the key length is still set in its parameters. 
Again this isn't + // documented in the spec, but it matches the behavior of the reference + // implementation: + // https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2bp-ref.c#L128 + // This particular behavior (though not the inner hash length behavior + // above) is also corroborated by the official test vectors; see + // tests/vector_tests.rs. + let mut buf = [0; 2 * DEGREE * BLOCKBYTES]; + let mut buf_len = 0; + if params.key_length > 0 { + for i in 0..DEGREE { + let keybytes = &params.key[..params.key_length as usize]; + buf[i * BLOCKBYTES..][..keybytes.len()].copy_from_slice(keybytes); + buf_len = BLOCKBYTES * DEGREE; + } + } + + Self { + leaf_words, + root_words, + buf, + buf_len: buf_len as u16, + count: 0, // count gets updated in compress_to_leaves() + hash_length: params.hash_length, + implementation: params.implementation, + is_keyed: params.key_length > 0, + } + } + + fn fill_buf(&mut self, input: &mut &[u8]) { + let take = cmp::min(self.buf.len() - self.buf_len as usize, input.len()); + self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]); + self.buf_len += take as u16; + *input = &input[take..]; + } + + fn compress_to_leaves( + leaves: &mut [[Word; 8]; DEGREE], + input: &[u8], + count: &mut Count, + implementation: Implementation, + ) { + // Input is assumed to be a whole number of blocks for each leaf. Since + // we're not finalizing, debug asserts will fire otherwise. + let jobs = leaves.iter_mut().enumerate().map(|(i, words)| { + Job { + input: &input[i * BLOCKBYTES..], + words, + count: *count, + last_node: LastNode::No, // irrelevant when not finalizing + } + }); + many::compress_many(jobs, implementation, Finalize::No, Stride::Parallel); + // Note that count is the bytes input *per-leaf*. + *count = count.wrapping_add((input.len() / DEGREE) as Count); + } + + /// Add input to the hash. You can call `update` any number of times. + pub fn update(&mut self, mut input: &[u8]) -> &mut Self { + // If we have a partial buffer, try to complete it. If we complete it and there's more + // input waiting, we need to compress to make more room. However, because we need to be + // sure that *none* of the leaves would need to be finalized as part of this round of + // compression, we need to buffer more than we would for BLAKE2b. + if self.buf_len > 0 { + self.fill_buf(&mut input); + // The buffer is large enough for two compressions. If we've filled + // the buffer and there's still more input coming, then we have to + // do at least one compression. If there's enough input still + // coming that all the leaves are guaranteed to get more, do both + // compressions in the buffer. Otherwise, do just one and shift the + // back half of the buffer to the front. + if !input.is_empty() { + if input.len() > (DEGREE - 1) * BLOCKBYTES { + // Enough input coming to do both compressions. + Self::compress_to_leaves( + &mut self.leaf_words, + &self.buf, + &mut self.count, + self.implementation, + ); + self.buf_len = 0; + } else { + // Only enough input coming for one compression. + Self::compress_to_leaves( + &mut self.leaf_words, + &self.buf[..DEGREE * BLOCKBYTES], + &mut self.count, + self.implementation, + ); + self.buf_len = (DEGREE * BLOCKBYTES) as u16; + let (buf_front, buf_back) = self.buf.split_at_mut(DEGREE * BLOCKBYTES); + buf_front.copy_from_slice(buf_back); + } + } + } + + // Now we directly compress as much input as possible, without copying + // it into the buffer. 
We need to make sure we buffer at least one byte + // for each of the leaves, so that we know we don't need to finalize + // them. + let needed_tail = (DEGREE - 1) * BLOCKBYTES + 1; + let mut bulk_bytes = input.len().saturating_sub(needed_tail); + bulk_bytes -= bulk_bytes % (DEGREE * BLOCKBYTES); + if bulk_bytes > 0 { + Self::compress_to_leaves( + &mut self.leaf_words, + &input[..bulk_bytes], + &mut self.count, + self.implementation, + ); + input = &input[bulk_bytes..]; + } + + // Buffer any remaining input, to be either compressed or finalized in + // a subsequent call. + self.fill_buf(&mut input); + debug_assert_eq!(0, input.len()); + self + } + + /// Finalize the state and return a `Hash`. This method is idempotent, and calling it multiple + /// times will give the same result. It's also possible to `update` with more input in between. + pub fn finalize(&self) -> Hash { + // Hash whatever's remaining in the buffer and finalize the leaves. + let buf_len = self.buf_len as usize; + let mut leaves_copy = self.leaf_words; + let jobs = leaves_copy + .iter_mut() + .enumerate() + .map(|(leaf_index, leaf_words)| { + let input = &self.buf[cmp::min(leaf_index * BLOCKBYTES, buf_len)..buf_len]; + Job { + input, + words: leaf_words, + count: self.count, + last_node: if leaf_index == DEGREE - 1 { + LastNode::Yes + } else { + LastNode::No + }, + } + }); + many::compress_many(jobs, self.implementation, Finalize::Yes, Stride::Parallel); + + // Concatenate each leaf into the root and hash that. + let mut root_words_copy = self.root_words; + finalize_root_words( + &leaves_copy, + &mut root_words_copy, + self.hash_length, + self.implementation, + ) + } + + /// Return the total number of bytes input so far. + /// + /// Note that `count` doesn't include the bytes of the key block, if any. + /// It's exactly the total number of input bytes fed to `update`. + pub fn count(&self) -> Count { + // Remember that self.count is *per-leaf*. + let mut ret = self + .count + .wrapping_mul(DEGREE as Count) + .wrapping_add(self.buf_len as Count); + if self.is_keyed { + ret -= (DEGREE * BLOCKBYTES) as Count; + } + ret + } +} + +#[cfg(feature = "std")] +impl std::io::Write for State { + fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> { + self.update(buf); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "State {{ count: {}, hash_length: {} }}", + self.count(), + self.hash_length, + ) + } +} + +impl Default for State { + fn default() -> Self { + Self::with_params(&Params::default()) + } +} + +// Compress each of the four finalized hashes into the root words as input, +// using two compressions. Note that even if a future version of this +// implementation supports the hash_length parameter and sets it as associated +// data for all nodes, this step must still use the untruncated output of each +// leaf. Note also that, as mentioned above, the root node doesn't hash any key +// bytes. 
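+// (DEGREE * OUTBYTES = 4 * 64 = 256 bytes, exactly two BLOCKBYTES blocks,
+// which is where the two compressions come from.)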
+fn finalize_root_words( + leaf_words: &[[Word; 8]; DEGREE], + root_words: &mut [Word; 8], + hash_length: u8, + imp: Implementation, +) -> Hash { + debug_assert_eq!(OUTBYTES, 8 * size_of::<Word>()); + let mut block = [0; DEGREE * OUTBYTES]; + for (word, chunk) in leaf_words + .iter() + .flat_map(|words| words.iter()) + .zip(block.chunks_exact_mut(size_of::<Word>())) + { + chunk.copy_from_slice(&word.to_le_bytes()); + } + imp.compress1_loop( + &block, + root_words, + 0, + LastNode::Yes, + Finalize::Yes, + Stride::Serial, + ); + Hash { + bytes: crate::state_words_to_bytes(&root_words), + len: hash_length, + } +} + +pub(crate) fn force_portable(params: &mut Params) { + params.implementation = Implementation::portable(); +} + +#[cfg(test)] +pub(crate) mod test { + use super::*; + use crate::paint_test_input; + + // This is a simple reference implementation without the complicated buffering or parameter + // support of the real implementation. We need this because the official test vectors don't + // include any inputs large enough to exercise all the branches in the buffering logic. + fn blake2bp_reference(input: &[u8]) -> Hash { + let mut leaves = arrayvec::ArrayVec::<[_; DEGREE]>::new(); + for leaf_index in 0..DEGREE { + leaves.push( + crate::Params::new() + .fanout(DEGREE as u8) + .max_depth(2) + .node_offset(leaf_index as u64) + .inner_hash_length(OUTBYTES) + .to_state(), + ); + } + leaves[DEGREE - 1].set_last_node(true); + for (i, chunk) in input.chunks(BLOCKBYTES).enumerate() { + leaves[i % DEGREE].update(chunk); + } + let mut root = crate::Params::new() + .fanout(DEGREE as u8) + .max_depth(2) + .node_depth(1) + .inner_hash_length(OUTBYTES) + .last_node(true) + .to_state(); + for leaf in &mut leaves { + root.update(leaf.finalize().as_bytes()); + } + root.finalize() + } + + #[test] + fn test_against_reference() { + let mut buf = [0; 21 * BLOCKBYTES]; + paint_test_input(&mut buf); + // - 8 blocks is just enough to fill the double buffer. + // - 9 blocks triggers the "perform one compression on the double buffer" case. + // - 11 blocks is the largest input where only one compression may be performed, on the + // first half of the buffer, because there's not enough input to avoid needing to + // finalize the second half. + // - 12 blocks triggers the "perform both compressions in the double buffer" case. + // - 15 blocks is the largest input where, after compressing 8 blocks from the buffer, + // there's not enough input to hash directly from memory. + // - 16 blocks triggers "after emptying the buffer, hash directly from memory". + for num_blocks in 0..=20 { + for &extra in &[0, 1, BLOCKBYTES - 1] { + for &portable in &[false, true] { + // eprintln!("\ncase -----"); + // dbg!(num_blocks); + // dbg!(extra); + // dbg!(portable); + + // First hash the input all at once, as a sanity check. + let mut params = Params::new(); + if portable { + force_portable(&mut params); + } + let input = &buf[..num_blocks * BLOCKBYTES + extra]; + let expected = blake2bp_reference(&input); + let mut state = params.to_state(); + let found = state.update(input).finalize(); + assert_eq!(expected, found); + + // Then, do it again, but buffer 1 byte of input first. That causes the buffering + // branch to trigger. + let mut state = params.to_state(); + let maybe_one = cmp::min(1, input.len()); + state.update(&input[..maybe_one]); + assert_eq!(maybe_one as Count, state.count()); + // Do a throwaway finalize here to check for idempotency. 
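+ // (finalize takes &self and works on copies of the leaf and root
+ // words, so it must not disturb the updates that follow.)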
+ state.finalize(); + state.update(&input[maybe_one..]); + assert_eq!(input.len() as Count, state.count()); + let found = state.finalize(); + assert_eq!(expected, found); + + // Finally, do it again with the all-at-once interface. + assert_eq!(expected, blake2bp(input)); + } + } + } + } +} diff --git a/third_party/rust/blake2b_simd/src/guts.rs b/third_party/rust/blake2b_simd/src/guts.rs new file mode 100644 index 0000000000..9fcacf319c --- /dev/null +++ b/third_party/rust/blake2b_simd/src/guts.rs @@ -0,0 +1,565 @@ +use crate::*; +use arrayref::array_ref; +use core::cmp; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub const MAX_DEGREE: usize = 4; + +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] +pub const MAX_DEGREE: usize = 1; + +// Variants other than Portable are unreachable in no_std, unless CPU features +// are explicitly enabled for the build with e.g. RUSTFLAGS="-C target-feature=+avx2". +// This might change in the future if is_x86_feature_detected moves into libcore. +#[allow(dead_code)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Platform { + Portable, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + SSE41, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + AVX2, +} + +#[derive(Clone, Copy, Debug)] +pub struct Implementation(Platform); + +impl Implementation { + pub fn detect() -> Self { + // Try the different implementations in order of how fast/modern they + // are. Currently on non-x86, everything just uses portable. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if let Some(avx2_impl) = Self::avx2_if_supported() { + return avx2_impl; + } + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if let Some(sse41_impl) = Self::sse41_if_supported() { + return sse41_impl; + } + } + Self::portable() + } + + pub fn portable() -> Self { + Implementation(Platform::Portable) + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[allow(unreachable_code)] + pub fn sse41_if_supported() -> Option<Self> { + // Check whether SSE4.1 support is assumed by the build. + #[cfg(target_feature = "sse4.1")] + { + return Some(Implementation(Platform::SSE41)); + } + // Otherwise dynamically check for support if we can. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("sse4.1") { + return Some(Implementation(Platform::SSE41)); + } + } + None + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[allow(unreachable_code)] + pub fn avx2_if_supported() -> Option<Self> { + // Check whether AVX2 support is assumed by the build. + #[cfg(target_feature = "avx2")] + { + return Some(Implementation(Platform::AVX2)); + } + // Otherwise dynamically check for support if we can. 
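+ // (is_x86_feature_detected! comes from std, which is why runtime
+ // detection is gated on the "std" feature.)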
+ #[cfg(feature = "std")] + { + if is_x86_feature_detected!("avx2") { + return Some(Implementation(Platform::AVX2)); + } + } + None + } + + pub fn degree(&self) -> usize { + match self.0 { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => avx2::DEGREE, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 => sse41::DEGREE, + Platform::Portable => 1, + } + } + + pub fn compress1_loop( + &self, + input: &[u8], + words: &mut [Word; 8], + count: Count, + last_node: LastNode, + finalize: Finalize, + stride: Stride, + ) { + match self.0 { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => unsafe { + avx2::compress1_loop(input, words, count, last_node, finalize, stride); + }, + // Note that there's an SSE version of compress1 in the official C + // implementation, but I haven't ported it yet. + _ => { + portable::compress1_loop(input, words, count, last_node, finalize, stride); + } + } + } + + pub fn compress2_loop(&self, jobs: &mut [Job; 2], finalize: Finalize, stride: Stride) { + match self.0 { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 | Platform::SSE41 => unsafe { + sse41::compress2_loop(jobs, finalize, stride) + }, + _ => panic!("unsupported"), + } + } + + pub fn compress4_loop(&self, jobs: &mut [Job; 4], finalize: Finalize, stride: Stride) { + match self.0 { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => unsafe { avx2::compress4_loop(jobs, finalize, stride) }, + _ => panic!("unsupported"), + } + } +} + +pub struct Job<'a, 'b> { + pub input: &'a [u8], + pub words: &'b mut [Word; 8], + pub count: Count, + pub last_node: LastNode, +} + +impl<'a, 'b> core::fmt::Debug for Job<'a, 'b> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // NB: Don't print the words. Leaking them would allow length extension. + write!( + f, + "Job {{ input_len: {}, count: {}, last_node: {} }}", + self.input.len(), + self.count, + self.last_node.yes(), + ) + } +} + +// Finalize could just be a bool, but this is easier to read at callsites. +#[derive(Clone, Copy, Debug)] +pub enum Finalize { + Yes, + No, +} + +impl Finalize { + pub fn yes(&self) -> bool { + match self { + Finalize::Yes => true, + Finalize::No => false, + } + } +} + +// Like Finalize, this is easier to read at callsites. +#[derive(Clone, Copy, Debug)] +pub enum LastNode { + Yes, + No, +} + +impl LastNode { + pub fn yes(&self) -> bool { + match self { + LastNode::Yes => true, + LastNode::No => false, + } + } +} + +#[derive(Clone, Copy, Debug)] +pub enum Stride { + Serial, // BLAKE2b/BLAKE2s + Parallel, // BLAKE2bp/BLAKE2sp +} + +impl Stride { + pub fn padded_blockbytes(&self) -> usize { + match self { + Stride::Serial => BLOCKBYTES, + Stride::Parallel => blake2bp::DEGREE * BLOCKBYTES, + } + } +} + +pub(crate) fn count_low(count: Count) -> Word { + count as Word +} + +pub(crate) fn count_high(count: Count) -> Word { + (count >> 8 * size_of::<Word>()) as Word +} + +pub(crate) fn assemble_count(low: Word, high: Word) -> Count { + low as Count + ((high as Count) << 8 * size_of::<Word>()) +} + +pub(crate) fn flag_word(flag: bool) -> Word { + if flag { + !0 + } else { + 0 + } +} + +// Pull a array reference at the given offset straight from the input, if +// there's a full block of input available. If there's only a partial block, +// copy it into the provided buffer, and return an array reference that. 
+// with the array, return the number of bytes of real input, and whether the
+// input can be finalized (i.e. whether there aren't any more bytes after this
+// block). Note that this is written so that the optimizer can elide bounds
+// checks, see: https://godbolt.org/z/0hH2bC
+pub fn final_block<'a>(
+    input: &'a [u8],
+    offset: usize,
+    buffer: &'a mut [u8; BLOCKBYTES],
+    stride: Stride,
+) -> (&'a [u8; BLOCKBYTES], usize, bool) {
+    let capped_offset = cmp::min(offset, input.len());
+    let offset_slice = &input[capped_offset..];
+    if offset_slice.len() >= BLOCKBYTES {
+        let block = array_ref!(offset_slice, 0, BLOCKBYTES);
+        let should_finalize = offset_slice.len() <= stride.padded_blockbytes();
+        (block, BLOCKBYTES, should_finalize)
+    } else {
+        // Copy the final block to the front of the block buffer. The rest of
+        // the buffer is assumed to be initialized to zero.
+        buffer[..offset_slice.len()].copy_from_slice(offset_slice);
+        (buffer, offset_slice.len(), true)
+    }
+}
+
+pub fn input_debug_asserts(input: &[u8], finalize: Finalize) {
+    // If we're not finalizing, the input must not be empty, and it must be an
+    // even multiple of the block size.
+    if !finalize.yes() {
+        debug_assert!(!input.is_empty());
+        debug_assert_eq!(0, input.len() % BLOCKBYTES);
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use arrayvec::ArrayVec;
+    use core::mem::size_of;
+
+    #[test]
+    fn test_detection() {
+        assert_eq!(Platform::Portable, Implementation::portable().0);
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        #[cfg(feature = "std")]
+        {
+            if is_x86_feature_detected!("avx2") {
+                assert_eq!(Platform::AVX2, Implementation::detect().0);
+                assert_eq!(
+                    Platform::AVX2,
+                    Implementation::avx2_if_supported().unwrap().0
+                );
+                assert_eq!(
+                    Platform::SSE41,
+                    Implementation::sse41_if_supported().unwrap().0
+                );
+            } else if is_x86_feature_detected!("sse4.1") {
+                assert_eq!(Platform::SSE41, Implementation::detect().0);
+                assert!(Implementation::avx2_if_supported().is_none());
+                assert_eq!(
+                    Platform::SSE41,
+                    Implementation::sse41_if_supported().unwrap().0
+                );
+            } else {
+                assert_eq!(Platform::Portable, Implementation::detect().0);
+                assert!(Implementation::avx2_if_supported().is_none());
+                assert!(Implementation::sse41_if_supported().is_none());
+            }
+        }
+    }
+
+    // TODO: Move all of these case tests into the implementation files.
+    fn exercise_cases<F>(mut f: F)
+    where
+        F: FnMut(Stride, usize, LastNode, Finalize, Count),
+    {
+        // Choose counts to hit the relevant overflow cases.
+        let counts = &[
+            (0 as Count),
+            ((1 as Count) << (8 * size_of::<Word>())) - BLOCKBYTES as Count,
+            (0 as Count).wrapping_sub(BLOCKBYTES as Count),
+        ];
+        for &stride in &[Stride::Serial, Stride::Parallel] {
+            let lengths = [
+                0,
+                1,
+                BLOCKBYTES - 1,
+                BLOCKBYTES,
+                BLOCKBYTES + 1,
+                2 * BLOCKBYTES - 1,
+                2 * BLOCKBYTES,
+                2 * BLOCKBYTES + 1,
+                stride.padded_blockbytes() - 1,
+                stride.padded_blockbytes(),
+                stride.padded_blockbytes() + 1,
+                2 * stride.padded_blockbytes() - 1,
+                2 * stride.padded_blockbytes(),
+                2 * stride.padded_blockbytes() + 1,
+            ];
+            for &length in &lengths {
+                for &last_node in &[LastNode::No, LastNode::Yes] {
+                    for &finalize in &[Finalize::No, Finalize::Yes] {
+                        if !finalize.yes() && (length == 0 || length % BLOCKBYTES != 0) {
+                            // Skip these cases, they're invalid.
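+                            // (input_debug_asserts would reject them: a
+                            // non-finalizing compression must consume at
+                            // least one whole block.)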
+                            continue;
+                        }
+                        for &count in counts {
+                            // eprintln!("\ncase -----");
+                            // dbg!(stride);
+                            // dbg!(length);
+                            // dbg!(last_node);
+                            // dbg!(finalize);
+                            // dbg!(count);
+
+                            f(stride, length, last_node, finalize, count);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    fn initial_test_words(input_index: usize) -> [Word; 8] {
+        crate::Params::new()
+            .node_offset(input_index as u64)
+            .to_words()
+    }
+
+    // Use the portable implementation, one block at a time, to compute the
+    // final state words expected for a given test case.
+    fn reference_compression(
+        input: &[u8],
+        stride: Stride,
+        last_node: LastNode,
+        finalize: Finalize,
+        mut count: Count,
+        input_index: usize,
+    ) -> [Word; 8] {
+        let mut words = initial_test_words(input_index);
+        let mut offset = 0;
+        while offset == 0 || offset < input.len() {
+            let block_size = cmp::min(BLOCKBYTES, input.len() - offset);
+            let maybe_finalize = if offset + stride.padded_blockbytes() < input.len() {
+                Finalize::No
+            } else {
+                finalize
+            };
+            portable::compress1_loop(
+                &input[offset..][..block_size],
+                &mut words,
+                count,
+                last_node,
+                maybe_finalize,
+                Stride::Serial,
+            );
+            offset += stride.padded_blockbytes();
+            count = count.wrapping_add(BLOCKBYTES as Count);
+        }
+        words
+    }
+
+    // For various loop lengths and finalization parameters, make sure that the
+    // implementation gives the same answer as the portable implementation does
+    // when invoked one block at a time. (So even the portable implementation
+    // itself is being tested here, to make sure its loop is correct.) Note
+    // that this doesn't include any fixed test vectors; those are taken from
+    // the blake2-kat.json file (copied from upstream) and tested elsewhere.
+    fn exercise_compress1_loop(implementation: Implementation) {
+        let mut input = [0; 100 * BLOCKBYTES];
+        paint_test_input(&mut input);
+
+        exercise_cases(|stride, length, last_node, finalize, count| {
+            let reference_words =
+                reference_compression(&input[..length], stride, last_node, finalize, count, 0);
+
+            let mut test_words = initial_test_words(0);
+            implementation.compress1_loop(
+                &input[..length],
+                &mut test_words,
+                count,
+                last_node,
+                finalize,
+                stride,
+            );
+            assert_eq!(reference_words, test_words);
+        });
+    }
+
+    #[test]
+    fn test_compress1_loop_portable() {
+        exercise_compress1_loop(Implementation::portable());
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress1_loop_sse41() {
+        // Currently this just falls back to portable, but we test it anyway.
+        if let Some(imp) = Implementation::sse41_if_supported() {
+            exercise_compress1_loop(imp);
+        }
+    }
+
+    #[test]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn test_compress1_loop_avx2() {
+        if let Some(imp) = Implementation::avx2_if_supported() {
+            exercise_compress1_loop(imp);
+        }
+    }
+
+    // I use ArrayVec everywhere in here because currently these tests pass
+    // under no_std. I might decide that's not worth maintaining at some point,
+    // since really all we care about with no_std is that the library builds,
+    // but for now it's here. Everything is keyed off of this N constant so
+    // that it's easy to copy the code to exercise_compress4_loop.
+ fn exercise_compress2_loop(implementation: Implementation) { + const N: usize = 2; + + let mut input_buffer = [0; 100 * BLOCKBYTES]; + paint_test_input(&mut input_buffer); + let mut inputs = ArrayVec::<[_; N]>::new(); + for i in 0..N { + inputs.push(&input_buffer[i..]); + } + + exercise_cases(|stride, length, last_node, finalize, count| { + let mut reference_words = ArrayVec::<[_; N]>::new(); + for i in 0..N { + let words = reference_compression( + &inputs[i][..length], + stride, + last_node, + finalize, + count.wrapping_add((i * BLOCKBYTES) as Count), + i, + ); + reference_words.push(words); + } + + let mut test_words = ArrayVec::<[_; N]>::new(); + for i in 0..N { + test_words.push(initial_test_words(i)); + } + let mut jobs = ArrayVec::<[_; N]>::new(); + for (i, words) in test_words.iter_mut().enumerate() { + jobs.push(Job { + input: &inputs[i][..length], + words, + count: count.wrapping_add((i * BLOCKBYTES) as Count), + last_node, + }); + } + let mut jobs = jobs.into_inner().expect("full"); + implementation.compress2_loop(&mut jobs, finalize, stride); + + for i in 0..N { + assert_eq!(reference_words[i], test_words[i], "words {} unequal", i); + } + }); + } + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_compress2_loop_sse41() { + if let Some(imp) = Implementation::sse41_if_supported() { + exercise_compress2_loop(imp); + } + } + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_compress2_loop_avx2() { + // Currently this just falls back to SSE4.1, but we test it anyway. + if let Some(imp) = Implementation::avx2_if_supported() { + exercise_compress2_loop(imp); + } + } + + // Copied from exercise_compress2_loop, with a different value of N and an + // interior call to compress4_loop. + fn exercise_compress4_loop(implementation: Implementation) { + const N: usize = 4; + + let mut input_buffer = [0; 100 * BLOCKBYTES]; + paint_test_input(&mut input_buffer); + let mut inputs = ArrayVec::<[_; N]>::new(); + for i in 0..N { + inputs.push(&input_buffer[i..]); + } + + exercise_cases(|stride, length, last_node, finalize, count| { + let mut reference_words = ArrayVec::<[_; N]>::new(); + for i in 0..N { + let words = reference_compression( + &inputs[i][..length], + stride, + last_node, + finalize, + count.wrapping_add((i * BLOCKBYTES) as Count), + i, + ); + reference_words.push(words); + } + + let mut test_words = ArrayVec::<[_; N]>::new(); + for i in 0..N { + test_words.push(initial_test_words(i)); + } + let mut jobs = ArrayVec::<[_; N]>::new(); + for (i, words) in test_words.iter_mut().enumerate() { + jobs.push(Job { + input: &inputs[i][..length], + words, + count: count.wrapping_add((i * BLOCKBYTES) as Count), + last_node, + }); + } + let mut jobs = jobs.into_inner().expect("full"); + implementation.compress4_loop(&mut jobs, finalize, stride); + + for i in 0..N { + assert_eq!(reference_words[i], test_words[i], "words {} unequal", i); + } + }); + } + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_compress4_loop_avx2() { + if let Some(imp) = Implementation::avx2_if_supported() { + exercise_compress4_loop(imp); + } + } + + #[test] + fn sanity_check_count_size() { + assert_eq!(size_of::<Count>(), 2 * size_of::<Word>()); + } +} diff --git a/third_party/rust/blake2b_simd/src/lib.rs b/third_party/rust/blake2b_simd/src/lib.rs new file mode 100644 index 0000000000..cadef7f9a9 --- /dev/null +++ b/third_party/rust/blake2b_simd/src/lib.rs @@ -0,0 +1,674 @@ +//! 
[![GitHub](https://img.shields.io/github/tag/oconnor663/blake2_simd.svg?label=GitHub)](https://github.com/oconnor663/blake2_simd) [![crates.io](https://img.shields.io/crates/v/blake2b_simd.svg)](https://crates.io/crates/blake2b_simd) [![Build Status](https://travis-ci.org/oconnor663/blake2_simd.svg?branch=master)](https://travis-ci.org/oconnor663/blake2_simd)
+//!
+//! An implementation of the BLAKE2b and BLAKE2bp hash functions. See also
+//! [`blake2s_simd`](https://docs.rs/blake2s_simd).
+//!
+//! This crate includes:
+//!
+//! - 100% stable Rust.
+//! - SIMD implementations based on Samuel Neves' [`blake2-avx2`](https://github.com/sneves/blake2-avx2).
+//!   These are very fast. For benchmarks, see [the Performance section of the
+//!   README](https://github.com/oconnor663/blake2_simd#performance).
+//! - Portable, safe implementations for other platforms.
+//! - Dynamic CPU feature detection. Binaries include multiple implementations by default and
+//!   choose the fastest one the processor supports at runtime.
+//! - All the features from [the BLAKE2 spec](https://blake2.net/blake2.pdf), like adjustable
+//!   length, keying, and associated data for tree hashing.
+//! - `no_std` support. The `std` Cargo feature is on by default, for CPU feature detection and
+//!   for implementing `std::io::Write`.
+//! - Support for computing multiple BLAKE2b hashes in parallel, matching the efficiency of
+//!   BLAKE2bp. See the [`many`](many/index.html) module.
+//!
+//! # Example
+//!
+//! ```
+//! use blake2b_simd::{blake2b, Params};
+//!
+//! let expected = "ca002330e69d3e6b84a46a56a6533fd79d51d97a3bb7cad6c2ff43b354185d6d\
+//!                 c1e723fb3db4ae0737e120378424c714bb982d9dc5bbd7a0ab318240ddd18f8d";
+//! let hash = blake2b(b"foo");
+//! assert_eq!(expected, &hash.to_hex());
+//!
+//! let hash = Params::new()
+//!     .hash_length(16)
+//!     .key(b"The Magic Words are Squeamish Ossifrage")
+//!     .personal(b"L. P. Waterhouse")
+//!     .to_state()
+//!     .update(b"foo")
+//!     .update(b"bar")
+//!     .update(b"baz")
+//!     .finalize();
+//! assert_eq!("ee8ff4e9be887297cf79348dc35dab56", &hash.to_hex());
+//! ```

+#![cfg_attr(not(feature = "std"), no_std)]
+
+use arrayref::{array_refs, mut_array_refs};
+use core::cmp;
+use core::fmt;
+use core::mem::size_of;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod avx2;
+mod portable;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod sse41;
+
+pub mod blake2bp;
+mod guts;
+pub mod many;
+
+#[cfg(test)]
+mod test;
+
+type Word = u64;
+type Count = u128;
+
+/// The max hash length.
+pub const OUTBYTES: usize = 8 * size_of::<Word>();
+/// The max key length.
+pub const KEYBYTES: usize = 8 * size_of::<Word>();
+/// The max salt length.
+pub const SALTBYTES: usize = 2 * size_of::<Word>();
+/// The max personalization length.
+pub const PERSONALBYTES: usize = 2 * size_of::<Word>();
+/// The number of input bytes passed to each call to the compression function. Small benchmarks need
+/// to use an even multiple of `BLOCKBYTES`, or else their apparent throughput will be low.
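+///
+/// For example, BLAKE2b's block is 16 words of 8 bytes each:
+///
+/// ```
+/// assert_eq!(128, blake2b_simd::BLOCKBYTES);
+/// ```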
+pub const BLOCKBYTES: usize = 16 * size_of::<Word>(); + +const IV: [Word; 8] = [ + 0x6A09E667F3BCC908, + 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, + 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, + 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, + 0x5BE0CD19137E2179, +]; + +const SIGMA: [[u8; 16]; 12] = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3], + [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4], + [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8], + [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13], + [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9], + [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11], + [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10], + [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5], + [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0], + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3], +]; + +/// Compute the BLAKE2b hash of a slice of bytes all at once, using default +/// parameters. +/// +/// # Example +/// +/// ``` +/// # use blake2b_simd::{blake2b, Params}; +/// let expected = "ca002330e69d3e6b84a46a56a6533fd79d51d97a3bb7cad6c2ff43b354185d6d\ +/// c1e723fb3db4ae0737e120378424c714bb982d9dc5bbd7a0ab318240ddd18f8d"; +/// let hash = blake2b(b"foo"); +/// assert_eq!(expected, &hash.to_hex()); +/// ``` +pub fn blake2b(input: &[u8]) -> Hash { + Params::new().hash(input) +} + +/// A parameter builder that exposes all the non-default BLAKE2 features. +/// +/// Apart from `hash_length`, which controls the length of the final `Hash`, +/// all of these parameters are just associated data that gets mixed with the +/// input. For more details, see [the BLAKE2 spec](https://blake2.net/blake2.pdf). +/// +/// Several of the parameters have a valid range defined in the spec and +/// documented below. Trying to set an invalid parameter will panic. +/// +/// # Example +/// +/// ``` +/// # use blake2b_simd::Params; +/// // Create a Params object with a secret key and a non-default length. +/// let mut params = Params::new(); +/// params.key(b"my secret key"); +/// params.hash_length(16); +/// +/// // Use those params to hash an input all at once. +/// let hash = params.hash(b"my input"); +/// +/// // Or use those params to build an incremental State. +/// let mut state = params.to_state(); +/// ``` +#[derive(Clone)] +pub struct Params { + hash_length: u8, + key_length: u8, + key_block: [u8; BLOCKBYTES], + salt: [u8; SALTBYTES], + personal: [u8; PERSONALBYTES], + fanout: u8, + max_depth: u8, + max_leaf_length: u32, + node_offset: u64, + node_depth: u8, + inner_hash_length: u8, + last_node: guts::LastNode, + implementation: guts::Implementation, +} + +impl Params { + /// Equivalent to `Params::default()`. + #[inline] + pub fn new() -> Self { + Self { + hash_length: OUTBYTES as u8, + key_length: 0, + key_block: [0; BLOCKBYTES], + salt: [0; SALTBYTES], + personal: [0; PERSONALBYTES], + // NOTE: fanout and max_depth don't default to zero! 
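+            // (The BLAKE2 spec's sequential mode is fanout = 1 and
+            // max_depth = 1, so those are the defaults here; zero has a
+            // special meaning for both. See the `fanout` and `max_depth`
+            // setters below.)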
+ fanout: 1, + max_depth: 1, + max_leaf_length: 0, + node_offset: 0, + node_depth: 0, + inner_hash_length: 0, + last_node: guts::LastNode::No, + implementation: guts::Implementation::detect(), + } + } + + #[inline(always)] + fn to_words(&self) -> [Word; 8] { + let (salt_left, salt_right) = array_refs!(&self.salt, SALTBYTES / 2, SALTBYTES / 2); + let (personal_left, personal_right) = + array_refs!(&self.personal, PERSONALBYTES / 2, PERSONALBYTES / 2); + [ + IV[0] + ^ self.hash_length as u64 + ^ (self.key_length as u64) << 8 + ^ (self.fanout as u64) << 16 + ^ (self.max_depth as u64) << 24 + ^ (self.max_leaf_length as u64) << 32, + IV[1] ^ self.node_offset, + IV[2] ^ self.node_depth as u64 ^ (self.inner_hash_length as u64) << 8, + IV[3], + IV[4] ^ Word::from_le_bytes(*salt_left), + IV[5] ^ Word::from_le_bytes(*salt_right), + IV[6] ^ Word::from_le_bytes(*personal_left), + IV[7] ^ Word::from_le_bytes(*personal_right), + ] + } + + /// Hash an input all at once with these parameters. + #[inline] + pub fn hash(&self, input: &[u8]) -> Hash { + // If there's a key, just fall back to using the State. + if self.key_length > 0 { + return self.to_state().update(input).finalize(); + } + let mut words = self.to_words(); + self.implementation.compress1_loop( + input, + &mut words, + 0, + self.last_node, + guts::Finalize::Yes, + guts::Stride::Serial, + ); + Hash { + bytes: state_words_to_bytes(&words), + len: self.hash_length, + } + } + + /// Construct a `State` object based on these parameters, for hashing input + /// incrementally. + pub fn to_state(&self) -> State { + State::with_params(self) + } + + /// Set the length of the final hash in bytes, from 1 to `OUTBYTES` (64). Apart from + /// controlling the length of the final `Hash`, this is also associated data, and changing it + /// will result in a totally different hash. + #[inline] + pub fn hash_length(&mut self, length: usize) -> &mut Self { + assert!( + 1 <= length && length <= OUTBYTES, + "Bad hash length: {}", + length + ); + self.hash_length = length as u8; + self + } + + /// Use a secret key, so that BLAKE2 acts as a MAC. The maximum key length is `KEYBYTES` (64). + /// An empty key is equivalent to having no key at all. + #[inline] + pub fn key(&mut self, key: &[u8]) -> &mut Self { + assert!(key.len() <= KEYBYTES, "Bad key length: {}", key.len()); + self.key_length = key.len() as u8; + self.key_block = [0; BLOCKBYTES]; + self.key_block[..key.len()].copy_from_slice(key); + self + } + + /// At most `SALTBYTES` (16). Shorter salts are padded with null bytes. An empty salt is + /// equivalent to having no salt at all. + #[inline] + pub fn salt(&mut self, salt: &[u8]) -> &mut Self { + assert!(salt.len() <= SALTBYTES, "Bad salt length: {}", salt.len()); + self.salt = [0; SALTBYTES]; + self.salt[..salt.len()].copy_from_slice(salt); + self + } + + /// At most `PERSONALBYTES` (16). Shorter personalizations are padded with null bytes. An empty + /// personalization is equivalent to having no personalization at all. + #[inline] + pub fn personal(&mut self, personalization: &[u8]) -> &mut Self { + assert!( + personalization.len() <= PERSONALBYTES, + "Bad personalization length: {}", + personalization.len() + ); + self.personal = [0; PERSONALBYTES]; + self.personal[..personalization.len()].copy_from_slice(personalization); + self + } + + /// From 0 (meaning unlimited) to 255. The default is 1 (meaning sequential). 
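+    ///
+    /// For example, since fanout is associated data, two different values
+    /// give two different hashes:
+    ///
+    /// ```
+    /// # use blake2b_simd::Params;
+    /// let a = Params::new().fanout(1).hash(b"foo"); // 1 is the default
+    /// let b = Params::new().fanout(2).hash(b"foo");
+    /// assert_ne!(a, b);
+    /// ```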
+ #[inline] + pub fn fanout(&mut self, fanout: u8) -> &mut Self { + self.fanout = fanout; + self + } + + /// From 0 (meaning BLAKE2X B2 hashes), through 1 (the default, meaning sequential) to 255 (meaning unlimited). + #[inline] + pub fn max_depth(&mut self, depth: u8) -> &mut Self { + self.max_depth = depth; + self + } + + /// From 0 (the default, meaning unlimited or sequential) to `2^32 - 1`. + #[inline] + pub fn max_leaf_length(&mut self, length: u32) -> &mut Self { + self.max_leaf_length = length; + self + } + + /// From 0 (the default, meaning first, leftmost, leaf, or sequential) to `2^64 - 1`. + #[inline] + pub fn node_offset(&mut self, offset: u64) -> &mut Self { + self.node_offset = offset; + self + } + + /// From 0 (the default, meaning leaf or sequential) to 255. + #[inline] + pub fn node_depth(&mut self, depth: u8) -> &mut Self { + self.node_depth = depth; + self + } + + /// From 0 (the default, meaning sequential) to `OUTBYTES` (64). + #[inline] + pub fn inner_hash_length(&mut self, length: usize) -> &mut Self { + assert!(length <= OUTBYTES, "Bad inner hash length: {}", length); + self.inner_hash_length = length as u8; + self + } + + /// Indicates the rightmost node in a row. This can also be changed on the + /// `State` object, potentially after hashing has begun. See + /// [`State::set_last_node`]. + /// + /// [`State::set_last_node`]: struct.State.html#method.set_last_node + #[inline] + pub fn last_node(&mut self, last_node: bool) -> &mut Self { + self.last_node = if last_node { + guts::LastNode::Yes + } else { + guts::LastNode::No + }; + self + } +} + +impl Default for Params { + fn default() -> Self { + Self::new() + } +} + +impl fmt::Debug for Params { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "Params {{ hash_length: {}, key_length: {}, salt: {:?}, personal: {:?}, fanout: {}, \ + max_depth: {}, max_leaf_length: {}, node_offset: {}, node_depth: {}, \ + inner_hash_length: {}, last_node: {} }}", + self.hash_length, + // NB: Don't print the key itself. Debug shouldn't leak secrets. + self.key_length, + &self.salt, + &self.personal, + self.fanout, + self.max_depth, + self.max_leaf_length, + self.node_offset, + self.node_depth, + self.inner_hash_length, + self.last_node.yes(), + ) + } +} + +/// An incremental hasher for BLAKE2b. +/// +/// To construct a `State` with non-default parameters, see `Params::to_state`. +/// +/// # Example +/// +/// ``` +/// use blake2b_simd::{State, blake2b}; +/// +/// let mut state = blake2b_simd::State::new(); +/// +/// state.update(b"foo"); +/// assert_eq!(blake2b(b"foo"), state.finalize()); +/// +/// state.update(b"bar"); +/// assert_eq!(blake2b(b"foobar"), state.finalize()); +/// ``` +#[derive(Clone)] +pub struct State { + words: [Word; 8], + count: Count, + buf: [u8; BLOCKBYTES], + buflen: u8, + last_node: guts::LastNode, + hash_length: u8, + implementation: guts::Implementation, + is_keyed: bool, +} + +impl State { + /// Equivalent to `State::default()` or `Params::default().to_state()`. 
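+    ///
+    /// For example, hashing incrementally matches the all-at-once `blake2b`
+    /// function:
+    ///
+    /// ```
+    /// let hash = blake2b_simd::State::new().update(b"foo").finalize();
+    /// assert_eq!(blake2b_simd::blake2b(b"foo"), hash);
+    /// ```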
+    pub fn new() -> Self {
+        Self::with_params(&Params::default())
+    }
+
+    fn with_params(params: &Params) -> Self {
+        let mut state = Self {
+            words: params.to_words(),
+            count: 0,
+            buf: [0; BLOCKBYTES],
+            buflen: 0,
+            last_node: params.last_node,
+            hash_length: params.hash_length,
+            implementation: params.implementation,
+            is_keyed: params.key_length > 0,
+        };
+        if state.is_keyed {
+            state.buf = params.key_block;
+            state.buflen = state.buf.len() as u8;
+        }
+        state
+    }
+
+    fn fill_buf(&mut self, input: &mut &[u8]) {
+        let take = cmp::min(BLOCKBYTES - self.buflen as usize, input.len());
+        self.buf[self.buflen as usize..self.buflen as usize + take].copy_from_slice(&input[..take]);
+        self.buflen += take as u8;
+        *input = &input[take..];
+    }
+
+    // If the state already has some input in its buffer, try to fill the buffer and perform a
+    // compression. However, only do the compression if there's more input coming, otherwise it
+    // will give the wrong hash if the caller finalizes immediately after.
+    fn compress_buffer_if_possible(&mut self, input: &mut &[u8]) {
+        if self.buflen > 0 {
+            self.fill_buf(input);
+            if !input.is_empty() {
+                self.implementation.compress1_loop(
+                    &self.buf,
+                    &mut self.words,
+                    self.count,
+                    self.last_node,
+                    guts::Finalize::No,
+                    guts::Stride::Serial,
+                );
+                self.count = self.count.wrapping_add(BLOCKBYTES as Count);
+                self.buflen = 0;
+            }
+        }
+    }
+
+    /// Add input to the hash. You can call `update` any number of times.
+    pub fn update(&mut self, mut input: &[u8]) -> &mut Self {
+        // If we have a partial buffer, try to complete it.
+        self.compress_buffer_if_possible(&mut input);
+        // While there's more than a block of input left (which also means we cleared the buffer
+        // above), compress blocks directly without copying.
+        let mut end = input.len().saturating_sub(1);
+        end -= end % BLOCKBYTES;
+        if end > 0 {
+            self.implementation.compress1_loop(
+                &input[..end],
+                &mut self.words,
+                self.count,
+                self.last_node,
+                guts::Finalize::No,
+                guts::Stride::Serial,
+            );
+            self.count = self.count.wrapping_add(end as Count);
+            input = &input[end..];
+        }
+        // Buffer any remaining input, to be either compressed or finalized in a subsequent call.
+        // Note that this represents some copying overhead, which in theory we could avoid in the
+        // all-at-once setting. A function hardcoded for exactly BLOCKBYTES input bytes is about
+        // 10% faster than using this implementation for the same input.
+        self.fill_buf(&mut input);
+        self
+    }
+
+    /// Finalize the state and return a `Hash`. This method is idempotent, and calling it multiple
+    /// times will give the same result. It's also possible to `update` with more input in between.
+    pub fn finalize(&self) -> Hash {
+        let mut words_copy = self.words;
+        self.implementation.compress1_loop(
+            &self.buf[..self.buflen as usize],
+            &mut words_copy,
+            self.count,
+            self.last_node,
+            guts::Finalize::Yes,
+            guts::Stride::Serial,
+        );
+        Hash {
+            bytes: state_words_to_bytes(&words_copy),
+            len: self.hash_length,
+        }
+    }
+
+    /// Set a flag indicating that this is the last node of its level in a tree hash. This is
+    /// equivalent to [`Params::last_node`], except that it can be set at any time before calling
+    /// `finalize`. That allows callers to begin hashing a node without knowing ahead of time
+    /// whether it's the last in its level. For more details about the intended use of this flag,
+    /// see [the BLAKE2 spec].
+ /// + /// [`Params::last_node`]: struct.Params.html#method.last_node + /// [the BLAKE2 spec]: https://blake2.net/blake2.pdf + pub fn set_last_node(&mut self, last_node: bool) -> &mut Self { + self.last_node = if last_node { + guts::LastNode::Yes + } else { + guts::LastNode::No + }; + self + } + + /// Return the total number of bytes input so far. + /// + /// Note that `count` doesn't include the bytes of the key block, if any. + /// It's exactly the total number of input bytes fed to `update`. + pub fn count(&self) -> Count { + let mut ret = self.count.wrapping_add(self.buflen as Count); + if self.is_keyed { + ret -= BLOCKBYTES as Count; + } + ret + } +} + +#[inline(always)] +fn state_words_to_bytes(state_words: &[Word; 8]) -> [u8; OUTBYTES] { + let mut bytes = [0; OUTBYTES]; + { + const W: usize = size_of::<Word>(); + let refs = mut_array_refs!(&mut bytes, W, W, W, W, W, W, W, W); + *refs.0 = state_words[0].to_le_bytes(); + *refs.1 = state_words[1].to_le_bytes(); + *refs.2 = state_words[2].to_le_bytes(); + *refs.3 = state_words[3].to_le_bytes(); + *refs.4 = state_words[4].to_le_bytes(); + *refs.5 = state_words[5].to_le_bytes(); + *refs.6 = state_words[6].to_le_bytes(); + *refs.7 = state_words[7].to_le_bytes(); + } + bytes +} + +#[cfg(feature = "std")] +impl std::io::Write for State { + fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> { + self.update(buf); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // NB: Don't print the words. Leaking them would allow length extension. + write!( + f, + "State {{ count: {}, hash_length: {}, last_node: {} }}", + self.count(), + self.hash_length, + self.last_node.yes(), + ) + } +} + +impl Default for State { + fn default() -> Self { + Self::with_params(&Params::default()) + } +} + +type HexString = arrayvec::ArrayString<[u8; 2 * OUTBYTES]>; + +/// A finalized BLAKE2 hash, with constant-time equality. +#[derive(Clone, Copy)] +pub struct Hash { + bytes: [u8; OUTBYTES], + len: u8, +} + +impl Hash { + /// Convert the hash to a byte slice. Note that if you're using BLAKE2 as a MAC, you need + /// constant time equality, which `&[u8]` doesn't provide. + pub fn as_bytes(&self) -> &[u8] { + &self.bytes[..self.len as usize] + } + + /// Convert the hash to a byte array. Note that if you're using BLAKE2 as a + /// MAC, you need constant time equality, which arrays don't provide. This + /// panics in debug mode if the length of the hash isn't `OUTBYTES`. + #[inline] + pub fn as_array(&self) -> &[u8; OUTBYTES] { + debug_assert_eq!(self.len as usize, OUTBYTES); + &self.bytes + } + + /// Convert the hash to a lowercase hexadecimal + /// [`ArrayString`](https://docs.rs/arrayvec/0.4/arrayvec/struct.ArrayString.html). + pub fn to_hex(&self) -> HexString { + bytes_to_hex(self.as_bytes()) + } +} + +fn bytes_to_hex(bytes: &[u8]) -> HexString { + let mut s = arrayvec::ArrayString::new(); + let table = b"0123456789abcdef"; + for &b in bytes { + s.push(table[(b >> 4) as usize] as char); + s.push(table[(b & 0xf) as usize] as char); + } + s +} + +/// This implementation is constant time, if the two hashes are the same length. +impl PartialEq for Hash { + fn eq(&self, other: &Hash) -> bool { + constant_time_eq::constant_time_eq(&self.as_bytes(), &other.as_bytes()) + } +} + +/// This implementation is constant time, if the slice is the same length as the hash. 
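+///
+/// For example, a sketch of verifying a keyed BLAKE2b MAC against untrusted
+/// input bytes (the names here are illustrative):
+///
+/// ```
+/// # use blake2b_simd::Params;
+/// let expected = Params::new().key(b"secret key").hash(b"message");
+/// let received: &[u8] = expected.as_bytes(); // stand-in for bytes off the wire
+/// assert!(expected == *received);
+/// ```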
+impl PartialEq<[u8]> for Hash { + fn eq(&self, other: &[u8]) -> bool { + constant_time_eq::constant_time_eq(&self.as_bytes(), other) + } +} + +impl Eq for Hash {} + +impl AsRef<[u8]> for Hash { + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +impl fmt::Debug for Hash { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Hash(0x{})", self.to_hex()) + } +} + +// Paint a byte pattern that won't repeat, so that we don't accidentally miss +// buffer offset bugs. This is the same as what Bao uses in its tests. +#[cfg(test)] +fn paint_test_input(buf: &mut [u8]) { + let mut offset = 0; + let mut counter: u32 = 1; + while offset < buf.len() { + let bytes = counter.to_le_bytes(); + let take = cmp::min(bytes.len(), buf.len() - offset); + buf[offset..][..take].copy_from_slice(&bytes[..take]); + counter += 1; + offset += take; + } +} + +// This module is pub for internal benchmarks only. Please don't use it. +#[doc(hidden)] +pub mod benchmarks { + use super::*; + + pub fn force_portable(params: &mut Params) { + params.implementation = guts::Implementation::portable(); + } + + pub fn force_portable_blake2bp(params: &mut blake2bp::Params) { + blake2bp::force_portable(params); + } +} diff --git a/third_party/rust/blake2b_simd/src/many.rs b/third_party/rust/blake2b_simd/src/many.rs new file mode 100644 index 0000000000..1588a3fb92 --- /dev/null +++ b/third_party/rust/blake2b_simd/src/many.rs @@ -0,0 +1,529 @@ +//! Interfaces for hashing multiple inputs at once, using SIMD more +//! efficiently. +//! +//! The throughput of these interfaces is comparable to BLAKE2bp, about twice +//! the throughput of regular BLAKE2b when AVX2 is available. +//! +//! These interfaces can accept any number of inputs, and the implementation +//! does its best to parallelize them. In general, the more inputs you can pass +//! in at once the better. If you need to batch your inputs in smaller groups, +//! see the [`degree`](fn.degree.html) function for a good batch size. +//! +//! The implementation keeps working in parallel even when inputs are of +//! different lengths, by managing a working set of jobs whose input isn't yet +//! exhausted. However, if one or two inputs are much longer than the others, +//! and they're encountered only at the end, there might not be any remaining +//! work to parallelize them with. In this case, sorting the inputs +//! longest-first can improve parallelism. +//! +//! # Example +//! +//! ``` +//! use blake2b_simd::{blake2b, State, many::update_many}; +//! +//! let mut states = [ +//! State::new(), +//! State::new(), +//! State::new(), +//! State::new(), +//! ]; +//! +//! let inputs = [ +//! &b"foo"[..], +//! &b"bar"[..], +//! &b"baz"[..], +//! &b"bing"[..], +//! ]; +//! +//! update_many(states.iter_mut().zip(inputs.iter())); +//! +//! for (state, input) in states.iter_mut().zip(inputs.iter()) { +//! assert_eq!(blake2b(input), state.finalize()); +//! } +//! ``` + +use crate::guts::{self, Finalize, Implementation, Job, LastNode, Stride}; +use crate::state_words_to_bytes; +use crate::Count; +use crate::Hash; +use crate::Params; +use crate::State; +use crate::Word; +use crate::BLOCKBYTES; +use arrayref::array_mut_ref; +use arrayvec::ArrayVec; +use core::fmt; + +/// The largest possible value of [`degree`](fn.degree.html) on the target +/// platform. +/// +/// Note that this constant reflects the parallelism degree supported by this +/// crate, so it will change over time as support is added or removed. 
For +/// example, when Rust stabilizes AVX-512 support and this crate adds an +/// AVX-512 implementation, this constant will double on x86 targets. If that +/// implementation is an optional feature (e.g. because it's nightly-only), the +/// value of this constant will depend on that optional feature also. +pub const MAX_DEGREE: usize = guts::MAX_DEGREE; + +/// The parallelism degree of the implementation, detected at runtime. If you +/// hash your inputs in small batches, making the batch size a multiple of +/// `degree` will generally give good performance. +/// +/// For example, an x86 processor that supports AVX2 can compute four BLAKE2b +/// hashes in parallel, so `degree` returns 4 on that machine. If you call +/// [`hash_many`] with only three inputs, that's not enough to use the AVX2 +/// implementation, and your average throughput will be lower. Likewise if you +/// call it with five inputs of equal length, the first four will be hashed in +/// parallel with AVX2, but the last one will have to be hashed by itself, and +/// again your average throughput will be lower. +/// +/// As noted in the module level docs, performance is more complicated if your +/// inputs are of different lengths. When parallelizing long and short inputs +/// together, the longer ones will have bytes left over, and the implementation +/// will try to parallelize those leftover bytes with subsequent inputs. The +/// more inputs available in that case, the more the implementation will be +/// able to parallelize. +/// +/// If you need a constant batch size, for example to collect inputs in an +/// array, see [`MAX_DEGREE`]. +/// +/// [`hash_many`]: fn.hash_many.html +/// [`MAX_DEGREE`]: constant.MAX_DEGREE.html +pub fn degree() -> usize { + guts::Implementation::detect().degree() +} + +type JobsVec<'a, 'b> = ArrayVec<[Job<'a, 'b>; guts::MAX_DEGREE]>; + +#[inline(always)] +fn fill_jobs_vec<'a, 'b>( + jobs_iter: &mut impl Iterator<Item = Job<'a, 'b>>, + vec: &mut JobsVec<'a, 'b>, + target_len: usize, +) { + while vec.len() < target_len { + if let Some(job) = jobs_iter.next() { + vec.push(job); + } else { + break; + } + } +} + +#[inline(always)] +fn evict_finished<'a, 'b>(vec: &mut JobsVec<'a, 'b>, num_jobs: usize) { + // Iterate backwards so that removal doesn't cause an out-of-bounds panic. + for i in (0..num_jobs).rev() { + // Note that is_empty() is only valid because we know all these jobs + // have been run at least once. Otherwise we could confuse the empty + // input for a finished job, which would be incorrect. + // + // Avoid a panic branch here in release mode. + debug_assert!(vec.len() > i); + if vec.len() > i && vec[i].input.is_empty() { + // Note that calling pop_at() repeatedly has some overhead, because + // later elements need to be shifted up. However, the JobsVec is + // small, and this approach guarantees that jobs are encountered in + // order. + vec.pop_at(i); + } + } +} + +pub(crate) fn compress_many<'a, 'b, I>( + jobs: I, + imp: Implementation, + finalize: Finalize, + stride: Stride, +) where + I: IntoIterator<Item = Job<'a, 'b>>, +{ + // Fuse is important for correctness, since each of these blocks tries to + // advance the iterator, even if a previous block emptied it. 
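+    // (A fused iterator keeps returning None after it has returned None once,
+    // so the degree-2 pass below can safely poll the iterator again after the
+    // degree-4 pass has already exhausted it.)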
+ let mut jobs_iter = jobs.into_iter().fuse(); + let mut jobs_vec = JobsVec::new(); + + if imp.degree() >= 4 { + loop { + fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 4); + if jobs_vec.len() < 4 { + break; + } + let jobs_array = array_mut_ref!(jobs_vec, 0, 4); + imp.compress4_loop(jobs_array, finalize, stride); + evict_finished(&mut jobs_vec, 4); + } + } + + if imp.degree() >= 2 { + loop { + fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 2); + if jobs_vec.len() < 2 { + break; + } + let jobs_array = array_mut_ref!(jobs_vec, 0, 2); + imp.compress2_loop(jobs_array, finalize, stride); + evict_finished(&mut jobs_vec, 2); + } + } + + for job in jobs_vec.into_iter().chain(jobs_iter) { + let Job { + input, + words, + count, + last_node, + } = job; + imp.compress1_loop(input, words, count, last_node, finalize, stride); + } +} + +/// Update any number of `State` objects at once. +/// +/// # Example +/// +/// ``` +/// use blake2b_simd::{blake2b, State, many::update_many}; +/// +/// let mut states = [ +/// State::new(), +/// State::new(), +/// State::new(), +/// State::new(), +/// ]; +/// +/// let inputs = [ +/// &b"foo"[..], +/// &b"bar"[..], +/// &b"baz"[..], +/// &b"bing"[..], +/// ]; +/// +/// update_many(states.iter_mut().zip(inputs.iter())); +/// +/// for (state, input) in states.iter_mut().zip(inputs.iter()) { +/// assert_eq!(blake2b(input), state.finalize()); +/// } +/// ``` +pub fn update_many<'a, 'b, I, T>(pairs: I) +where + I: IntoIterator<Item = (&'a mut State, &'b T)>, + T: 'b + AsRef<[u8]> + ?Sized, +{ + // Get the guts::Implementation from the first state, if any. + let mut peekable_pairs = pairs.into_iter().peekable(); + let implementation = if let Some((state, _)) = peekable_pairs.peek() { + state.implementation + } else { + // No work items, just short circuit. + return; + }; + + // Adapt the pairs iterator into a Jobs iterator, but skip over the Jobs + // where there's not actually any work to do (e.g. because there's not much + // input and it's all just going in the State buffer). + let jobs = peekable_pairs.flat_map(|(state, input_t)| { + let mut input = input_t.as_ref(); + // For each pair, if the State has some input in its buffer, try to + // finish that buffer. If there wasn't enough input to do that -- + // or if the input was empty to begin with -- skip this pair. + state.compress_buffer_if_possible(&mut input); + if input.is_empty() { + return None; + } + // Now we know the buffer is empty and there's more input. Make sure we + // buffer the final block, because update() doesn't finalize. + let mut last_block_start = input.len() - 1; + last_block_start -= last_block_start % BLOCKBYTES; + let (blocks, last_block) = input.split_at(last_block_start); + state.buf[..last_block.len()].copy_from_slice(last_block); + state.buflen = last_block.len() as u8; + // Finally, if the full blocks slice is non-empty, prepare that job for + // compression, and bump the State count. + if blocks.is_empty() { + None + } else { + let count = state.count; + state.count = state.count.wrapping_add(blocks.len() as Count); + Some(Job { + input: blocks, + words: &mut state.words, + count, + last_node: state.last_node, + }) + } + }); + + // Run all the Jobs in the iterator. + compress_many(jobs, implementation, Finalize::No, Stride::Serial); +} + +/// A job for the [`hash_many`] function. After calling [`hash_many`] on a +/// collection of `HashManyJob` objects, you can call [`to_hash`] on each job +/// to get the result. 
+/// +/// [`hash_many`]: fn.hash_many.html +/// [`to_hash`]: struct.HashManyJob.html#method.to_hash +#[derive(Clone)] +pub struct HashManyJob<'a> { + words: [Word; 8], + count: Count, + last_node: LastNode, + hash_length: u8, + input: &'a [u8], + finished: bool, + implementation: guts::Implementation, +} + +impl<'a> HashManyJob<'a> { + /// Construct a new `HashManyJob` from a set of hashing parameters and an + /// input. + #[inline] + pub fn new(params: &Params, input: &'a [u8]) -> Self { + let mut words = params.to_words(); + let mut count = 0; + let mut finished = false; + // If we have key bytes, compress them into the state words. If there's + // no additional input, this compression needs to finalize and set + // finished=true. + if params.key_length > 0 { + let mut finalization = Finalize::No; + if input.is_empty() { + finalization = Finalize::Yes; + finished = true; + } + params.implementation.compress1_loop( + ¶ms.key_block, + &mut words, + 0, + params.last_node, + finalization, + Stride::Serial, + ); + count = BLOCKBYTES as Count; + } + Self { + words, + count, + last_node: params.last_node, + hash_length: params.hash_length, + input, + finished, + implementation: params.implementation, + } + } + + /// Get the hash from a finished job. If you call this before calling + /// [`hash_many`], it will panic in debug mode. + /// + /// [`hash_many`]: fn.hash_many.html + #[inline] + pub fn to_hash(&self) -> Hash { + debug_assert!(self.finished, "job hasn't been run yet"); + Hash { + bytes: state_words_to_bytes(&self.words), + len: self.hash_length, + } + } +} + +impl<'a> fmt::Debug for HashManyJob<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // NB: Don't print the words. Leaking them would allow length extension. + write!( + f, + "HashManyJob {{ count: {}, hash_length: {}, last_node: {}, input_len: {} }}", + self.count, + self.hash_length, + self.last_node.yes(), + self.input.len(), + ) + } +} + +/// Hash any number of complete inputs all at once. +/// +/// This is slightly more efficient than using `update_many` with `State` +/// objects, because it doesn't need to do any buffering. +/// +/// Running `hash_many` on the same `HashManyJob` object more than once has no +/// effect. +/// +/// # Example +/// +/// ``` +/// use blake2b_simd::{blake2b, Params, many::{HashManyJob, hash_many}}; +/// +/// let inputs = [ +/// &b"foo"[..], +/// &b"bar"[..], +/// &b"baz"[..], +/// &b"bing"[..], +/// ]; +/// +/// let mut params = Params::new(); +/// params.hash_length(16); +/// +/// let mut jobs = [ +/// HashManyJob::new(¶ms, inputs[0]), +/// HashManyJob::new(¶ms, inputs[1]), +/// HashManyJob::new(¶ms, inputs[2]), +/// HashManyJob::new(¶ms, inputs[3]), +/// ]; +/// +/// hash_many(jobs.iter_mut()); +/// +/// for (input, job) in inputs.iter().zip(jobs.iter()) { +/// let expected = params.hash(input); +/// assert_eq!(expected, job.to_hash()); +/// } +/// ``` +pub fn hash_many<'a, 'b, I>(hash_many_jobs: I) +where + 'b: 'a, + I: IntoIterator<Item = &'a mut HashManyJob<'b>>, +{ + // Get the guts::Implementation from the first job, if any. + let mut peekable_jobs = hash_many_jobs.into_iter().peekable(); + let implementation = if let Some(job) = peekable_jobs.peek() { + job.implementation + } else { + // No work items, just short circuit. + return; + }; + + // In the jobs iterator, skip HashManyJobs that have already been run. 
This + // is less because we actually expect callers to call hash_many twice + // (though they're allowed to if they want), and more because + // HashManyJob::new might need to finalize if there are key bytes but no + // input. Tying the job lifetime to the Params reference is an alternative, + // but I've found it too constraining in practice. We could also put key + // bytes in every HashManyJob, but that would add unnecessary storage and + // zeroing for all callers. + let unfinished_jobs = peekable_jobs.into_iter().filter(|j| !j.finished); + let jobs = unfinished_jobs.map(|j| { + j.finished = true; + Job { + input: j.input, + words: &mut j.words, + count: j.count, + last_node: j.last_node, + } + }); + compress_many(jobs, implementation, Finalize::Yes, Stride::Serial); +} + +#[cfg(test)] +mod test { + use super::*; + use crate::guts; + use crate::paint_test_input; + use crate::BLOCKBYTES; + use arrayvec::ArrayVec; + + #[test] + fn test_degree() { + assert!(degree() <= MAX_DEGREE); + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("avx2") { + assert!(degree() >= 4); + } + if is_x86_feature_detected!("sse4.1") { + assert!(degree() >= 2); + } + } + } + + #[test] + fn test_hash_many() { + // Use a length of inputs that will exercise all of the power-of-two loops. + const LEN: usize = 2 * guts::MAX_DEGREE - 1; + + // Rerun LEN inputs LEN different times, with the empty input starting in a + // different spot each time. + let mut input = [0; LEN * BLOCKBYTES]; + paint_test_input(&mut input); + for start_offset in 0..LEN { + let mut inputs: [&[u8]; LEN] = [&[]; LEN]; + for i in 0..LEN { + let chunks = (i + start_offset) % LEN; + inputs[i] = &input[..chunks * BLOCKBYTES]; + } + + let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new(); + for i in 0..LEN { + let mut p = Params::new(); + p.node_offset(i as u64); + if i % 2 == 1 { + p.last_node(true); + p.key(b"foo"); + } + params.push(p); + } + + let mut jobs: ArrayVec<[HashManyJob; LEN]> = ArrayVec::new(); + for i in 0..LEN { + jobs.push(HashManyJob::new(¶ms[i], inputs[i])); + } + + hash_many(&mut jobs); + + // Check the outputs. + for i in 0..LEN { + let expected = params[i].hash(inputs[i]); + assert_eq!(expected, jobs[i].to_hash()); + } + } + } + + #[test] + fn test_update_many() { + // Use a length of inputs that will exercise all of the power-of-two loops. + const LEN: usize = 2 * guts::MAX_DEGREE - 1; + + // Rerun LEN inputs LEN different times, with the empty input starting in a + // different spot each time. + let mut input = [0; LEN * BLOCKBYTES]; + paint_test_input(&mut input); + for start_offset in 0..LEN { + let mut inputs: [&[u8]; LEN] = [&[]; LEN]; + for i in 0..LEN { + let chunks = (i + start_offset) % LEN; + inputs[i] = &input[..chunks * BLOCKBYTES]; + } + + let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new(); + for i in 0..LEN { + let mut p = Params::new(); + p.node_offset(i as u64); + if i % 2 == 1 { + p.last_node(true); + p.key(b"foo"); + } + params.push(p); + } + + let mut states: ArrayVec<[State; LEN]> = ArrayVec::new(); + for i in 0..LEN { + states.push(params[i].to_state()); + } + + // Run each input twice through, to exercise buffering. + update_many(states.iter_mut().zip(inputs.iter())); + update_many(states.iter_mut().zip(inputs.iter())); + + // Check the outputs. + for i in 0..LEN { + let mut reference_state = params[i].to_state(); + // Again, run the input twice. 
+ reference_state.update(inputs[i]); + reference_state.update(inputs[i]); + assert_eq!(reference_state.finalize(), states[i].finalize()); + assert_eq!(2 * inputs[i].len() as Count, states[i].count()); + } + } + } +} diff --git a/third_party/rust/blake2b_simd/src/portable.rs b/third_party/rust/blake2b_simd/src/portable.rs new file mode 100644 index 0000000000..5a667ef2a8 --- /dev/null +++ b/third_party/rust/blake2b_simd/src/portable.rs @@ -0,0 +1,166 @@ +use arrayref::{array_ref, array_refs}; + +use super::*; +use crate::guts::{ + count_high, count_low, final_block, flag_word, input_debug_asserts, Finalize, LastNode, Stride, +}; + +// G is the mixing function, called eight times per round in the compression +// function. V is the 16-word state vector of the compression function, usually +// described as a 4x4 matrix. A, B, C, and D are the mixing indices, set by the +// caller first to the four columns of V, and then to its four diagonals. X and +// Y are words of input, chosen by the caller according to the message +// schedule, SIGMA. +#[inline(always)] +fn g(v: &mut [Word; 16], a: usize, b: usize, c: usize, d: usize, x: Word, y: Word) { + v[a] = v[a].wrapping_add(v[b]).wrapping_add(x); + v[d] = (v[d] ^ v[a]).rotate_right(32); + v[c] = v[c].wrapping_add(v[d]); + v[b] = (v[b] ^ v[c]).rotate_right(24); + v[a] = v[a].wrapping_add(v[b]).wrapping_add(y); + v[d] = (v[d] ^ v[a]).rotate_right(16); + v[c] = v[c].wrapping_add(v[d]); + v[b] = (v[b] ^ v[c]).rotate_right(63); +} + +#[inline(always)] +fn round(r: usize, m: &[Word; 16], v: &mut [Word; 16]) { + // Select the message schedule based on the round. + let s = SIGMA[r]; + + // Mix the columns. + g(v, 0, 4, 8, 12, m[s[0] as usize], m[s[1] as usize]); + g(v, 1, 5, 9, 13, m[s[2] as usize], m[s[3] as usize]); + g(v, 2, 6, 10, 14, m[s[4] as usize], m[s[5] as usize]); + g(v, 3, 7, 11, 15, m[s[6] as usize], m[s[7] as usize]); + + // Mix the rows. + g(v, 0, 5, 10, 15, m[s[8] as usize], m[s[9] as usize]); + g(v, 1, 6, 11, 12, m[s[10] as usize], m[s[11] as usize]); + g(v, 2, 7, 8, 13, m[s[12] as usize], m[s[13] as usize]); + g(v, 3, 4, 9, 14, m[s[14] as usize], m[s[15] as usize]); +} + +#[inline(always)] +fn compress_block( + block: &[u8; BLOCKBYTES], + words: &mut [Word; 8], + count: Count, + last_block: Word, + last_node: Word, +) { + // Initialize the compression state. + let mut v = [ + words[0], + words[1], + words[2], + words[3], + words[4], + words[5], + words[6], + words[7], + IV[0], + IV[1], + IV[2], + IV[3], + IV[4] ^ count_low(count), + IV[5] ^ count_high(count), + IV[6] ^ last_block, + IV[7] ^ last_node, + ]; + + // Parse the message bytes as ints in little endian order. 
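+    // (Little-endian means the first byte is least significant: a block
+    // beginning 01 00 00 00 00 00 00 00 parses to m[0] == 1.)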
+ const W: usize = size_of::<Word>(); + let msg_refs = array_refs!(block, W, W, W, W, W, W, W, W, W, W, W, W, W, W, W, W); + let m = [ + Word::from_le_bytes(*msg_refs.0), + Word::from_le_bytes(*msg_refs.1), + Word::from_le_bytes(*msg_refs.2), + Word::from_le_bytes(*msg_refs.3), + Word::from_le_bytes(*msg_refs.4), + Word::from_le_bytes(*msg_refs.5), + Word::from_le_bytes(*msg_refs.6), + Word::from_le_bytes(*msg_refs.7), + Word::from_le_bytes(*msg_refs.8), + Word::from_le_bytes(*msg_refs.9), + Word::from_le_bytes(*msg_refs.10), + Word::from_le_bytes(*msg_refs.11), + Word::from_le_bytes(*msg_refs.12), + Word::from_le_bytes(*msg_refs.13), + Word::from_le_bytes(*msg_refs.14), + Word::from_le_bytes(*msg_refs.15), + ]; + + round(0, &m, &mut v); + round(1, &m, &mut v); + round(2, &m, &mut v); + round(3, &m, &mut v); + round(4, &m, &mut v); + round(5, &m, &mut v); + round(6, &m, &mut v); + round(7, &m, &mut v); + round(8, &m, &mut v); + round(9, &m, &mut v); + round(10, &m, &mut v); + round(11, &m, &mut v); + + words[0] ^= v[0] ^ v[8]; + words[1] ^= v[1] ^ v[9]; + words[2] ^= v[2] ^ v[10]; + words[3] ^= v[3] ^ v[11]; + words[4] ^= v[4] ^ v[12]; + words[5] ^= v[5] ^ v[13]; + words[6] ^= v[6] ^ v[14]; + words[7] ^= v[7] ^ v[15]; +} + +pub fn compress1_loop( + input: &[u8], + words: &mut [Word; 8], + mut count: Count, + last_node: LastNode, + finalize: Finalize, + stride: Stride, +) { + input_debug_asserts(input, finalize); + + let mut local_words = *words; + + let mut fin_offset = input.len().saturating_sub(1); + fin_offset -= fin_offset % stride.padded_blockbytes(); + let mut buf = [0; BLOCKBYTES]; + let (fin_block, fin_len, _) = final_block(input, fin_offset, &mut buf, stride); + let fin_last_block = flag_word(finalize.yes()); + let fin_last_node = flag_word(finalize.yes() && last_node.yes()); + + let mut offset = 0; + loop { + let block; + let count_delta; + let last_block; + let last_node; + if offset == fin_offset { + block = fin_block; + count_delta = fin_len; + last_block = fin_last_block; + last_node = fin_last_node; + } else { + block = array_ref!(input, offset, BLOCKBYTES); + count_delta = BLOCKBYTES; + last_block = flag_word(false); + last_node = flag_word(false); + }; + + count = count.wrapping_add(count_delta as Count); + compress_block(block, &mut local_words, count, last_block, last_node); + + // Check for termination before bumping the offset, to avoid overflow. + if offset == fin_offset { + break; + } + + offset += stride.padded_blockbytes(); + } + + *words = local_words; +} diff --git a/third_party/rust/blake2b_simd/src/sse41.rs b/third_party/rust/blake2b_simd/src/sse41.rs new file mode 100644 index 0000000000..3a55dc8c35 --- /dev/null +++ b/third_party/rust/blake2b_simd/src/sse41.rs @@ -0,0 +1,454 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::guts::{ + assemble_count, count_high, count_low, final_block, flag_word, input_debug_asserts, Finalize, + Job, Stride, +}; +use crate::{Word, BLOCKBYTES, IV, SIGMA}; +use arrayref::{array_refs, mut_array_refs}; +use core::cmp; +use core::mem; + +pub const DEGREE: usize = 2; + +#[inline(always)] +unsafe fn loadu(src: *const [Word; DEGREE]) -> __m128i { + // This is an unaligned load, so the pointer cast is allowed. + _mm_loadu_si128(src as *const __m128i) +} + +#[inline(always)] +unsafe fn storeu(src: __m128i, dest: *mut [Word; DEGREE]) { + // This is an unaligned store, so the pointer cast is allowed. 
+ _mm_storeu_si128(dest as *mut __m128i, src) +} + +#[inline(always)] +unsafe fn add(a: __m128i, b: __m128i) -> __m128i { + _mm_add_epi64(a, b) +} + +#[inline(always)] +unsafe fn eq(a: __m128i, b: __m128i) -> __m128i { + _mm_cmpeq_epi64(a, b) +} + +#[inline(always)] +unsafe fn and(a: __m128i, b: __m128i) -> __m128i { + _mm_and_si128(a, b) +} + +#[inline(always)] +unsafe fn negate_and(a: __m128i, b: __m128i) -> __m128i { + // Note that "and not" implies the reverse of the actual arg order. + _mm_andnot_si128(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u64) -> __m128i { + _mm_set1_epi64x(x as i64) +} + +#[inline(always)] +unsafe fn set2(a: u64, b: u64) -> __m128i { + // There's no _mm_setr_epi64x, so note the arg order is backwards. + _mm_set_epi64x(b as i64, a as i64) +} + +// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. +macro_rules! _MM_SHUFFLE { + ($z:expr, $y:expr, $x:expr, $w:expr) => { + ($z << 6) | ($y << 4) | ($x << 2) | $w + }; +} + +#[inline(always)] +unsafe fn rot32(x: __m128i) -> __m128i { + _mm_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1)) +} + +#[inline(always)] +unsafe fn rot24(x: __m128i) -> __m128i { + let rotate24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); + _mm_shuffle_epi8(x, rotate24) +} + +#[inline(always)] +unsafe fn rot16(x: __m128i) -> __m128i { + let rotate16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + _mm_shuffle_epi8(x, rotate16) +} + +#[inline(always)] +unsafe fn rot63(x: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi64(x, 63), add(x, x)) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { + v[0] = add(v[0], m[SIGMA[r][0] as usize]); + v[1] = add(v[1], m[SIGMA[r][2] as usize]); + v[2] = add(v[2], m[SIGMA[r][4] as usize]); + v[3] = add(v[3], m[SIGMA[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot32(v[12]); + v[13] = rot32(v[13]); + v[14] = rot32(v[14]); + v[15] = rot32(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot24(v[4]); + v[5] = rot24(v[5]); + v[6] = rot24(v[6]); + v[7] = rot24(v[7]); + v[0] = add(v[0], m[SIGMA[r][1] as usize]); + v[1] = add(v[1], m[SIGMA[r][3] as usize]); + v[2] = add(v[2], m[SIGMA[r][5] as usize]); + v[3] = add(v[3], m[SIGMA[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot63(v[4]); + v[5] = rot63(v[5]); + v[6] = rot63(v[6]); + v[7] = rot63(v[7]); + + v[0] = add(v[0], m[SIGMA[r][8] as usize]); + v[1] = add(v[1], m[SIGMA[r][10] as usize]); + v[2] = add(v[2], m[SIGMA[r][12] as usize]); + v[3] = add(v[3], m[SIGMA[r][14] as usize]); + 
v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot32(v[15]); + v[12] = rot32(v[12]); + v[13] = rot32(v[13]); + v[14] = rot32(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot24(v[5]); + v[6] = rot24(v[6]); + v[7] = rot24(v[7]); + v[4] = rot24(v[4]); + v[0] = add(v[0], m[SIGMA[r][9] as usize]); + v[1] = add(v[1], m[SIGMA[r][11] as usize]); + v[2] = add(v[2], m[SIGMA[r][13] as usize]); + v[3] = add(v[3], m[SIGMA[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot63(v[5]); + v[6] = rot63(v[6]); + v[7] = rot63(v[7]); + v[4] = rot63(v[4]); +} + +// We'd rather make this a regular function with #[inline(always)], but for +// some reason that blows up compile times by about 10 seconds, at least in +// some cases (BLAKE2b avx2.rs). This macro seems to get the same performance +// result, without the compile time issue. +macro_rules! compress2_transposed { + ( + $h_vecs:expr, + $msg_vecs:expr, + $count_low:expr, + $count_high:expr, + $lastblock:expr, + $lastnode:expr, + ) => { + let h_vecs: &mut [__m128i; 8] = $h_vecs; + let msg_vecs: &[__m128i; 16] = $msg_vecs; + let count_low: __m128i = $count_low; + let count_high: __m128i = $count_high; + let lastblock: __m128i = $lastblock; + let lastnode: __m128i = $lastnode; + let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + xor(set1(IV[4]), count_low), + xor(set1(IV[5]), count_high), + xor(set1(IV[6]), lastblock), + xor(set1(IV[7]), lastnode), + ]; + + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + round(&mut v, &msg_vecs, 7); + round(&mut v, &msg_vecs, 8); + round(&mut v, &msg_vecs, 9); + round(&mut v, &msg_vecs, 10); + round(&mut v, &msg_vecs, 11); + + h_vecs[0] = xor(xor(h_vecs[0], v[0]), v[8]); + h_vecs[1] = xor(xor(h_vecs[1], v[1]), v[9]); + h_vecs[2] = xor(xor(h_vecs[2], v[2]), v[10]); + h_vecs[3] = xor(xor(h_vecs[3], v[3]), v[11]); + h_vecs[4] = xor(xor(h_vecs[4], v[4]), v[12]); + h_vecs[5] = xor(xor(h_vecs[5], v[5]), v[13]); + h_vecs[6] = xor(xor(h_vecs[6], v[6]), v[14]); + h_vecs[7] = xor(xor(h_vecs[7], v[7]), v[15]); + }; +} + +#[inline(always)] +unsafe fn transpose_vecs(a: __m128i, b: __m128i) -> [__m128i; DEGREE] { + let a_words: [Word; DEGREE] = mem::transmute(a); + let b_words: [Word; DEGREE] = mem::transmute(b); + [set2(a_words[0], b_words[0]), set2(a_words[1], b_words[1])] +} + +#[inline(always)] +unsafe fn transpose_state_vecs(jobs: &[Job; DEGREE]) -> [__m128i; 8] { + // Load all the 
state words into transposed vectors, where the first vector
+ // has the first word of each state, etc. Transposing once at the beginning
+ // and once at the end is more efficient than repeating it for each block.
+ let words0 = array_refs!(&jobs[0].words, DEGREE, DEGREE, DEGREE, DEGREE);
+ let words1 = array_refs!(&jobs[1].words, DEGREE, DEGREE, DEGREE, DEGREE);
+ let [h0, h1] = transpose_vecs(loadu(words0.0), loadu(words1.0));
+ let [h2, h3] = transpose_vecs(loadu(words0.1), loadu(words1.1));
+ let [h4, h5] = transpose_vecs(loadu(words0.2), loadu(words1.2));
+ let [h6, h7] = transpose_vecs(loadu(words0.3), loadu(words1.3));
+ [h0, h1, h2, h3, h4, h5, h6, h7]
+}
+
+#[inline(always)]
+unsafe fn untranspose_state_vecs(h_vecs: &[__m128i; 8], jobs: &mut [Job; DEGREE]) {
+ // Un-transpose the updated state vectors back into the caller's arrays.
+ let [job0, job1] = jobs;
+ let words0 = mut_array_refs!(&mut job0.words, DEGREE, DEGREE, DEGREE, DEGREE);
+ let words1 = mut_array_refs!(&mut job1.words, DEGREE, DEGREE, DEGREE, DEGREE);
+
+ let out = transpose_vecs(h_vecs[0], h_vecs[1]);
+ storeu(out[0], words0.0);
+ storeu(out[1], words1.0);
+ let out = transpose_vecs(h_vecs[2], h_vecs[3]);
+ storeu(out[0], words0.1);
+ storeu(out[1], words1.1);
+ let out = transpose_vecs(h_vecs[4], h_vecs[5]);
+ storeu(out[0], words0.2);
+ storeu(out[1], words1.2);
+ let out = transpose_vecs(h_vecs[6], h_vecs[7]);
+ storeu(out[0], words0.3);
+ storeu(out[1], words1.3);
+}
+
+#[inline(always)]
+unsafe fn transpose_msg_vecs(blocks: [*const [u8; BLOCKBYTES]; DEGREE]) -> [__m128i; 16] {
+ // These input arrays have no particular alignment, so we use unaligned
+ // loads to read from them.
+ let block0 = blocks[0] as *const [Word; DEGREE];
+ let block1 = blocks[1] as *const [Word; DEGREE];
+ let [m0, m1] = transpose_vecs(loadu(block0.add(0)), loadu(block1.add(0)));
+ let [m2, m3] = transpose_vecs(loadu(block0.add(1)), loadu(block1.add(1)));
+ let [m4, m5] = transpose_vecs(loadu(block0.add(2)), loadu(block1.add(2)));
+ let [m6, m7] = transpose_vecs(loadu(block0.add(3)), loadu(block1.add(3)));
+ let [m8, m9] = transpose_vecs(loadu(block0.add(4)), loadu(block1.add(4)));
+ let [m10, m11] = transpose_vecs(loadu(block0.add(5)), loadu(block1.add(5)));
+ let [m12, m13] = transpose_vecs(loadu(block0.add(6)), loadu(block1.add(6)));
+ let [m14, m15] = transpose_vecs(loadu(block0.add(7)), loadu(block1.add(7)));
+ [
+ m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15,
+ ]
+}
+
+#[inline(always)]
+unsafe fn load_counts(jobs: &[Job; DEGREE]) -> (__m128i, __m128i) {
+ (
+ set2(count_low(jobs[0].count), count_low(jobs[1].count)),
+ set2(count_high(jobs[0].count), count_high(jobs[1].count)),
+ )
+}
+
+#[inline(always)]
+unsafe fn store_counts(jobs: &mut [Job; DEGREE], low: __m128i, high: __m128i) {
+ let low_ints: [Word; DEGREE] = mem::transmute(low);
+ let high_ints: [Word; DEGREE] = mem::transmute(high);
+ for i in 0..DEGREE {
+ jobs[i].count = assemble_count(low_ints[i], high_ints[i]);
+ }
+}
+
+#[inline(always)]
+unsafe fn add_to_counts(lo: &mut __m128i, hi: &mut __m128i, delta: __m128i) {
+ // If the low counts reach zero, that means they wrapped, unless the delta
+ // was also zero.
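+ // Concretely, each lane computes hi_inc = 1 & (!delta_was_zero &
+ // lo_reached_zero), using compares and masks instead of branches, so a
+ // job's high count word is bumped exactly when its low word wrapped to
+ // zero on a nonzero delta.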
+ *lo = add(*lo, delta);
+ let lo_reached_zero = eq(*lo, set1(0));
+ let delta_was_zero = eq(delta, set1(0));
+ let hi_inc = and(set1(1), negate_and(delta_was_zero, lo_reached_zero));
+ *hi = add(*hi, hi_inc);
+}
+
+#[inline(always)]
+unsafe fn flags_vec(flags: [bool; DEGREE]) -> __m128i {
+ set2(flag_word(flags[0]), flag_word(flags[1]))
+}
+
+#[target_feature(enable = "sse4.1")]
+pub unsafe fn compress2_loop(jobs: &mut [Job; DEGREE], finalize: Finalize, stride: Stride) {
+ // If we're not finalizing, there can't be a partial block at the end.
+ for job in jobs.iter() {
+ input_debug_asserts(job.input, finalize);
+ }
+
+ let msg_ptrs = [jobs[0].input.as_ptr(), jobs[1].input.as_ptr()];
+ let mut h_vecs = transpose_state_vecs(&jobs);
+ let (mut counts_lo, mut counts_hi) = load_counts(&jobs);
+
+ // Prepare the final blocks (note that these could be empty if the input is
+ // empty). Do all this before entering the main loop.
+ let min_len = jobs.iter().map(|job| job.input.len()).min().unwrap();
+ let mut fin_offset = min_len.saturating_sub(1);
+ fin_offset -= fin_offset % stride.padded_blockbytes();
+ // Performance note: making these buffers mem::uninitialized() seems to
+ // cause problems in the optimizer.
+ let mut buf0: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let mut buf1: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let (block0, len0, finalize0) = final_block(jobs[0].input, fin_offset, &mut buf0, stride);
+ let (block1, len1, finalize1) = final_block(jobs[1].input, fin_offset, &mut buf1, stride);
+ let fin_blocks: [*const [u8; BLOCKBYTES]; DEGREE] = [block0, block1];
+ let fin_counts_delta = set2(len0 as Word, len1 as Word);
+ let fin_last_block;
+ let fin_last_node;
+ if finalize.yes() {
+ fin_last_block = flags_vec([finalize0, finalize1]);
+ fin_last_node = flags_vec([
+ finalize0 && jobs[0].last_node.yes(),
+ finalize1 && jobs[1].last_node.yes(),
+ ]);
+ } else {
+ fin_last_block = set1(0);
+ fin_last_node = set1(0);
+ }
+
+ // The main loop.
+ let mut offset = 0;
+ loop {
+ let blocks;
+ let counts_delta;
+ let last_block;
+ let last_node;
+ if offset == fin_offset {
+ blocks = fin_blocks;
+ counts_delta = fin_counts_delta;
+ last_block = fin_last_block;
+ last_node = fin_last_node;
+ } else {
+ blocks = [
+ msg_ptrs[0].add(offset) as *const [u8; BLOCKBYTES],
+ msg_ptrs[1].add(offset) as *const [u8; BLOCKBYTES],
+ ];
+ counts_delta = set1(BLOCKBYTES as Word);
+ last_block = set1(0);
+ last_node = set1(0);
+ };
+
+ let m_vecs = transpose_msg_vecs(blocks);
+ add_to_counts(&mut counts_lo, &mut counts_hi, counts_delta);
+ compress2_transposed!(
+ &mut h_vecs,
+ &m_vecs,
+ counts_lo,
+ counts_hi,
+ last_block,
+ last_node,
+ );
+
+ // Check for termination before bumping the offset, to avoid overflow.
+ if offset == fin_offset {
+ break;
+ }
+
+ offset += stride.padded_blockbytes();
+ }
+
+ // Write out the results.
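+ // Both lanes processed the same number of block offsets, but each job's
+ // input slice is advanced by at most its own length, so a shorter input
+ // is never over-consumed by the stride padding.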
+ untranspose_state_vecs(&h_vecs, &mut *jobs); + store_counts(&mut *jobs, counts_lo, counts_hi); + let max_consumed = offset.saturating_add(stride.padded_blockbytes()); + for job in jobs.iter_mut() { + let consumed = cmp::min(max_consumed, job.input.len()); + job.input = &job.input[consumed..]; + } +} diff --git a/third_party/rust/blake2b_simd/src/test.rs b/third_party/rust/blake2b_simd/src/test.rs new file mode 100644 index 0000000000..9ca8e87751 --- /dev/null +++ b/third_party/rust/blake2b_simd/src/test.rs @@ -0,0 +1,201 @@ +use super::*; + +const EMPTY_HASH: &str = "786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419\ + d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce"; +const ABC_HASH: &str = "ba80a53f981c4d0d6a2797b69f12f6e94c212f14685ac4b74b12bb6fdbffa2d1\ + 7d87c5392aab792dc252d5de4533cc9518d38aa8dbf1925ab92386edd4009923"; +const ONE_BLOCK_HASH: &str = "865939e120e6805438478841afb739ae4250cf372653078a065cdcfffca4caf7\ + 98e6d462b65d658fc165782640eded70963449ae1500fb0f24981d7727e22c41"; +const THOUSAND_HASH: &str = "1ee4e51ecab5210a518f26150e882627ec839967f19d763e1508b12cfefed148\ + 58f6a1c9d1f969bc224dc9440f5a6955277e755b9c513f9ba4421c5e50c8d787"; + +#[test] +fn test_update_state() { + let io = &[ + (&b""[..], EMPTY_HASH), + (&b"abc"[..], ABC_HASH), + (&[0; BLOCKBYTES], ONE_BLOCK_HASH), + (&[0; 1000], THOUSAND_HASH), + ]; + // Test each input all at once. + for &(input, output) in io { + let hash = blake2b(input); + assert_eq!(&hash.to_hex(), output, "hash mismatch"); + } + // Now in two chunks. This is especially important for the ONE_BLOCK case, because it would be + // a mistake for update() to call compress, even though the buffer is full. + for &(input, output) in io { + let mut state = State::new(); + let split = input.len() / 2; + state.update(&input[..split]); + assert_eq!(split as Count, state.count()); + state.update(&input[split..]); + assert_eq!(input.len() as Count, state.count()); + let hash = state.finalize(); + assert_eq!(&hash.to_hex(), output, "hash mismatch"); + } + // Now one byte at a time. 
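+ // Feeding a single byte per update() call exercises the internal
+ // buffering path, and verifies that count() stays in sync after every
+ // update.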
+ for &(input, output) in io {
+ let mut state = State::new();
+ let mut count = 0;
+ for &b in input {
+ state.update(&[b]);
+ count += 1;
+ assert_eq!(count, state.count());
+ }
+ let hash = state.finalize();
+ assert_eq!(&hash.to_hex(), output, "hash mismatch");
+ }
+}
+
+#[test]
+fn test_multiple_finalizes() {
+ let mut state = State::new();
+ assert_eq!(&state.finalize().to_hex(), EMPTY_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), EMPTY_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), EMPTY_HASH, "hash mismatch");
+ state.update(b"abc");
+ assert_eq!(&state.finalize().to_hex(), ABC_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), ABC_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), ABC_HASH, "hash mismatch");
+}
+
+#[cfg(feature = "std")]
+#[test]
+fn test_write() {
+ use std::io::prelude::*;
+
+ let mut state = State::new();
+ state.write_all(&[0; 1000]).unwrap();
+ let hash = state.finalize();
+ assert_eq!(&hash.to_hex(), THOUSAND_HASH, "hash mismatch");
+}
+
+// You can check this case against the equivalent Python:
+//
+// import hashlib
+// hashlib.blake2b(
+// b'foo',
+// digest_size=18,
+// key=b"bar",
+// salt=b"bazbazbazbazbazb",
+// person=b"bing bing bing b",
+// fanout=2,
+// depth=3,
+// leaf_size=0x04050607,
+// node_offset=0x08090a0b0c0d0e0f,
+// node_depth=16,
+// inner_size=17,
+// last_node=True,
+// ).hexdigest()
+#[test]
+fn test_all_parameters() {
+ let mut params = Params::new();
+ params
+ .hash_length(18)
+ // Make sure a shorter key properly overwrites a longer one.
+ .key(b"not the real key")
+ .key(b"bar")
+ .salt(b"bazbazbazbazbazb")
+ .personal(b"bing bing bing b")
+ .fanout(2)
+ .max_depth(3)
+ .max_leaf_length(0x04050607)
+ .node_offset(0x08090a0b0c0d0e0f)
+ .node_depth(16)
+ .inner_hash_length(17)
+ .last_node(true);
+
+ // Check the State API.
+ assert_eq!(
+ "ec0f59cb65f92e7fcca1280ba859a6925ded",
+ &params.to_state().update(b"foo").finalize().to_hex()
+ );
+
+ // Check the all-at-once API.
+ assert_eq!(
+ "ec0f59cb65f92e7fcca1280ba859a6925ded",
+ &params.hash(b"foo").to_hex()
+ );
+}
+
+#[test]
+fn test_all_parameters_blake2bp() {
+ let mut params = blake2bp::Params::new();
+ params
+ .hash_length(18)
+ // Make sure a shorter key properly overwrites a longer one.
+ .key(b"not the real key")
+ .key(b"bar");
+
+ // Check the State API.
+ assert_eq!(
+ "8c54e888a8a01c63da6585c058fe54ea81df",
+ &params.to_state().update(b"foo").finalize().to_hex()
+ );
+
+ // Check the all-at-once API.
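+ // With identical parameters, the all-at-once blake2bp::Params::hash()
+ // must agree with the incremental State result above.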
+ assert_eq!(
+ "8c54e888a8a01c63da6585c058fe54ea81df",
+ &params.hash(b"foo").to_hex()
+ );
+}
+
+#[test]
+#[should_panic]
+fn test_short_hash_length_panics() {
+ Params::new().hash_length(0);
+}
+
+#[test]
+#[should_panic]
+fn test_long_hash_length_panics() {
+ Params::new().hash_length(OUTBYTES + 1);
+}
+
+#[test]
+#[should_panic]
+fn test_long_key_panics() {
+ Params::new().key(&[0; KEYBYTES + 1]);
+}
+
+#[test]
+#[should_panic]
+fn test_long_salt_panics() {
+ Params::new().salt(&[0; SALTBYTES + 1]);
+}
+
+#[test]
+#[should_panic]
+fn test_long_personal_panics() {
+ Params::new().personal(&[0; PERSONALBYTES + 1]);
+}
+
+#[test]
+fn test_zero_max_depth_supported() {
+ Params::new().max_depth(0);
+}
+
+#[test]
+#[should_panic]
+fn test_long_inner_hash_length_panics() {
+ Params::new().inner_hash_length(OUTBYTES + 1);
+}
+
+#[test]
+#[should_panic]
+fn test_blake2bp_short_hash_length_panics() {
+ blake2bp::Params::new().hash_length(0);
+}
+
+#[test]
+#[should_panic]
+fn test_blake2bp_long_hash_length_panics() {
+ blake2bp::Params::new().hash_length(OUTBYTES + 1);
+}
+
+#[test]
+#[should_panic]
+fn test_blake2bp_long_key_panics() {
+ blake2bp::Params::new().key(&[0; KEYBYTES + 1]);
+}