summaryrefslogtreecommitdiffstats
path: root/third_party/rust/blake2b_simd
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/blake2b_simd')
-rw-r--r--third_party/rust/blake2b_simd/.cargo-checksum.json1
-rw-r--r--third_party/rust/blake2b_simd/Cargo.toml35
-rw-r--r--third_party/rust/blake2b_simd/README.md42
-rw-r--r--third_party/rust/blake2b_simd/src/avx2.rs928
-rw-r--r--third_party/rust/blake2b_simd/src/blake2bp.rs570
-rw-r--r--third_party/rust/blake2b_simd/src/guts.rs565
-rw-r--r--third_party/rust/blake2b_simd/src/lib.rs674
-rw-r--r--third_party/rust/blake2b_simd/src/many.rs529
-rw-r--r--third_party/rust/blake2b_simd/src/portable.rs166
-rw-r--r--third_party/rust/blake2b_simd/src/sse41.rs454
-rw-r--r--third_party/rust/blake2b_simd/src/test.rs201
11 files changed, 4165 insertions, 0 deletions
diff --git a/third_party/rust/blake2b_simd/.cargo-checksum.json b/third_party/rust/blake2b_simd/.cargo-checksum.json
new file mode 100644
index 0000000000..bbeeada8d6
--- /dev/null
+++ b/third_party/rust/blake2b_simd/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"Cargo.toml":"648c10063fa1a16a961df45f194f50982bdf3d41d04586a48d2cc6d69e0252c1","README.md":"2253eba78d5af06642073c5dfd41253fb8be73d3a0e823bc3d7642c9d0ad0c6c","src/avx2.rs":"a97ec761e4e7f70ff6311f4c1e67cb5136ac66cfc51bc49525b81f9e23814d81","src/blake2bp.rs":"83577d4a22db3b92030d9bd4563aa9ad440f23c64a6ad5f10a9d709f22d50589","src/guts.rs":"1189cab87b18eaaf2abd5bcb3d7d799c75401a312cee6f1f65fdaad30203eb6f","src/lib.rs":"67723a3abc30dc7f3d488f434ced884b5ce962a807991c8f1cc9940df869c342","src/many.rs":"60d07e4d7ad63949fb5432ad05f7c6a525a3eee39d325f7d4e65e901b466be95","src/portable.rs":"c47baa15b311bc95d49f3d189111fe45756fb7d623a1f48f0050ae591817aedf","src/sse41.rs":"7a644b1056b804ada9ddc7586552a4a5c769e576d610ffe7ec74065f7eaff491","src/test.rs":"1685eec6fedc30fca1332cbb78c85e6c9b56eca962b6c6343c91ba69eefac754"},"package":"b83b7baab1e671718d78204225800d6b170e648188ac7dc992e9d6bddf87d0c0"} \ No newline at end of file
diff --git a/third_party/rust/blake2b_simd/Cargo.toml b/third_party/rust/blake2b_simd/Cargo.toml
new file mode 100644
index 0000000000..ca92a4cc2d
--- /dev/null
+++ b/third_party/rust/blake2b_simd/Cargo.toml
@@ -0,0 +1,35 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+edition = "2018"
+name = "blake2b_simd"
+version = "0.5.9"
+authors = ["Jack O'Connor"]
+description = "a pure Rust BLAKE2b implementation with dynamic SIMD"
+documentation = "https://docs.rs/blake2b_simd"
+readme = "README.md"
+license = "MIT"
+repository = "https://github.com/oconnor663/blake2_simd"
+[dependencies.arrayref]
+version = "0.3.5"
+
+[dependencies.arrayvec]
+version = "0.5.0"
+default-features = false
+
+[dependencies.constant_time_eq]
+version = "0.1.3"
+
+[features]
+default = ["std"]
+std = []
diff --git a/third_party/rust/blake2b_simd/README.md b/third_party/rust/blake2b_simd/README.md
new file mode 100644
index 0000000000..54caef739d
--- /dev/null
+++ b/third_party/rust/blake2b_simd/README.md
@@ -0,0 +1,42 @@
+# blake2b_simd [![GitHub](https://img.shields.io/github/tag/oconnor663/blake2_simd.svg?label=GitHub)](https://github.com/oconnor663/blake2_simd) [![crates.io](https://img.shields.io/crates/v/blake2b_simd.svg)](https://crates.io/crates/blake2b_simd) [![Build Status](https://travis-ci.org/oconnor663/blake2_simd.svg?branch=master)](https://travis-ci.org/oconnor663/blake2_simd)
+
+An implementation of the BLAKE2b and BLAKE2bp hash functions. See also
+[`blake2s_simd`](../blake2s).
+
+This crate includes:
+
+- 100% stable Rust.
+- SIMD implementations based on Samuel Neves' [`blake2-avx2`](https://github.com/sneves/blake2-avx2).
+ These are very fast. For benchmarks, see [the Performance section of the
+ README](https://github.com/oconnor663/blake2_simd#performance).
+- Portable, safe implementations for other platforms.
+- Dynamic CPU feature detection. Binaries include multiple implementations by default and
+ choose the fastest one the processor supports at runtime.
+- All the features from the [the BLAKE2 spec](https://blake2.net/blake2.pdf), like adjustable
+ length, keying, and associated data for tree hashing.
+- `no_std` support. The `std` Cargo feature is on by default, for CPU feature detection and
+ for implementing `std::io::Write`.
+- Support for computing multiple BLAKE2b hashes in parallel, matching the efficiency of
+ BLAKE2bp. See the [`many`](many/index.html) module.
+
+# Example
+
+```
+use blake2b_simd::{blake2b, Params};
+
+let expected = "ca002330e69d3e6b84a46a56a6533fd79d51d97a3bb7cad6c2ff43b354185d6d\
+ c1e723fb3db4ae0737e120378424c714bb982d9dc5bbd7a0ab318240ddd18f8d";
+let hash = blake2b(b"foo");
+assert_eq!(expected, &hash.to_hex());
+
+let hash = Params::new()
+ .hash_length(16)
+ .key(b"The Magic Words are Squeamish Ossifrage")
+ .personal(b"L. P. Waterhouse")
+ .to_state()
+ .update(b"foo")
+ .update(b"bar")
+ .update(b"baz")
+ .finalize();
+assert_eq!("ee8ff4e9be887297cf79348dc35dab56", &hash.to_hex());
+```
diff --git a/third_party/rust/blake2b_simd/src/avx2.rs b/third_party/rust/blake2b_simd/src/avx2.rs
new file mode 100644
index 0000000000..268dd82d36
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/avx2.rs
@@ -0,0 +1,928 @@
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+use crate::guts::{
+ assemble_count, count_high, count_low, final_block, flag_word, input_debug_asserts, Finalize,
+ Job, LastNode, Stride,
+};
+use crate::{Count, Word, BLOCKBYTES, IV, SIGMA};
+use arrayref::{array_refs, mut_array_refs};
+use core::cmp;
+use core::mem;
+
+pub const DEGREE: usize = 4;
+
+#[inline(always)]
+unsafe fn loadu(src: *const [Word; DEGREE]) -> __m256i {
+ // This is an unaligned load, so the pointer cast is allowed.
+ _mm256_loadu_si256(src as *const __m256i)
+}
+
+#[inline(always)]
+unsafe fn storeu(src: __m256i, dest: *mut [Word; DEGREE]) {
+ // This is an unaligned store, so the pointer cast is allowed.
+ _mm256_storeu_si256(dest as *mut __m256i, src)
+}
+
+#[inline(always)]
+unsafe fn loadu_128(mem_addr: &[u8; 16]) -> __m128i {
+ _mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
+}
+
+#[inline(always)]
+unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
+ _mm256_add_epi64(a, b)
+}
+
+#[inline(always)]
+unsafe fn eq(a: __m256i, b: __m256i) -> __m256i {
+ _mm256_cmpeq_epi64(a, b)
+}
+
+#[inline(always)]
+unsafe fn and(a: __m256i, b: __m256i) -> __m256i {
+ _mm256_and_si256(a, b)
+}
+
+#[inline(always)]
+unsafe fn negate_and(a: __m256i, b: __m256i) -> __m256i {
+ // Note that "and not" implies the reverse of the actual arg order.
+ _mm256_andnot_si256(a, b)
+}
+
+#[inline(always)]
+unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
+ _mm256_xor_si256(a, b)
+}
+
+#[inline(always)]
+unsafe fn set1(x: u64) -> __m256i {
+ _mm256_set1_epi64x(x as i64)
+}
+
+#[inline(always)]
+unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
+ _mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
+}
+
+// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
+macro_rules! _MM_SHUFFLE {
+ ($z:expr, $y:expr, $x:expr, $w:expr) => {
+ ($z << 6) | ($y << 4) | ($x << 2) | $w
+ };
+}
+
+#[inline(always)]
+unsafe fn rot32(x: __m256i) -> __m256i {
+ _mm256_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1))
+}
+
+#[inline(always)]
+unsafe fn rot24(x: __m256i) -> __m256i {
+ let rotate24 = _mm256_setr_epi8(
+ 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13,
+ 14, 15, 8, 9, 10,
+ );
+ _mm256_shuffle_epi8(x, rotate24)
+}
+
+#[inline(always)]
+unsafe fn rot16(x: __m256i) -> __m256i {
+ let rotate16 = _mm256_setr_epi8(
+ 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12,
+ 13, 14, 15, 8, 9,
+ );
+ _mm256_shuffle_epi8(x, rotate16)
+}
+
+#[inline(always)]
+unsafe fn rot63(x: __m256i) -> __m256i {
+ _mm256_or_si256(_mm256_srli_epi64(x, 63), add(x, x))
+}
+
+#[inline(always)]
+unsafe fn g1(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
+ *a = add(*a, *m);
+ *a = add(*a, *b);
+ *d = xor(*d, *a);
+ *d = rot32(*d);
+ *c = add(*c, *d);
+ *b = xor(*b, *c);
+ *b = rot24(*b);
+}
+
+#[inline(always)]
+unsafe fn g2(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
+ *a = add(*a, *m);
+ *a = add(*a, *b);
+ *d = xor(*d, *a);
+ *d = rot16(*d);
+ *c = add(*c, *d);
+ *b = xor(*b, *c);
+ *b = rot63(*b);
+}
+
+// Note the optimization here of leaving b as the unrotated row, rather than a.
+// All the message loads below are adjusted to compensate for this. See
+// discussion at https://github.com/sneves/blake2-avx2/pull/4
+#[inline(always)]
+unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
+ *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
+ *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
+ *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
+}
+
+#[inline(always)]
+unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
+ *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
+ *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
+ *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
+}
+
+#[inline(always)]
+unsafe fn compress_block(
+ block: &[u8; BLOCKBYTES],
+ words: &mut [Word; 8],
+ count: Count,
+ last_block: Word,
+ last_node: Word,
+) {
+ let (words_low, words_high) = mut_array_refs!(words, DEGREE, DEGREE);
+ let (iv_low, iv_high) = array_refs!(&IV, DEGREE, DEGREE);
+ let mut a = loadu(words_low);
+ let mut b = loadu(words_high);
+ let mut c = loadu(iv_low);
+ let flags = set4(count_low(count), count_high(count), last_block, last_node);
+ let mut d = xor(loadu(iv_high), flags);
+
+ let msg_chunks = array_refs!(block, 16, 16, 16, 16, 16, 16, 16, 16);
+ let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
+ let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
+ let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
+ let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
+ let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
+ let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
+ let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
+ let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));
+
+ let iv0 = a;
+ let iv1 = b;
+ let mut t0;
+ let mut t1;
+ let mut b0;
+
+ // round 1
+ t0 = _mm256_unpacklo_epi64(m0, m1);
+ t1 = _mm256_unpacklo_epi64(m2, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m0, m1);
+ t1 = _mm256_unpackhi_epi64(m2, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpacklo_epi64(m7, m4);
+ t1 = _mm256_unpacklo_epi64(m5, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m7, m4);
+ t1 = _mm256_unpackhi_epi64(m5, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 2
+ t0 = _mm256_unpacklo_epi64(m7, m2);
+ t1 = _mm256_unpackhi_epi64(m4, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m5, m4);
+ t1 = _mm256_alignr_epi8(m3, m7, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpackhi_epi64(m2, m0);
+ t1 = _mm256_blend_epi32(m5, m0, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_alignr_epi8(m6, m1, 8);
+ t1 = _mm256_blend_epi32(m3, m1, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 3
+ t0 = _mm256_alignr_epi8(m6, m5, 8);
+ t1 = _mm256_unpackhi_epi64(m2, m7);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m4, m0);
+ t1 = _mm256_blend_epi32(m6, m1, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m5, m4, 8);
+ t1 = _mm256_unpackhi_epi64(m1, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m2, m7);
+ t1 = _mm256_blend_epi32(m0, m3, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 4
+ t0 = _mm256_unpackhi_epi64(m3, m1);
+ t1 = _mm256_unpackhi_epi64(m6, m5);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m4, m0);
+ t1 = _mm256_unpacklo_epi64(m6, m7);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m1, m7, 8);
+ t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m4, m3);
+ t1 = _mm256_unpacklo_epi64(m5, m0);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 5
+ t0 = _mm256_unpackhi_epi64(m4, m2);
+ t1 = _mm256_unpacklo_epi64(m1, m5);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_blend_epi32(m3, m0, 0x33);
+ t1 = _mm256_blend_epi32(m7, m2, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m7, m1, 8);
+ t1 = _mm256_alignr_epi8(m3, m5, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m6, m0);
+ t1 = _mm256_unpacklo_epi64(m6, m4);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 6
+ t0 = _mm256_unpacklo_epi64(m1, m3);
+ t1 = _mm256_unpacklo_epi64(m0, m4);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m6, m5);
+ t1 = _mm256_unpackhi_epi64(m5, m1);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m2, m0, 8);
+ t1 = _mm256_unpackhi_epi64(m3, m7);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m4, m6);
+ t1 = _mm256_alignr_epi8(m7, m2, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 7
+ t0 = _mm256_blend_epi32(m0, m6, 0x33);
+ t1 = _mm256_unpacklo_epi64(m7, m2);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m2, m7);
+ t1 = _mm256_alignr_epi8(m5, m6, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpacklo_epi64(m4, m0);
+ t1 = _mm256_blend_epi32(m4, m3, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m5, m3);
+ t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 8
+ t0 = _mm256_unpackhi_epi64(m6, m3);
+ t1 = _mm256_blend_epi32(m1, m6, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_alignr_epi8(m7, m5, 8);
+ t1 = _mm256_unpackhi_epi64(m0, m4);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_blend_epi32(m2, m1, 0x33);
+ t1 = _mm256_alignr_epi8(m4, m7, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m5, m0);
+ t1 = _mm256_unpacklo_epi64(m2, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 9
+ t0 = _mm256_unpacklo_epi64(m3, m7);
+ t1 = _mm256_alignr_epi8(m0, m5, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m7, m4);
+ t1 = _mm256_alignr_epi8(m4, m1, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpacklo_epi64(m5, m6);
+ t1 = _mm256_unpackhi_epi64(m6, m0);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_alignr_epi8(m1, m2, 8);
+ t1 = _mm256_alignr_epi8(m2, m3, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 10
+ t0 = _mm256_unpacklo_epi64(m5, m4);
+ t1 = _mm256_unpackhi_epi64(m3, m0);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m1, m2);
+ t1 = _mm256_blend_epi32(m2, m3, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpackhi_epi64(m6, m7);
+ t1 = _mm256_unpackhi_epi64(m4, m1);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_blend_epi32(m5, m0, 0x33);
+ t1 = _mm256_unpacklo_epi64(m7, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 11
+ t0 = _mm256_unpacklo_epi64(m0, m1);
+ t1 = _mm256_unpacklo_epi64(m2, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m0, m1);
+ t1 = _mm256_unpackhi_epi64(m2, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpacklo_epi64(m7, m4);
+ t1 = _mm256_unpacklo_epi64(m5, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m7, m4);
+ t1 = _mm256_unpackhi_epi64(m5, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 12
+ t0 = _mm256_unpacklo_epi64(m7, m2);
+ t1 = _mm256_unpackhi_epi64(m4, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m5, m4);
+ t1 = _mm256_alignr_epi8(m3, m7, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpackhi_epi64(m2, m0);
+ t1 = _mm256_blend_epi32(m5, m0, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_alignr_epi8(m6, m1, 8);
+ t1 = _mm256_blend_epi32(m3, m1, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ a = xor(a, c);
+ b = xor(b, d);
+ a = xor(a, iv0);
+ b = xor(b, iv1);
+
+ storeu(a, words_low);
+ storeu(b, words_high);
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn compress1_loop(
+ input: &[u8],
+ words: &mut [Word; 8],
+ mut count: Count,
+ last_node: LastNode,
+ finalize: Finalize,
+ stride: Stride,
+) {
+ input_debug_asserts(input, finalize);
+
+ let mut local_words = *words;
+
+ let mut fin_offset = input.len().saturating_sub(1);
+ fin_offset -= fin_offset % stride.padded_blockbytes();
+ let mut buf = [0; BLOCKBYTES];
+ let (fin_block, fin_len, _) = final_block(input, fin_offset, &mut buf, stride);
+ let fin_last_block = flag_word(finalize.yes());
+ let fin_last_node = flag_word(finalize.yes() && last_node.yes());
+
+ let mut offset = 0;
+ loop {
+ let block;
+ let count_delta;
+ let last_block;
+ let last_node;
+ if offset == fin_offset {
+ block = fin_block;
+ count_delta = fin_len;
+ last_block = fin_last_block;
+ last_node = fin_last_node;
+ } else {
+ // This unsafe cast avoids bounds checks. There's guaranteed to be
+ // enough input because `offset < fin_offset`.
+ block = &*(input.as_ptr().add(offset) as *const [u8; BLOCKBYTES]);
+ count_delta = BLOCKBYTES;
+ last_block = flag_word(false);
+ last_node = flag_word(false);
+ };
+
+ count = count.wrapping_add(count_delta as Count);
+ compress_block(block, &mut local_words, count, last_block, last_node);
+
+ // Check for termination before bumping the offset, to avoid overflow.
+ if offset == fin_offset {
+ break;
+ }
+
+ offset += stride.padded_blockbytes();
+ }
+
+ *words = local_words;
+}
+
+// Performance note: Factoring out a G function here doesn't hurt performance,
+// unlike in the case of BLAKE2s where it hurts substantially. In fact, on my
+// machine, it helps a tiny bit. But the difference it tiny, so I'm going to
+// stick to the approach used by https://github.com/sneves/blake2-avx2
+// until/unless I can be sure the (tiny) improvement is consistent across
+// different Intel microarchitectures. Smaller code size is nice, but a
+// divergence between the BLAKE2b and BLAKE2s implementations is less nice.
+#[inline(always)]
+unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) {
+ v[0] = add(v[0], m[SIGMA[r][0] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][2] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][4] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][6] as usize]);
+ v[0] = add(v[0], v[4]);
+ v[1] = add(v[1], v[5]);
+ v[2] = add(v[2], v[6]);
+ v[3] = add(v[3], v[7]);
+ v[12] = xor(v[12], v[0]);
+ v[13] = xor(v[13], v[1]);
+ v[14] = xor(v[14], v[2]);
+ v[15] = xor(v[15], v[3]);
+ v[12] = rot32(v[12]);
+ v[13] = rot32(v[13]);
+ v[14] = rot32(v[14]);
+ v[15] = rot32(v[15]);
+ v[8] = add(v[8], v[12]);
+ v[9] = add(v[9], v[13]);
+ v[10] = add(v[10], v[14]);
+ v[11] = add(v[11], v[15]);
+ v[4] = xor(v[4], v[8]);
+ v[5] = xor(v[5], v[9]);
+ v[6] = xor(v[6], v[10]);
+ v[7] = xor(v[7], v[11]);
+ v[4] = rot24(v[4]);
+ v[5] = rot24(v[5]);
+ v[6] = rot24(v[6]);
+ v[7] = rot24(v[7]);
+ v[0] = add(v[0], m[SIGMA[r][1] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][3] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][5] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][7] as usize]);
+ v[0] = add(v[0], v[4]);
+ v[1] = add(v[1], v[5]);
+ v[2] = add(v[2], v[6]);
+ v[3] = add(v[3], v[7]);
+ v[12] = xor(v[12], v[0]);
+ v[13] = xor(v[13], v[1]);
+ v[14] = xor(v[14], v[2]);
+ v[15] = xor(v[15], v[3]);
+ v[12] = rot16(v[12]);
+ v[13] = rot16(v[13]);
+ v[14] = rot16(v[14]);
+ v[15] = rot16(v[15]);
+ v[8] = add(v[8], v[12]);
+ v[9] = add(v[9], v[13]);
+ v[10] = add(v[10], v[14]);
+ v[11] = add(v[11], v[15]);
+ v[4] = xor(v[4], v[8]);
+ v[5] = xor(v[5], v[9]);
+ v[6] = xor(v[6], v[10]);
+ v[7] = xor(v[7], v[11]);
+ v[4] = rot63(v[4]);
+ v[5] = rot63(v[5]);
+ v[6] = rot63(v[6]);
+ v[7] = rot63(v[7]);
+
+ v[0] = add(v[0], m[SIGMA[r][8] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][10] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][12] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][14] as usize]);
+ v[0] = add(v[0], v[5]);
+ v[1] = add(v[1], v[6]);
+ v[2] = add(v[2], v[7]);
+ v[3] = add(v[3], v[4]);
+ v[15] = xor(v[15], v[0]);
+ v[12] = xor(v[12], v[1]);
+ v[13] = xor(v[13], v[2]);
+ v[14] = xor(v[14], v[3]);
+ v[15] = rot32(v[15]);
+ v[12] = rot32(v[12]);
+ v[13] = rot32(v[13]);
+ v[14] = rot32(v[14]);
+ v[10] = add(v[10], v[15]);
+ v[11] = add(v[11], v[12]);
+ v[8] = add(v[8], v[13]);
+ v[9] = add(v[9], v[14]);
+ v[5] = xor(v[5], v[10]);
+ v[6] = xor(v[6], v[11]);
+ v[7] = xor(v[7], v[8]);
+ v[4] = xor(v[4], v[9]);
+ v[5] = rot24(v[5]);
+ v[6] = rot24(v[6]);
+ v[7] = rot24(v[7]);
+ v[4] = rot24(v[4]);
+ v[0] = add(v[0], m[SIGMA[r][9] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][11] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][13] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][15] as usize]);
+ v[0] = add(v[0], v[5]);
+ v[1] = add(v[1], v[6]);
+ v[2] = add(v[2], v[7]);
+ v[3] = add(v[3], v[4]);
+ v[15] = xor(v[15], v[0]);
+ v[12] = xor(v[12], v[1]);
+ v[13] = xor(v[13], v[2]);
+ v[14] = xor(v[14], v[3]);
+ v[15] = rot16(v[15]);
+ v[12] = rot16(v[12]);
+ v[13] = rot16(v[13]);
+ v[14] = rot16(v[14]);
+ v[10] = add(v[10], v[15]);
+ v[11] = add(v[11], v[12]);
+ v[8] = add(v[8], v[13]);
+ v[9] = add(v[9], v[14]);
+ v[5] = xor(v[5], v[10]);
+ v[6] = xor(v[6], v[11]);
+ v[7] = xor(v[7], v[8]);
+ v[4] = xor(v[4], v[9]);
+ v[5] = rot63(v[5]);
+ v[6] = rot63(v[6]);
+ v[7] = rot63(v[7]);
+ v[4] = rot63(v[4]);
+}
+
+// We'd rather make this a regular function with #[inline(always)], but for
+// some reason that blows up compile times by about 10 seconds, at least in
+// some cases (BLAKE2b avx2.rs). This macro seems to get the same performance
+// result, without the compile time issue.
+macro_rules! compress4_transposed {
+ (
+ $h_vecs:expr,
+ $msg_vecs:expr,
+ $count_low:expr,
+ $count_high:expr,
+ $lastblock:expr,
+ $lastnode:expr,
+ ) => {
+ let h_vecs: &mut [__m256i; 8] = $h_vecs;
+ let msg_vecs: &[__m256i; 16] = $msg_vecs;
+ let count_low: __m256i = $count_low;
+ let count_high: __m256i = $count_high;
+ let lastblock: __m256i = $lastblock;
+ let lastnode: __m256i = $lastnode;
+
+ let mut v = [
+ h_vecs[0],
+ h_vecs[1],
+ h_vecs[2],
+ h_vecs[3],
+ h_vecs[4],
+ h_vecs[5],
+ h_vecs[6],
+ h_vecs[7],
+ set1(IV[0]),
+ set1(IV[1]),
+ set1(IV[2]),
+ set1(IV[3]),
+ xor(set1(IV[4]), count_low),
+ xor(set1(IV[5]), count_high),
+ xor(set1(IV[6]), lastblock),
+ xor(set1(IV[7]), lastnode),
+ ];
+
+ round(&mut v, &msg_vecs, 0);
+ round(&mut v, &msg_vecs, 1);
+ round(&mut v, &msg_vecs, 2);
+ round(&mut v, &msg_vecs, 3);
+ round(&mut v, &msg_vecs, 4);
+ round(&mut v, &msg_vecs, 5);
+ round(&mut v, &msg_vecs, 6);
+ round(&mut v, &msg_vecs, 7);
+ round(&mut v, &msg_vecs, 8);
+ round(&mut v, &msg_vecs, 9);
+ round(&mut v, &msg_vecs, 10);
+ round(&mut v, &msg_vecs, 11);
+
+ h_vecs[0] = xor(xor(h_vecs[0], v[0]), v[8]);
+ h_vecs[1] = xor(xor(h_vecs[1], v[1]), v[9]);
+ h_vecs[2] = xor(xor(h_vecs[2], v[2]), v[10]);
+ h_vecs[3] = xor(xor(h_vecs[3], v[3]), v[11]);
+ h_vecs[4] = xor(xor(h_vecs[4], v[4]), v[12]);
+ h_vecs[5] = xor(xor(h_vecs[5], v[5]), v[13]);
+ h_vecs[6] = xor(xor(h_vecs[6], v[6]), v[14]);
+ h_vecs[7] = xor(xor(h_vecs[7], v[7]), v[15]);
+ };
+}
+
+#[inline(always)]
+unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
+ (
+ _mm256_permute2x128_si256(a, b, 0x20),
+ _mm256_permute2x128_si256(a, b, 0x31),
+ )
+}
+
+// There are several ways to do a transposition. We could do it naively, with 8 separate
+// _mm256_set_epi64x instructions, referencing each of the 64 words explicitly. Or we could copy
+// the vecs into contiguous storage and then use gather instructions. This third approach is to use
+// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the
+// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_4` in the
+// https://github.com/oconnor663/bao_experiments repo.
+#[inline(always)]
+unsafe fn transpose_vecs(
+ vec_a: __m256i,
+ vec_b: __m256i,
+ vec_c: __m256i,
+ vec_d: __m256i,
+) -> [__m256i; DEGREE] {
+ // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 11/33.
+ let ab_02 = _mm256_unpacklo_epi64(vec_a, vec_b);
+ let ab_13 = _mm256_unpackhi_epi64(vec_a, vec_b);
+ let cd_02 = _mm256_unpacklo_epi64(vec_c, vec_d);
+ let cd_13 = _mm256_unpackhi_epi64(vec_c, vec_d);
+
+ // Interleave 128-bit lanes.
+ let (abcd_0, abcd_2) = interleave128(ab_02, cd_02);
+ let (abcd_1, abcd_3) = interleave128(ab_13, cd_13);
+
+ [abcd_0, abcd_1, abcd_2, abcd_3]
+}
+
+#[inline(always)]
+unsafe fn transpose_state_vecs(jobs: &[Job; DEGREE]) -> [__m256i; 8] {
+ // Load all the state words into transposed vectors, where the first vector
+ // has the first word of each state, etc. Transposing once at the beginning
+ // and once at the end is more efficient that repeating it for each block.
+ let words0 = array_refs!(&jobs[0].words, DEGREE, DEGREE);
+ let words1 = array_refs!(&jobs[1].words, DEGREE, DEGREE);
+ let words2 = array_refs!(&jobs[2].words, DEGREE, DEGREE);
+ let words3 = array_refs!(&jobs[3].words, DEGREE, DEGREE);
+ let [h0, h1, h2, h3] = transpose_vecs(
+ loadu(words0.0),
+ loadu(words1.0),
+ loadu(words2.0),
+ loadu(words3.0),
+ );
+ let [h4, h5, h6, h7] = transpose_vecs(
+ loadu(words0.1),
+ loadu(words1.1),
+ loadu(words2.1),
+ loadu(words3.1),
+ );
+ [h0, h1, h2, h3, h4, h5, h6, h7]
+}
+
+#[inline(always)]
+unsafe fn untranspose_state_vecs(h_vecs: &[__m256i; 8], jobs: &mut [Job; DEGREE]) {
+ // Un-transpose the updated state vectors back into the caller's arrays.
+ let [job0, job1, job2, job3] = jobs;
+ let words0 = mut_array_refs!(&mut job0.words, DEGREE, DEGREE);
+ let words1 = mut_array_refs!(&mut job1.words, DEGREE, DEGREE);
+ let words2 = mut_array_refs!(&mut job2.words, DEGREE, DEGREE);
+ let words3 = mut_array_refs!(&mut job3.words, DEGREE, DEGREE);
+ let out = transpose_vecs(h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3]);
+ storeu(out[0], words0.0);
+ storeu(out[1], words1.0);
+ storeu(out[2], words2.0);
+ storeu(out[3], words3.0);
+ let out = transpose_vecs(h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7]);
+ storeu(out[0], words0.1);
+ storeu(out[1], words1.1);
+ storeu(out[2], words2.1);
+ storeu(out[3], words3.1);
+}
+
+#[inline(always)]
+unsafe fn transpose_msg_vecs(blocks: [*const [u8; BLOCKBYTES]; DEGREE]) -> [__m256i; 16] {
+ // These input arrays have no particular alignment, so we use unaligned
+ // loads to read from them.
+ let block0 = blocks[0] as *const [Word; DEGREE];
+ let block1 = blocks[1] as *const [Word; DEGREE];
+ let block2 = blocks[2] as *const [Word; DEGREE];
+ let block3 = blocks[3] as *const [Word; DEGREE];
+ let [m0, m1, m2, m3] = transpose_vecs(
+ loadu(block0.add(0)),
+ loadu(block1.add(0)),
+ loadu(block2.add(0)),
+ loadu(block3.add(0)),
+ );
+ let [m4, m5, m6, m7] = transpose_vecs(
+ loadu(block0.add(1)),
+ loadu(block1.add(1)),
+ loadu(block2.add(1)),
+ loadu(block3.add(1)),
+ );
+ let [m8, m9, m10, m11] = transpose_vecs(
+ loadu(block0.add(2)),
+ loadu(block1.add(2)),
+ loadu(block2.add(2)),
+ loadu(block3.add(2)),
+ );
+ let [m12, m13, m14, m15] = transpose_vecs(
+ loadu(block0.add(3)),
+ loadu(block1.add(3)),
+ loadu(block2.add(3)),
+ loadu(block3.add(3)),
+ );
+ [
+ m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15,
+ ]
+}
+
+#[inline(always)]
+unsafe fn load_counts(jobs: &[Job; DEGREE]) -> (__m256i, __m256i) {
+ (
+ set4(
+ count_low(jobs[0].count),
+ count_low(jobs[1].count),
+ count_low(jobs[2].count),
+ count_low(jobs[3].count),
+ ),
+ set4(
+ count_high(jobs[0].count),
+ count_high(jobs[1].count),
+ count_high(jobs[2].count),
+ count_high(jobs[3].count),
+ ),
+ )
+}
+
+#[inline(always)]
+unsafe fn store_counts(jobs: &mut [Job; DEGREE], low: __m256i, high: __m256i) {
+ let low_ints: [Word; DEGREE] = mem::transmute(low);
+ let high_ints: [Word; DEGREE] = mem::transmute(high);
+ for i in 0..DEGREE {
+ jobs[i].count = assemble_count(low_ints[i], high_ints[i]);
+ }
+}
+
+#[inline(always)]
+unsafe fn add_to_counts(lo: &mut __m256i, hi: &mut __m256i, delta: __m256i) {
+ // If the low counts reach zero, that means they wrapped, unless the delta
+ // was also zero.
+ *lo = add(*lo, delta);
+ let lo_reached_zero = eq(*lo, set1(0));
+ let delta_was_zero = eq(delta, set1(0));
+ let hi_inc = and(set1(1), negate_and(delta_was_zero, lo_reached_zero));
+ *hi = add(*hi, hi_inc);
+}
+
+#[inline(always)]
+unsafe fn flags_vec(flags: [bool; DEGREE]) -> __m256i {
+ set4(
+ flag_word(flags[0]),
+ flag_word(flags[1]),
+ flag_word(flags[2]),
+ flag_word(flags[3]),
+ )
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn compress4_loop(jobs: &mut [Job; DEGREE], finalize: Finalize, stride: Stride) {
+ // If we're not finalizing, there can't be a partial block at the end.
+ for job in jobs.iter() {
+ input_debug_asserts(job.input, finalize);
+ }
+
+ let msg_ptrs = [
+ jobs[0].input.as_ptr(),
+ jobs[1].input.as_ptr(),
+ jobs[2].input.as_ptr(),
+ jobs[3].input.as_ptr(),
+ ];
+ let mut h_vecs = transpose_state_vecs(&jobs);
+ let (mut counts_lo, mut counts_hi) = load_counts(&jobs);
+
+ // Prepare the final blocks (note, which could be empty if the input is
+ // empty). Do all this before entering the main loop.
+ let min_len = jobs.iter().map(|job| job.input.len()).min().unwrap();
+ let mut fin_offset = min_len.saturating_sub(1);
+ fin_offset -= fin_offset % stride.padded_blockbytes();
+ // Performance note, making these buffers mem::uninitialized() seems to
+ // cause problems in the optimizer.
+ let mut buf0: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let mut buf1: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let mut buf2: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let mut buf3: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let (block0, len0, finalize0) = final_block(jobs[0].input, fin_offset, &mut buf0, stride);
+ let (block1, len1, finalize1) = final_block(jobs[1].input, fin_offset, &mut buf1, stride);
+ let (block2, len2, finalize2) = final_block(jobs[2].input, fin_offset, &mut buf2, stride);
+ let (block3, len3, finalize3) = final_block(jobs[3].input, fin_offset, &mut buf3, stride);
+ let fin_blocks: [*const [u8; BLOCKBYTES]; DEGREE] = [block0, block1, block2, block3];
+ let fin_counts_delta = set4(len0 as Word, len1 as Word, len2 as Word, len3 as Word);
+ let fin_last_block;
+ let fin_last_node;
+ if finalize.yes() {
+ fin_last_block = flags_vec([finalize0, finalize1, finalize2, finalize3]);
+ fin_last_node = flags_vec([
+ finalize0 && jobs[0].last_node.yes(),
+ finalize1 && jobs[1].last_node.yes(),
+ finalize2 && jobs[2].last_node.yes(),
+ finalize3 && jobs[3].last_node.yes(),
+ ]);
+ } else {
+ fin_last_block = set1(0);
+ fin_last_node = set1(0);
+ }
+
+ // The main loop.
+ let mut offset = 0;
+ loop {
+ let blocks;
+ let counts_delta;
+ let last_block;
+ let last_node;
+ if offset == fin_offset {
+ blocks = fin_blocks;
+ counts_delta = fin_counts_delta;
+ last_block = fin_last_block;
+ last_node = fin_last_node;
+ } else {
+ blocks = [
+ msg_ptrs[0].add(offset) as *const [u8; BLOCKBYTES],
+ msg_ptrs[1].add(offset) as *const [u8; BLOCKBYTES],
+ msg_ptrs[2].add(offset) as *const [u8; BLOCKBYTES],
+ msg_ptrs[3].add(offset) as *const [u8; BLOCKBYTES],
+ ];
+ counts_delta = set1(BLOCKBYTES as Word);
+ last_block = set1(0);
+ last_node = set1(0);
+ };
+
+ let m_vecs = transpose_msg_vecs(blocks);
+ add_to_counts(&mut counts_lo, &mut counts_hi, counts_delta);
+ compress4_transposed!(
+ &mut h_vecs,
+ &m_vecs,
+ counts_lo,
+ counts_hi,
+ last_block,
+ last_node,
+ );
+
+ // Check for termination before bumping the offset, to avoid overflow.
+ if offset == fin_offset {
+ break;
+ }
+
+ offset += stride.padded_blockbytes();
+ }
+
+ // Write out the results.
+ untranspose_state_vecs(&h_vecs, &mut *jobs);
+ store_counts(&mut *jobs, counts_lo, counts_hi);
+ let max_consumed = offset.saturating_add(stride.padded_blockbytes());
+ for job in jobs.iter_mut() {
+ let consumed = cmp::min(max_consumed, job.input.len());
+ job.input = &job.input[consumed..];
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/blake2bp.rs b/third_party/rust/blake2b_simd/src/blake2bp.rs
new file mode 100644
index 0000000000..7bdfad64f6
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/blake2bp.rs
@@ -0,0 +1,570 @@
+//! BLAKE2bp, a variant of BLAKE2b that uses SIMD more efficiently.
+//!
+//! The AVX2 implementation of BLAKE2bp is about twice as fast that of BLAKE2b.
+//! However, note that it's a different hash function, and it gives a different
+//! hash from BLAKE2b for the same input.
+//!
+//! # Example
+//!
+//! ```
+//! use blake2b_simd::blake2bp;
+//!
+//! let hash = blake2bp::Params::new()
+//! .hash_length(16)
+//! .key(b"The Magic Words are Squeamish Ossifrage")
+//! .to_state()
+//! .update(b"foo")
+//! .update(b"bar")
+//! .update(b"baz")
+//! .finalize();
+//! assert_eq!("e69c7d2c42a5ac14948772231c68c552", &hash.to_hex());
+//! ```
+
+use crate::guts::{Finalize, Implementation, Job, LastNode, Stride};
+use crate::many;
+use crate::Count;
+use crate::Hash;
+use crate::Word;
+use crate::BLOCKBYTES;
+use crate::KEYBYTES;
+use crate::OUTBYTES;
+use core::cmp;
+use core::fmt;
+use core::mem::size_of;
+
+#[cfg(feature = "std")]
+use std;
+
+pub(crate) const DEGREE: usize = 4;
+
+/// Compute the BLAKE2bp hash of a slice of bytes all at once, using default
+/// parameters.
+///
+/// # Example
+///
+/// ```
+/// # use blake2b_simd::blake2bp::blake2bp;
+/// let expected = "8ca9ccee7946afcb686fe7556628b5ba1bf9a691da37ca58cd049354d99f3704\
+/// 2c007427e5f219b9ab5063707ec6823872dee413ee014b4d02f2ebb6abb5f643";
+/// let hash = blake2bp(b"foo");
+/// assert_eq!(expected, &hash.to_hex());
+/// ```
+pub fn blake2bp(input: &[u8]) -> Hash {
+ Params::new().hash(input)
+}
+
+/// A parameter builder for BLAKE2bp, just like the [`Params`](../struct.Params.html) type for
+/// BLAKE2b.
+///
+/// This builder only supports configuring the hash length and a secret key. This matches the
+/// options provided by the [reference
+/// implementation](https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2.h#L162-L165).
+///
+/// # Example
+///
+/// ```
+/// use blake2b_simd::blake2bp;
+/// let mut state = blake2bp::Params::new().hash_length(32).to_state();
+/// ```
+#[derive(Clone)]
+pub struct Params {
+ hash_length: u8,
+ key_length: u8,
+ key: [u8; KEYBYTES],
+ implementation: Implementation,
+}
+
+impl Params {
+ /// Equivalent to `Params::default()`.
+ pub fn new() -> Self {
+ Self {
+ hash_length: OUTBYTES as u8,
+ key_length: 0,
+ key: [0; KEYBYTES],
+ implementation: Implementation::detect(),
+ }
+ }
+
+ fn to_words(&self) -> ([[Word; 8]; DEGREE], [Word; 8]) {
+ let mut base_params = crate::Params::new();
+ base_params
+ .hash_length(self.hash_length as usize)
+ .key(&self.key[..self.key_length as usize])
+ .fanout(DEGREE as u8)
+ .max_depth(2)
+ .max_leaf_length(0)
+ // Note that inner_hash_length is always OUTBYTES, regardless of the hash_length
+ // parameter. This isn't documented in the spec, but it matches the behavior of the
+ // reference implementation: https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2bp-ref.c#L55
+ .inner_hash_length(OUTBYTES);
+ let leaf_words = |worker_index| {
+ base_params
+ .clone()
+ .node_offset(worker_index)
+ .node_depth(0)
+ // Note that setting the last_node flag here has no effect,
+ // because it isn't included in the state words.
+ .to_words()
+ };
+ let leaf_words = [leaf_words(0), leaf_words(1), leaf_words(2), leaf_words(3)];
+ let root_words = base_params
+ .clone()
+ .node_offset(0)
+ .node_depth(1)
+ // Note that setting the last_node flag here has no effect, because
+ // it isn't included in the state words. Also note that because
+ // we're only preserving its state words, the root node won't hash
+ // any key bytes.
+ .to_words();
+ (leaf_words, root_words)
+ }
+
+ /// Hash an input all at once with these parameters.
+ pub fn hash(&self, input: &[u8]) -> Hash {
+ // If there's a key, just fall back to using the State.
+ if self.key_length > 0 {
+ return self.to_state().update(input).finalize();
+ }
+ let (mut leaf_words, mut root_words) = self.to_words();
+ // Hash each leaf in parallel.
+ let jobs = leaf_words.iter_mut().enumerate().map(|(i, words)| {
+ let input_start = cmp::min(input.len(), i * BLOCKBYTES);
+ Job {
+ input: &input[input_start..],
+ words,
+ count: 0,
+ last_node: if i == DEGREE - 1 {
+ LastNode::Yes
+ } else {
+ LastNode::No
+ },
+ }
+ });
+ many::compress_many(jobs, self.implementation, Finalize::Yes, Stride::Parallel);
+ // Hash each leaf into the root.
+ finalize_root_words(
+ &leaf_words,
+ &mut root_words,
+ self.hash_length,
+ self.implementation,
+ )
+ }
+
+ /// Construct a BLAKE2bp `State` object based on these parameters.
+ pub fn to_state(&self) -> State {
+ State::with_params(self)
+ }
+
+ /// Set the length of the final hash, from 1 to `OUTBYTES` (64). Apart from controlling the
+ /// length of the final `Hash`, this is also associated data, and changing it will result in a
+ /// totally different hash.
+ pub fn hash_length(&mut self, length: usize) -> &mut Self {
+ assert!(
+ 1 <= length && length <= OUTBYTES,
+ "Bad hash length: {}",
+ length
+ );
+ self.hash_length = length as u8;
+ self
+ }
+
+ /// Use a secret key, so that BLAKE2bp acts as a MAC. The maximum key length is `KEYBYTES`
+ /// (64). An empty key is equivalent to having no key at all.
+ pub fn key(&mut self, key: &[u8]) -> &mut Self {
+ assert!(key.len() <= KEYBYTES, "Bad key length: {}", key.len());
+ self.key_length = key.len() as u8;
+ self.key = [0; KEYBYTES];
+ self.key[..key.len()].copy_from_slice(key);
+ self
+ }
+}
+
+impl Default for Params {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl fmt::Debug for Params {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(
+ f,
+ "Params {{ hash_length: {}, key_length: {} }}",
+ self.hash_length,
+ // NB: Don't print the key itself. Debug shouldn't leak secrets.
+ self.key_length,
+ )
+ }
+}
+
+/// An incremental hasher for BLAKE2bp, just like the [`State`](../struct.State.html) type for
+/// BLAKE2b.
+///
+/// # Example
+///
+/// ```
+/// use blake2b_simd::blake2bp;
+///
+/// let mut state = blake2bp::State::new();
+/// state.update(b"foo");
+/// state.update(b"bar");
+/// let hash = state.finalize();
+///
+/// let expected = "e654427b6ef02949471712263e59071abbb6aa94855674c1daeed6cfaf127c33\
+/// dfa3205f7f7f71e4f0673d25fa82a368488911f446bccd323af3ab03f53e56e5";
+/// assert_eq!(expected, &hash.to_hex());
+/// ```
+#[derive(Clone)]
+pub struct State {
+ leaf_words: [[Word; 8]; DEGREE],
+ root_words: [Word; 8],
+ // Note that this buffer is twice as large as what compress4 needs. That guarantees that we
+ // have enough input when we compress to know we don't need to finalize any of the leaves.
+ buf: [u8; 2 * DEGREE * BLOCKBYTES],
+ buf_len: u16,
+ // Note that this is the *per-leaf* count.
+ count: Count,
+ hash_length: u8,
+ implementation: Implementation,
+ is_keyed: bool,
+}
+
+impl State {
+ /// Equivalent to `State::default()` or `Params::default().to_state()`.
+ pub fn new() -> Self {
+ Self::with_params(&Params::default())
+ }
+
+ fn with_params(params: &Params) -> Self {
+ let (leaf_words, root_words) = params.to_words();
+
+ // If a key is set, initalize the buffer to contain the key bytes. Note
+ // that only the leaves hash key bytes. The root doesn't, even though
+ // the key length it still set in its parameters. Again this isn't
+ // documented in the spec, but it matches the behavior of the reference
+ // implementation:
+ // https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2bp-ref.c#L128
+ // This particular behavior (though not the inner hash length behavior
+ // above) is also corroborated by the official test vectors; see
+ // tests/vector_tests.rs.
+ let mut buf = [0; 2 * DEGREE * BLOCKBYTES];
+ let mut buf_len = 0;
+ if params.key_length > 0 {
+ for i in 0..DEGREE {
+ let keybytes = &params.key[..params.key_length as usize];
+ buf[i * BLOCKBYTES..][..keybytes.len()].copy_from_slice(keybytes);
+ buf_len = BLOCKBYTES * DEGREE;
+ }
+ }
+
+ Self {
+ leaf_words,
+ root_words,
+ buf,
+ buf_len: buf_len as u16,
+ count: 0, // count gets updated in self.compress()
+ hash_length: params.hash_length,
+ implementation: params.implementation,
+ is_keyed: params.key_length > 0,
+ }
+ }
+
+ fn fill_buf(&mut self, input: &mut &[u8]) {
+ let take = cmp::min(self.buf.len() - self.buf_len as usize, input.len());
+ self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]);
+ self.buf_len += take as u16;
+ *input = &input[take..];
+ }
+
+ fn compress_to_leaves(
+ leaves: &mut [[Word; 8]; DEGREE],
+ input: &[u8],
+ count: &mut Count,
+ implementation: Implementation,
+ ) {
+ // Input is assumed to be an even number of blocks for each leaf. Since
+ // we're not finilizing, debug asserts will fire otherwise.
+ let jobs = leaves.iter_mut().enumerate().map(|(i, words)| {
+ Job {
+ input: &input[i * BLOCKBYTES..],
+ words,
+ count: *count,
+ last_node: LastNode::No, // irrelevant when not finalizing
+ }
+ });
+ many::compress_many(jobs, implementation, Finalize::No, Stride::Parallel);
+ // Note that count is the bytes input *per-leaf*.
+ *count = count.wrapping_add((input.len() / DEGREE) as Count);
+ }
+
+ /// Add input to the hash. You can call `update` any number of times.
+ pub fn update(&mut self, mut input: &[u8]) -> &mut Self {
+ // If we have a partial buffer, try to complete it. If we complete it and there's more
+ // input waiting, we need to compress to make more room. However, because we need to be
+ // sure that *none* of the leaves would need to be finalized as part of this round of
+ // compression, we need to buffer more than we would for BLAKE2b.
+ if self.buf_len > 0 {
+ self.fill_buf(&mut input);
+ // The buffer is large enough for two compressions. If we've filled
+ // the buffer and there's still more input coming, then we have to
+ // do at least one compression. If there's enough input still
+ // coming that all the leaves are guaranteed to get more, do both
+ // compressions in the buffer. Otherwise, do just one and shift the
+ // back half of the buffer to the front.
+ if !input.is_empty() {
+ if input.len() > (DEGREE - 1) * BLOCKBYTES {
+ // Enough input coming to do both compressions.
+ Self::compress_to_leaves(
+ &mut self.leaf_words,
+ &self.buf,
+ &mut self.count,
+ self.implementation,
+ );
+ self.buf_len = 0;
+ } else {
+ // Only enough input coming for one compression.
+ Self::compress_to_leaves(
+ &mut self.leaf_words,
+ &self.buf[..DEGREE * BLOCKBYTES],
+ &mut self.count,
+ self.implementation,
+ );
+ self.buf_len = (DEGREE * BLOCKBYTES) as u16;
+ let (buf_front, buf_back) = self.buf.split_at_mut(DEGREE * BLOCKBYTES);
+ buf_front.copy_from_slice(buf_back);
+ }
+ }
+ }
+
+ // Now we directly compress as much input as possible, without copying
+ // it into the buffer. We need to make sure we buffer at least one byte
+ // for each of the leaves, so that we know we don't need to finalize
+ // them.
+ let needed_tail = (DEGREE - 1) * BLOCKBYTES + 1;
+ let mut bulk_bytes = input.len().saturating_sub(needed_tail);
+ bulk_bytes -= bulk_bytes % (DEGREE * BLOCKBYTES);
+ if bulk_bytes > 0 {
+ Self::compress_to_leaves(
+ &mut self.leaf_words,
+ &input[..bulk_bytes],
+ &mut self.count,
+ self.implementation,
+ );
+ input = &input[bulk_bytes..];
+ }
+
+ // Buffer any remaining input, to be either compressed or finalized in
+ // a subsequent call.
+ self.fill_buf(&mut input);
+ debug_assert_eq!(0, input.len());
+ self
+ }
+
+ /// Finalize the state and return a `Hash`. This method is idempotent, and calling it multiple
+ /// times will give the same result. It's also possible to `update` with more input in between.
+ pub fn finalize(&self) -> Hash {
+ // Hash whatever's remaining in the buffer and finalize the leaves.
+ let buf_len = self.buf_len as usize;
+ let mut leaves_copy = self.leaf_words;
+ let jobs = leaves_copy
+ .iter_mut()
+ .enumerate()
+ .map(|(leaf_index, leaf_words)| {
+ let input = &self.buf[cmp::min(leaf_index * BLOCKBYTES, buf_len)..buf_len];
+ Job {
+ input,
+ words: leaf_words,
+ count: self.count,
+ last_node: if leaf_index == DEGREE - 1 {
+ LastNode::Yes
+ } else {
+ LastNode::No
+ },
+ }
+ });
+ many::compress_many(jobs, self.implementation, Finalize::Yes, Stride::Parallel);
+
+ // Concatenate each leaf into the root and hash that.
+ let mut root_words_copy = self.root_words;
+ finalize_root_words(
+ &leaves_copy,
+ &mut root_words_copy,
+ self.hash_length,
+ self.implementation,
+ )
+ }
+
+ /// Return the total number of bytes input so far.
+ ///
+ /// Note that `count` doesn't include the bytes of the key block, if any.
+ /// It's exactly the total number of input bytes fed to `update`.
+ pub fn count(&self) -> Count {
+ // Remember that self.count is *per-leaf*.
+ let mut ret = self
+ .count
+ .wrapping_mul(DEGREE as Count)
+ .wrapping_add(self.buf_len as Count);
+ if self.is_keyed {
+ ret -= (DEGREE * BLOCKBYTES) as Count;
+ }
+ ret
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::io::Write for State {
+ fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+ self.update(buf);
+ Ok(buf.len())
+ }
+
+ fn flush(&mut self) -> std::io::Result<()> {
+ Ok(())
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(
+ f,
+ "State {{ count: {}, hash_length: {} }}",
+ self.count(),
+ self.hash_length,
+ )
+ }
+}
+
+impl Default for State {
+ fn default() -> Self {
+ Self::with_params(&Params::default())
+ }
+}
+
+// Compress each of the four finalized hashes into the root words as input,
+// using two compressions. Note that even if a future version of this
+// implementation supports the hash_length parameter and sets it as associated
+// data for all nodes, this step must still use the untruncated output of each
+// leaf. Note also that, as mentioned above, the root node doesn't hash any key
+// bytes.
+fn finalize_root_words(
+ leaf_words: &[[Word; 8]; DEGREE],
+ root_words: &mut [Word; 8],
+ hash_length: u8,
+ imp: Implementation,
+) -> Hash {
+ debug_assert_eq!(OUTBYTES, 8 * size_of::<Word>());
+ let mut block = [0; DEGREE * OUTBYTES];
+ for (word, chunk) in leaf_words
+ .iter()
+ .flat_map(|words| words.iter())
+ .zip(block.chunks_exact_mut(size_of::<Word>()))
+ {
+ chunk.copy_from_slice(&word.to_le_bytes());
+ }
+ imp.compress1_loop(
+ &block,
+ root_words,
+ 0,
+ LastNode::Yes,
+ Finalize::Yes,
+ Stride::Serial,
+ );
+ Hash {
+ bytes: crate::state_words_to_bytes(&root_words),
+ len: hash_length,
+ }
+}
+
+pub(crate) fn force_portable(params: &mut Params) {
+ params.implementation = Implementation::portable();
+}
+
+#[cfg(test)]
+pub(crate) mod test {
+ use super::*;
+ use crate::paint_test_input;
+
+ // This is a simple reference implementation without the complicated buffering or parameter
+ // support of the real implementation. We need this because the official test vectors don't
+ // include any inputs large enough to exercise all the branches in the buffering logic.
+ fn blake2bp_reference(input: &[u8]) -> Hash {
+ let mut leaves = arrayvec::ArrayVec::<[_; DEGREE]>::new();
+ for leaf_index in 0..DEGREE {
+ leaves.push(
+ crate::Params::new()
+ .fanout(DEGREE as u8)
+ .max_depth(2)
+ .node_offset(leaf_index as u64)
+ .inner_hash_length(OUTBYTES)
+ .to_state(),
+ );
+ }
+ leaves[DEGREE - 1].set_last_node(true);
+ for (i, chunk) in input.chunks(BLOCKBYTES).enumerate() {
+ leaves[i % DEGREE].update(chunk);
+ }
+ let mut root = crate::Params::new()
+ .fanout(DEGREE as u8)
+ .max_depth(2)
+ .node_depth(1)
+ .inner_hash_length(OUTBYTES)
+ .last_node(true)
+ .to_state();
+ for leaf in &mut leaves {
+ root.update(leaf.finalize().as_bytes());
+ }
+ root.finalize()
+ }
+
+ #[test]
+ fn test_against_reference() {
+ let mut buf = [0; 21 * BLOCKBYTES];
+ paint_test_input(&mut buf);
+ // - 8 blocks is just enought to fill the double buffer.
+ // - 9 blocks triggers the "perform one compression on the double buffer" case.
+ // - 11 blocks is the largest input where only one compression may be performed, on the
+ // first half of the buffer, because there's not enough input to avoid needing to
+ // finalize the second half.
+ // - 12 blocks triggers the "perform both compressions in the double buffer" case.
+ // - 15 blocks is the largest input where, after compressing 8 blocks from the buffer,
+ // there's not enough input to hash directly from memory.
+ // - 16 blocks triggers "after emptying the buffer, hash directly from memory".
+ for num_blocks in 0..=20 {
+ for &extra in &[0, 1, BLOCKBYTES - 1] {
+ for &portable in &[false, true] {
+ // eprintln!("\ncase -----");
+ // dbg!(num_blocks);
+ // dbg!(extra);
+ // dbg!(portable);
+
+ // First hash the input all at once, as a sanity check.
+ let mut params = Params::new();
+ if portable {
+ force_portable(&mut params);
+ }
+ let input = &buf[..num_blocks * BLOCKBYTES + extra];
+ let expected = blake2bp_reference(&input);
+ let mut state = params.to_state();
+ let found = state.update(input).finalize();
+ assert_eq!(expected, found);
+
+ // Then, do it again, but buffer 1 byte of input first. That causes the buffering
+ // branch to trigger.
+ let mut state = params.to_state();
+ let maybe_one = cmp::min(1, input.len());
+ state.update(&input[..maybe_one]);
+ assert_eq!(maybe_one as Count, state.count());
+ // Do a throwaway finalize here to check for idempotency.
+ state.finalize();
+ state.update(&input[maybe_one..]);
+ assert_eq!(input.len() as Count, state.count());
+ let found = state.finalize();
+ assert_eq!(expected, found);
+
+ // Finally, do it again with the all-at-once interface.
+ assert_eq!(expected, blake2bp(input));
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/guts.rs b/third_party/rust/blake2b_simd/src/guts.rs
new file mode 100644
index 0000000000..9fcacf319c
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/guts.rs
@@ -0,0 +1,565 @@
+use crate::*;
+use arrayref::array_ref;
+use core::cmp;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub const MAX_DEGREE: usize = 4;
+
+#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+pub const MAX_DEGREE: usize = 1;
+
+// Variants other than Portable are unreachable in no_std, unless CPU features
+// are explicitly enabled for the build with e.g. RUSTFLAGS="-C target-feature=avx2".
+// This might change in the future if is_x86_feature_detected moves into libcore.
+#[allow(dead_code)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum Platform {
+ Portable,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ SSE41,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ AVX2,
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct Implementation(Platform);
+
+impl Implementation {
+ pub fn detect() -> Self {
+ // Try the different implementations in order of how fast/modern they
+ // are. Currently on non-x86, everything just uses portable.
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if let Some(avx2_impl) = Self::avx2_if_supported() {
+ return avx2_impl;
+ }
+ }
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if let Some(sse41_impl) = Self::sse41_if_supported() {
+ return sse41_impl;
+ }
+ }
+ Self::portable()
+ }
+
+ pub fn portable() -> Self {
+ Implementation(Platform::Portable)
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ #[allow(unreachable_code)]
+ pub fn sse41_if_supported() -> Option<Self> {
+ // Check whether SSE4.1 support is assumed by the build.
+ #[cfg(target_feature = "sse4.1")]
+ {
+ return Some(Implementation(Platform::SSE41));
+ }
+ // Otherwise dynamically check for support if we can.
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("sse4.1") {
+ return Some(Implementation(Platform::SSE41));
+ }
+ }
+ None
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ #[allow(unreachable_code)]
+ pub fn avx2_if_supported() -> Option<Self> {
+ // Check whether AVX2 support is assumed by the build.
+ #[cfg(target_feature = "avx2")]
+ {
+ return Some(Implementation(Platform::AVX2));
+ }
+ // Otherwise dynamically check for support if we can.
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("avx2") {
+ return Some(Implementation(Platform::AVX2));
+ }
+ }
+ None
+ }
+
+ pub fn degree(&self) -> usize {
+ match self.0 {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX2 => avx2::DEGREE,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::SSE41 => sse41::DEGREE,
+ Platform::Portable => 1,
+ }
+ }
+
+ pub fn compress1_loop(
+ &self,
+ input: &[u8],
+ words: &mut [Word; 8],
+ count: Count,
+ last_node: LastNode,
+ finalize: Finalize,
+ stride: Stride,
+ ) {
+ match self.0 {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX2 => unsafe {
+ avx2::compress1_loop(input, words, count, last_node, finalize, stride);
+ },
+ // Note that there's an SSE version of compress1 in the official C
+ // implementation, but I haven't ported it yet.
+ _ => {
+ portable::compress1_loop(input, words, count, last_node, finalize, stride);
+ }
+ }
+ }
+
+ pub fn compress2_loop(&self, jobs: &mut [Job; 2], finalize: Finalize, stride: Stride) {
+ match self.0 {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX2 | Platform::SSE41 => unsafe {
+ sse41::compress2_loop(jobs, finalize, stride)
+ },
+ _ => panic!("unsupported"),
+ }
+ }
+
+ pub fn compress4_loop(&self, jobs: &mut [Job; 4], finalize: Finalize, stride: Stride) {
+ match self.0 {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX2 => unsafe { avx2::compress4_loop(jobs, finalize, stride) },
+ _ => panic!("unsupported"),
+ }
+ }
+}
+
+pub struct Job<'a, 'b> {
+ pub input: &'a [u8],
+ pub words: &'b mut [Word; 8],
+ pub count: Count,
+ pub last_node: LastNode,
+}
+
+impl<'a, 'b> core::fmt::Debug for Job<'a, 'b> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ // NB: Don't print the words. Leaking them would allow length extension.
+ write!(
+ f,
+ "Job {{ input_len: {}, count: {}, last_node: {} }}",
+ self.input.len(),
+ self.count,
+ self.last_node.yes(),
+ )
+ }
+}
+
+// Finalize could just be a bool, but this is easier to read at callsites.
+#[derive(Clone, Copy, Debug)]
+pub enum Finalize {
+ Yes,
+ No,
+}
+
+impl Finalize {
+ pub fn yes(&self) -> bool {
+ match self {
+ Finalize::Yes => true,
+ Finalize::No => false,
+ }
+ }
+}
+
+// Like Finalize, this is easier to read at callsites.
+#[derive(Clone, Copy, Debug)]
+pub enum LastNode {
+ Yes,
+ No,
+}
+
+impl LastNode {
+ pub fn yes(&self) -> bool {
+ match self {
+ LastNode::Yes => true,
+ LastNode::No => false,
+ }
+ }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum Stride {
+ Serial, // BLAKE2b/BLAKE2s
+ Parallel, // BLAKE2bp/BLAKE2sp
+}
+
+impl Stride {
+ pub fn padded_blockbytes(&self) -> usize {
+ match self {
+ Stride::Serial => BLOCKBYTES,
+ Stride::Parallel => blake2bp::DEGREE * BLOCKBYTES,
+ }
+ }
+}
+
+pub(crate) fn count_low(count: Count) -> Word {
+ count as Word
+}
+
+pub(crate) fn count_high(count: Count) -> Word {
+ (count >> 8 * size_of::<Word>()) as Word
+}
+
+pub(crate) fn assemble_count(low: Word, high: Word) -> Count {
+ low as Count + ((high as Count) << 8 * size_of::<Word>())
+}
+
+pub(crate) fn flag_word(flag: bool) -> Word {
+ if flag {
+ !0
+ } else {
+ 0
+ }
+}
+
+// Pull a array reference at the given offset straight from the input, if
+// there's a full block of input available. If there's only a partial block,
+// copy it into the provided buffer, and return an array reference that. Along
+// with the array, return the number of bytes of real input, and whether the
+// input can be finalized (i.e. whether there aren't any more bytes after this
+// block). Note that this is written so that the optimizer can elide bounds
+// checks, see: https://godbolt.org/z/0hH2bC
+pub fn final_block<'a>(
+ input: &'a [u8],
+ offset: usize,
+ buffer: &'a mut [u8; BLOCKBYTES],
+ stride: Stride,
+) -> (&'a [u8; BLOCKBYTES], usize, bool) {
+ let capped_offset = cmp::min(offset, input.len());
+ let offset_slice = &input[capped_offset..];
+ if offset_slice.len() >= BLOCKBYTES {
+ let block = array_ref!(offset_slice, 0, BLOCKBYTES);
+ let should_finalize = offset_slice.len() <= stride.padded_blockbytes();
+ (block, BLOCKBYTES, should_finalize)
+ } else {
+ // Copy the final block to the front of the block buffer. The rest of
+ // the buffer is assumed to be initialized to zero.
+ buffer[..offset_slice.len()].copy_from_slice(offset_slice);
+ (buffer, offset_slice.len(), true)
+ }
+}
+
+pub fn input_debug_asserts(input: &[u8], finalize: Finalize) {
+ // If we're not finalizing, the input must not be empty, and it must be an
+ // even multiple of the block size.
+ if !finalize.yes() {
+ debug_assert!(!input.is_empty());
+ debug_assert_eq!(0, input.len() % BLOCKBYTES);
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+ use arrayvec::ArrayVec;
+ use core::mem::size_of;
+
+ #[test]
+ fn test_detection() {
+ assert_eq!(Platform::Portable, Implementation::portable().0);
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("avx2") {
+ assert_eq!(Platform::AVX2, Implementation::detect().0);
+ assert_eq!(
+ Platform::AVX2,
+ Implementation::avx2_if_supported().unwrap().0
+ );
+ assert_eq!(
+ Platform::SSE41,
+ Implementation::sse41_if_supported().unwrap().0
+ );
+ } else if is_x86_feature_detected!("sse4.1") {
+ assert_eq!(Platform::SSE41, Implementation::detect().0);
+ assert!(Implementation::avx2_if_supported().is_none());
+ assert_eq!(
+ Platform::SSE41,
+ Implementation::sse41_if_supported().unwrap().0
+ );
+ } else {
+ assert_eq!(Platform::Portable, Implementation::detect().0);
+ assert!(Implementation::avx2_if_supported().is_none());
+ assert!(Implementation::sse41_if_supported().is_none());
+ }
+ }
+ }
+
+ // TODO: Move all of these case tests into the implementation files.
+ fn exercise_cases<F>(mut f: F)
+ where
+ F: FnMut(Stride, usize, LastNode, Finalize, Count),
+ {
+ // Chose counts to hit the relevant overflow cases.
+ let counts = &[
+ (0 as Count),
+ ((1 as Count) << (8 * size_of::<Word>())) - BLOCKBYTES as Count,
+ (0 as Count).wrapping_sub(BLOCKBYTES as Count),
+ ];
+ for &stride in &[Stride::Serial, Stride::Parallel] {
+ let lengths = [
+ 0,
+ 1,
+ BLOCKBYTES - 1,
+ BLOCKBYTES,
+ BLOCKBYTES + 1,
+ 2 * BLOCKBYTES - 1,
+ 2 * BLOCKBYTES,
+ 2 * BLOCKBYTES + 1,
+ stride.padded_blockbytes() - 1,
+ stride.padded_blockbytes(),
+ stride.padded_blockbytes() + 1,
+ 2 * stride.padded_blockbytes() - 1,
+ 2 * stride.padded_blockbytes(),
+ 2 * stride.padded_blockbytes() + 1,
+ ];
+ for &length in &lengths {
+ for &last_node in &[LastNode::No, LastNode::Yes] {
+ for &finalize in &[Finalize::No, Finalize::Yes] {
+ if !finalize.yes() && (length == 0 || length % BLOCKBYTES != 0) {
+ // Skip these cases, they're invalid.
+ continue;
+ }
+ for &count in counts {
+ // eprintln!("\ncase -----");
+ // dbg!(stride);
+ // dbg!(length);
+ // dbg!(last_node);
+ // dbg!(finalize);
+ // dbg!(count);
+
+ f(stride, length, last_node, finalize, count);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ fn initial_test_words(input_index: usize) -> [Word; 8] {
+ crate::Params::new()
+ .node_offset(input_index as u64)
+ .to_words()
+ }
+
+ // Use the portable implementation, one block at a time, to compute the
+ // final state words expected for a given test case.
+ fn reference_compression(
+ input: &[u8],
+ stride: Stride,
+ last_node: LastNode,
+ finalize: Finalize,
+ mut count: Count,
+ input_index: usize,
+ ) -> [Word; 8] {
+ let mut words = initial_test_words(input_index);
+ let mut offset = 0;
+ while offset == 0 || offset < input.len() {
+ let block_size = cmp::min(BLOCKBYTES, input.len() - offset);
+ let maybe_finalize = if offset + stride.padded_blockbytes() < input.len() {
+ Finalize::No
+ } else {
+ finalize
+ };
+ portable::compress1_loop(
+ &input[offset..][..block_size],
+ &mut words,
+ count,
+ last_node,
+ maybe_finalize,
+ Stride::Serial,
+ );
+ offset += stride.padded_blockbytes();
+ count = count.wrapping_add(BLOCKBYTES as Count);
+ }
+ words
+ }
+
+ // For various loop lengths and finalization parameters, make sure that the
+ // implementation gives the same answer as the portable implementation does
+ // when invoked one block at a time. (So even the portable implementation
+ // itself is being tested here, to make sure its loop is correct.) Note
+ // that this doesn't include any fixed test vectors; those are taken from
+ // the blake2-kat.json file (copied from upstream) and tested elsewhere.
+ fn exercise_compress1_loop(implementation: Implementation) {
+ let mut input = [0; 100 * BLOCKBYTES];
+ paint_test_input(&mut input);
+
+ exercise_cases(|stride, length, last_node, finalize, count| {
+ let reference_words =
+ reference_compression(&input[..length], stride, last_node, finalize, count, 0);
+
+ let mut test_words = initial_test_words(0);
+ implementation.compress1_loop(
+ &input[..length],
+ &mut test_words,
+ count,
+ last_node,
+ finalize,
+ stride,
+ );
+ assert_eq!(reference_words, test_words);
+ });
+ }
+
+ #[test]
+ fn test_compress1_loop_portable() {
+ exercise_compress1_loop(Implementation::portable());
+ }
+
+ #[test]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn test_compress1_loop_sse41() {
+ // Currently this just falls back to portable, but we test it anyway.
+ if let Some(imp) = Implementation::sse41_if_supported() {
+ exercise_compress1_loop(imp);
+ }
+ }
+
+ #[test]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn test_compress1_loop_avx2() {
+ if let Some(imp) = Implementation::avx2_if_supported() {
+ exercise_compress1_loop(imp);
+ }
+ }
+
+ // I use ArrayVec everywhere in here becuase currently these tests pass
+ // under no_std. I might decide that's not worth maintaining at some point,
+ // since really all we care about with no_std is that the library builds,
+ // but for now it's here. Everything is keyed off of this N constant so
+ // that it's easy to copy the code to exercise_compress4_loop.
+ fn exercise_compress2_loop(implementation: Implementation) {
+ const N: usize = 2;
+
+ let mut input_buffer = [0; 100 * BLOCKBYTES];
+ paint_test_input(&mut input_buffer);
+ let mut inputs = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ inputs.push(&input_buffer[i..]);
+ }
+
+ exercise_cases(|stride, length, last_node, finalize, count| {
+ let mut reference_words = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ let words = reference_compression(
+ &inputs[i][..length],
+ stride,
+ last_node,
+ finalize,
+ count.wrapping_add((i * BLOCKBYTES) as Count),
+ i,
+ );
+ reference_words.push(words);
+ }
+
+ let mut test_words = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ test_words.push(initial_test_words(i));
+ }
+ let mut jobs = ArrayVec::<[_; N]>::new();
+ for (i, words) in test_words.iter_mut().enumerate() {
+ jobs.push(Job {
+ input: &inputs[i][..length],
+ words,
+ count: count.wrapping_add((i * BLOCKBYTES) as Count),
+ last_node,
+ });
+ }
+ let mut jobs = jobs.into_inner().expect("full");
+ implementation.compress2_loop(&mut jobs, finalize, stride);
+
+ for i in 0..N {
+ assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
+ }
+ });
+ }
+
+ #[test]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn test_compress2_loop_sse41() {
+ if let Some(imp) = Implementation::sse41_if_supported() {
+ exercise_compress2_loop(imp);
+ }
+ }
+
+ #[test]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn test_compress2_loop_avx2() {
+ // Currently this just falls back to SSE4.1, but we test it anyway.
+ if let Some(imp) = Implementation::avx2_if_supported() {
+ exercise_compress2_loop(imp);
+ }
+ }
+
+ // Copied from exercise_compress2_loop, with a different value of N and an
+ // interior call to compress4_loop.
+ fn exercise_compress4_loop(implementation: Implementation) {
+ const N: usize = 4;
+
+ let mut input_buffer = [0; 100 * BLOCKBYTES];
+ paint_test_input(&mut input_buffer);
+ let mut inputs = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ inputs.push(&input_buffer[i..]);
+ }
+
+ exercise_cases(|stride, length, last_node, finalize, count| {
+ let mut reference_words = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ let words = reference_compression(
+ &inputs[i][..length],
+ stride,
+ last_node,
+ finalize,
+ count.wrapping_add((i * BLOCKBYTES) as Count),
+ i,
+ );
+ reference_words.push(words);
+ }
+
+ let mut test_words = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ test_words.push(initial_test_words(i));
+ }
+ let mut jobs = ArrayVec::<[_; N]>::new();
+ for (i, words) in test_words.iter_mut().enumerate() {
+ jobs.push(Job {
+ input: &inputs[i][..length],
+ words,
+ count: count.wrapping_add((i * BLOCKBYTES) as Count),
+ last_node,
+ });
+ }
+ let mut jobs = jobs.into_inner().expect("full");
+ implementation.compress4_loop(&mut jobs, finalize, stride);
+
+ for i in 0..N {
+ assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
+ }
+ });
+ }
+
+ #[test]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn test_compress4_loop_avx2() {
+ if let Some(imp) = Implementation::avx2_if_supported() {
+ exercise_compress4_loop(imp);
+ }
+ }
+
+ #[test]
+ fn sanity_check_count_size() {
+ assert_eq!(size_of::<Count>(), 2 * size_of::<Word>());
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/lib.rs b/third_party/rust/blake2b_simd/src/lib.rs
new file mode 100644
index 0000000000..cadef7f9a9
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/lib.rs
@@ -0,0 +1,674 @@
+//! [![GitHub](https://img.shields.io/github/tag/oconnor663/blake2_simd.svg?label=GitHub)](https://github.com/oconnor663/blake2_simd) [![crates.io](https://img.shields.io/crates/v/blake2b_simd.svg)](https://crates.io/crates/blake2b_simd) [![Build Status](https://travis-ci.org/oconnor663/blake2_simd.svg?branch=master)](https://travis-ci.org/oconnor663/blake2_simd)
+//!
+//! An implementation of the BLAKE2b and BLAKE2bp hash functions. See also
+//! [`blake2s_simd`](https://docs.rs/blake2s_simd).
+//!
+//! This crate includes:
+//!
+//! - 100% stable Rust.
+//! - SIMD implementations based on Samuel Neves' [`blake2-avx2`](https://github.com/sneves/blake2-avx2).
+//! These are very fast. For benchmarks, see [the Performance section of the
+//! README](https://github.com/oconnor663/blake2_simd#performance).
+//! - Portable, safe implementations for other platforms.
+//! - Dynamic CPU feature detection. Binaries include multiple implementations by default and
+//! choose the fastest one the processor supports at runtime.
+//! - All the features from the [the BLAKE2 spec](https://blake2.net/blake2.pdf), like adjustable
+//! length, keying, and associated data for tree hashing.
+//! - `no_std` support. The `std` Cargo feature is on by default, for CPU feature detection and
+//! for implementing `std::io::Write`.
+//! - Support for computing multiple BLAKE2b hashes in parallel, matching the efficiency of
+//! BLAKE2bp. See the [`many`](many/index.html) module.
+//!
+//! # Example
+//!
+//! ```
+//! use blake2b_simd::{blake2b, Params};
+//!
+//! let expected = "ca002330e69d3e6b84a46a56a6533fd79d51d97a3bb7cad6c2ff43b354185d6d\
+//! c1e723fb3db4ae0737e120378424c714bb982d9dc5bbd7a0ab318240ddd18f8d";
+//! let hash = blake2b(b"foo");
+//! assert_eq!(expected, &hash.to_hex());
+//!
+//! let hash = Params::new()
+//! .hash_length(16)
+//! .key(b"The Magic Words are Squeamish Ossifrage")
+//! .personal(b"L. P. Waterhouse")
+//! .to_state()
+//! .update(b"foo")
+//! .update(b"bar")
+//! .update(b"baz")
+//! .finalize();
+//! assert_eq!("ee8ff4e9be887297cf79348dc35dab56", &hash.to_hex());
+//! ```
+
+#![cfg_attr(not(feature = "std"), no_std)]
+
+use arrayref::{array_refs, mut_array_refs};
+use core::cmp;
+use core::fmt;
+use core::mem::size_of;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod avx2;
+mod portable;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod sse41;
+
+pub mod blake2bp;
+mod guts;
+pub mod many;
+
+#[cfg(test)]
+mod test;
+
+type Word = u64;
+type Count = u128;
+
+/// The max hash length.
+pub const OUTBYTES: usize = 8 * size_of::<Word>();
+/// The max key length.
+pub const KEYBYTES: usize = 8 * size_of::<Word>();
+/// The max salt length.
+pub const SALTBYTES: usize = 2 * size_of::<Word>();
+/// The max personalization length.
+pub const PERSONALBYTES: usize = 2 * size_of::<Word>();
+/// The number input bytes passed to each call to the compression function. Small benchmarks need
+/// to use an even multiple of `BLOCKBYTES`, or else their apparent throughput will be low.
+pub const BLOCKBYTES: usize = 16 * size_of::<Word>();
+
+const IV: [Word; 8] = [
+ 0x6A09E667F3BCC908,
+ 0xBB67AE8584CAA73B,
+ 0x3C6EF372FE94F82B,
+ 0xA54FF53A5F1D36F1,
+ 0x510E527FADE682D1,
+ 0x9B05688C2B3E6C1F,
+ 0x1F83D9ABFB41BD6B,
+ 0x5BE0CD19137E2179,
+];
+
+const SIGMA: [[u8; 16]; 12] = [
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
+ [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
+ [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
+ [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
+ [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
+ [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
+ [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
+ [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
+ [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
+];
+
+/// Compute the BLAKE2b hash of a slice of bytes all at once, using default
+/// parameters.
+///
+/// # Example
+///
+/// ```
+/// # use blake2b_simd::{blake2b, Params};
+/// let expected = "ca002330e69d3e6b84a46a56a6533fd79d51d97a3bb7cad6c2ff43b354185d6d\
+/// c1e723fb3db4ae0737e120378424c714bb982d9dc5bbd7a0ab318240ddd18f8d";
+/// let hash = blake2b(b"foo");
+/// assert_eq!(expected, &hash.to_hex());
+/// ```
+pub fn blake2b(input: &[u8]) -> Hash {
+ Params::new().hash(input)
+}
+
+/// A parameter builder that exposes all the non-default BLAKE2 features.
+///
+/// Apart from `hash_length`, which controls the length of the final `Hash`,
+/// all of these parameters are just associated data that gets mixed with the
+/// input. For more details, see [the BLAKE2 spec](https://blake2.net/blake2.pdf).
+///
+/// Several of the parameters have a valid range defined in the spec and
+/// documented below. Trying to set an invalid parameter will panic.
+///
+/// # Example
+///
+/// ```
+/// # use blake2b_simd::Params;
+/// // Create a Params object with a secret key and a non-default length.
+/// let mut params = Params::new();
+/// params.key(b"my secret key");
+/// params.hash_length(16);
+///
+/// // Use those params to hash an input all at once.
+/// let hash = params.hash(b"my input");
+///
+/// // Or use those params to build an incremental State.
+/// let mut state = params.to_state();
+/// ```
+#[derive(Clone)]
+pub struct Params {
+ hash_length: u8,
+ key_length: u8,
+ key_block: [u8; BLOCKBYTES],
+ salt: [u8; SALTBYTES],
+ personal: [u8; PERSONALBYTES],
+ fanout: u8,
+ max_depth: u8,
+ max_leaf_length: u32,
+ node_offset: u64,
+ node_depth: u8,
+ inner_hash_length: u8,
+ last_node: guts::LastNode,
+ implementation: guts::Implementation,
+}
+
+impl Params {
+ /// Equivalent to `Params::default()`.
+ #[inline]
+ pub fn new() -> Self {
+ Self {
+ hash_length: OUTBYTES as u8,
+ key_length: 0,
+ key_block: [0; BLOCKBYTES],
+ salt: [0; SALTBYTES],
+ personal: [0; PERSONALBYTES],
+ // NOTE: fanout and max_depth don't default to zero!
+ fanout: 1,
+ max_depth: 1,
+ max_leaf_length: 0,
+ node_offset: 0,
+ node_depth: 0,
+ inner_hash_length: 0,
+ last_node: guts::LastNode::No,
+ implementation: guts::Implementation::detect(),
+ }
+ }
+
+ #[inline(always)]
+ fn to_words(&self) -> [Word; 8] {
+ let (salt_left, salt_right) = array_refs!(&self.salt, SALTBYTES / 2, SALTBYTES / 2);
+ let (personal_left, personal_right) =
+ array_refs!(&self.personal, PERSONALBYTES / 2, PERSONALBYTES / 2);
+ [
+ IV[0]
+ ^ self.hash_length as u64
+ ^ (self.key_length as u64) << 8
+ ^ (self.fanout as u64) << 16
+ ^ (self.max_depth as u64) << 24
+ ^ (self.max_leaf_length as u64) << 32,
+ IV[1] ^ self.node_offset,
+ IV[2] ^ self.node_depth as u64 ^ (self.inner_hash_length as u64) << 8,
+ IV[3],
+ IV[4] ^ Word::from_le_bytes(*salt_left),
+ IV[5] ^ Word::from_le_bytes(*salt_right),
+ IV[6] ^ Word::from_le_bytes(*personal_left),
+ IV[7] ^ Word::from_le_bytes(*personal_right),
+ ]
+ }
+
+ /// Hash an input all at once with these parameters.
+ #[inline]
+ pub fn hash(&self, input: &[u8]) -> Hash {
+ // If there's a key, just fall back to using the State.
+ if self.key_length > 0 {
+ return self.to_state().update(input).finalize();
+ }
+ let mut words = self.to_words();
+ self.implementation.compress1_loop(
+ input,
+ &mut words,
+ 0,
+ self.last_node,
+ guts::Finalize::Yes,
+ guts::Stride::Serial,
+ );
+ Hash {
+ bytes: state_words_to_bytes(&words),
+ len: self.hash_length,
+ }
+ }
+
+ /// Construct a `State` object based on these parameters, for hashing input
+ /// incrementally.
+ pub fn to_state(&self) -> State {
+ State::with_params(self)
+ }
+
+ /// Set the length of the final hash in bytes, from 1 to `OUTBYTES` (64). Apart from
+ /// controlling the length of the final `Hash`, this is also associated data, and changing it
+ /// will result in a totally different hash.
+ #[inline]
+ pub fn hash_length(&mut self, length: usize) -> &mut Self {
+ assert!(
+ 1 <= length && length <= OUTBYTES,
+ "Bad hash length: {}",
+ length
+ );
+ self.hash_length = length as u8;
+ self
+ }
+
+ /// Use a secret key, so that BLAKE2 acts as a MAC. The maximum key length is `KEYBYTES` (64).
+ /// An empty key is equivalent to having no key at all.
+ #[inline]
+ pub fn key(&mut self, key: &[u8]) -> &mut Self {
+ assert!(key.len() <= KEYBYTES, "Bad key length: {}", key.len());
+ self.key_length = key.len() as u8;
+ self.key_block = [0; BLOCKBYTES];
+ self.key_block[..key.len()].copy_from_slice(key);
+ self
+ }
+
+ /// At most `SALTBYTES` (16). Shorter salts are padded with null bytes. An empty salt is
+ /// equivalent to having no salt at all.
+ #[inline]
+ pub fn salt(&mut self, salt: &[u8]) -> &mut Self {
+ assert!(salt.len() <= SALTBYTES, "Bad salt length: {}", salt.len());
+ self.salt = [0; SALTBYTES];
+ self.salt[..salt.len()].copy_from_slice(salt);
+ self
+ }
+
+ /// At most `PERSONALBYTES` (16). Shorter personalizations are padded with null bytes. An empty
+ /// personalization is equivalent to having no personalization at all.
+ #[inline]
+ pub fn personal(&mut self, personalization: &[u8]) -> &mut Self {
+ assert!(
+ personalization.len() <= PERSONALBYTES,
+ "Bad personalization length: {}",
+ personalization.len()
+ );
+ self.personal = [0; PERSONALBYTES];
+ self.personal[..personalization.len()].copy_from_slice(personalization);
+ self
+ }
+
+ /// From 0 (meaning unlimited) to 255. The default is 1 (meaning sequential).
+ #[inline]
+ pub fn fanout(&mut self, fanout: u8) -> &mut Self {
+ self.fanout = fanout;
+ self
+ }
+
+ /// From 0 (meaning BLAKE2X B2 hashes), through 1 (the default, meaning sequential) to 255 (meaning unlimited).
+ #[inline]
+ pub fn max_depth(&mut self, depth: u8) -> &mut Self {
+ self.max_depth = depth;
+ self
+ }
+
+ /// From 0 (the default, meaning unlimited or sequential) to `2^32 - 1`.
+ #[inline]
+ pub fn max_leaf_length(&mut self, length: u32) -> &mut Self {
+ self.max_leaf_length = length;
+ self
+ }
+
+ /// From 0 (the default, meaning first, leftmost, leaf, or sequential) to `2^64 - 1`.
+ #[inline]
+ pub fn node_offset(&mut self, offset: u64) -> &mut Self {
+ self.node_offset = offset;
+ self
+ }
+
+ /// From 0 (the default, meaning leaf or sequential) to 255.
+ #[inline]
+ pub fn node_depth(&mut self, depth: u8) -> &mut Self {
+ self.node_depth = depth;
+ self
+ }
+
+ /// From 0 (the default, meaning sequential) to `OUTBYTES` (64).
+ #[inline]
+ pub fn inner_hash_length(&mut self, length: usize) -> &mut Self {
+ assert!(length <= OUTBYTES, "Bad inner hash length: {}", length);
+ self.inner_hash_length = length as u8;
+ self
+ }
+
+ /// Indicates the rightmost node in a row. This can also be changed on the
+ /// `State` object, potentially after hashing has begun. See
+ /// [`State::set_last_node`].
+ ///
+ /// [`State::set_last_node`]: struct.State.html#method.set_last_node
+ #[inline]
+ pub fn last_node(&mut self, last_node: bool) -> &mut Self {
+ self.last_node = if last_node {
+ guts::LastNode::Yes
+ } else {
+ guts::LastNode::No
+ };
+ self
+ }
+}
+
+impl Default for Params {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl fmt::Debug for Params {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(
+ f,
+ "Params {{ hash_length: {}, key_length: {}, salt: {:?}, personal: {:?}, fanout: {}, \
+ max_depth: {}, max_leaf_length: {}, node_offset: {}, node_depth: {}, \
+ inner_hash_length: {}, last_node: {} }}",
+ self.hash_length,
+ // NB: Don't print the key itself. Debug shouldn't leak secrets.
+ self.key_length,
+ &self.salt,
+ &self.personal,
+ self.fanout,
+ self.max_depth,
+ self.max_leaf_length,
+ self.node_offset,
+ self.node_depth,
+ self.inner_hash_length,
+ self.last_node.yes(),
+ )
+ }
+}
+
+/// An incremental hasher for BLAKE2b.
+///
+/// To construct a `State` with non-default parameters, see `Params::to_state`.
+///
+/// # Example
+///
+/// ```
+/// use blake2b_simd::{State, blake2b};
+///
+/// let mut state = blake2b_simd::State::new();
+///
+/// state.update(b"foo");
+/// assert_eq!(blake2b(b"foo"), state.finalize());
+///
+/// state.update(b"bar");
+/// assert_eq!(blake2b(b"foobar"), state.finalize());
+/// ```
+#[derive(Clone)]
+pub struct State {
+ words: [Word; 8],
+ count: Count,
+ buf: [u8; BLOCKBYTES],
+ buflen: u8,
+ last_node: guts::LastNode,
+ hash_length: u8,
+ implementation: guts::Implementation,
+ is_keyed: bool,
+}
+
+impl State {
+ /// Equivalent to `State::default()` or `Params::default().to_state()`.
+ pub fn new() -> Self {
+ Self::with_params(&Params::default())
+ }
+
+ fn with_params(params: &Params) -> Self {
+ let mut state = Self {
+ words: params.to_words(),
+ count: 0,
+ buf: [0; BLOCKBYTES],
+ buflen: 0,
+ last_node: params.last_node,
+ hash_length: params.hash_length,
+ implementation: params.implementation,
+ is_keyed: params.key_length > 0,
+ };
+ if state.is_keyed {
+ state.buf = params.key_block;
+ state.buflen = state.buf.len() as u8;
+ }
+ state
+ }
+
+ fn fill_buf(&mut self, input: &mut &[u8]) {
+ let take = cmp::min(BLOCKBYTES - self.buflen as usize, input.len());
+ self.buf[self.buflen as usize..self.buflen as usize + take].copy_from_slice(&input[..take]);
+ self.buflen += take as u8;
+ *input = &input[take..];
+ }
+
+ // If the state already has some input in its buffer, try to fill the buffer and perform a
+ // compression. However, only do the compression if there's more input coming, otherwise it
+ // will give the wrong hash it the caller finalizes immediately after.
+ fn compress_buffer_if_possible(&mut self, input: &mut &[u8]) {
+ if self.buflen > 0 {
+ self.fill_buf(input);
+ if !input.is_empty() {
+ self.implementation.compress1_loop(
+ &self.buf,
+ &mut self.words,
+ self.count,
+ self.last_node,
+ guts::Finalize::No,
+ guts::Stride::Serial,
+ );
+ self.count = self.count.wrapping_add(BLOCKBYTES as Count);
+ self.buflen = 0;
+ }
+ }
+ }
+
+ /// Add input to the hash. You can call `update` any number of times.
+ pub fn update(&mut self, mut input: &[u8]) -> &mut Self {
+ // If we have a partial buffer, try to complete it.
+ self.compress_buffer_if_possible(&mut input);
+ // While there's more than a block of input left (which also means we cleared the buffer
+ // above), compress blocks directly without copying.
+ let mut end = input.len().saturating_sub(1);
+ end -= end % BLOCKBYTES;
+ if end > 0 {
+ self.implementation.compress1_loop(
+ &input[..end],
+ &mut self.words,
+ self.count,
+ self.last_node,
+ guts::Finalize::No,
+ guts::Stride::Serial,
+ );
+ self.count = self.count.wrapping_add(end as Count);
+ input = &input[end..];
+ }
+ // Buffer any remaining input, to be either compressed or finalized in a subsequent call.
+ // Note that this represents some copying overhead, which in theory we could avoid in
+ // all-at-once setting. A function hardcoded for exactly BLOCKSIZE input bytes is about 10%
+ // faster than using this implementation for the same input.
+ self.fill_buf(&mut input);
+ self
+ }
+
+ /// Finalize the state and return a `Hash`. This method is idempotent, and calling it multiple
+ /// times will give the same result. It's also possible to `update` with more input in between.
+ pub fn finalize(&self) -> Hash {
+ let mut words_copy = self.words;
+ self.implementation.compress1_loop(
+ &self.buf[..self.buflen as usize],
+ &mut words_copy,
+ self.count,
+ self.last_node,
+ guts::Finalize::Yes,
+ guts::Stride::Serial,
+ );
+ Hash {
+ bytes: state_words_to_bytes(&words_copy),
+ len: self.hash_length,
+ }
+ }
+
+ /// Set a flag indicating that this is the last node of its level in a tree hash. This is
+ /// equivalent to [`Params::last_node`], except that it can be set at any time before calling
+ /// `finalize`. That allows callers to begin hashing a node without knowing ahead of time
+ /// whether it's the last in its level. For more details about the intended use of this flag
+ /// [the BLAKE2 spec].
+ ///
+ /// [`Params::last_node`]: struct.Params.html#method.last_node
+ /// [the BLAKE2 spec]: https://blake2.net/blake2.pdf
+ pub fn set_last_node(&mut self, last_node: bool) -> &mut Self {
+ self.last_node = if last_node {
+ guts::LastNode::Yes
+ } else {
+ guts::LastNode::No
+ };
+ self
+ }
+
+ /// Return the total number of bytes input so far.
+ ///
+ /// Note that `count` doesn't include the bytes of the key block, if any.
+ /// It's exactly the total number of input bytes fed to `update`.
+ pub fn count(&self) -> Count {
+ let mut ret = self.count.wrapping_add(self.buflen as Count);
+ if self.is_keyed {
+ ret -= BLOCKBYTES as Count;
+ }
+ ret
+ }
+}
+
+#[inline(always)]
+fn state_words_to_bytes(state_words: &[Word; 8]) -> [u8; OUTBYTES] {
+ let mut bytes = [0; OUTBYTES];
+ {
+ const W: usize = size_of::<Word>();
+ let refs = mut_array_refs!(&mut bytes, W, W, W, W, W, W, W, W);
+ *refs.0 = state_words[0].to_le_bytes();
+ *refs.1 = state_words[1].to_le_bytes();
+ *refs.2 = state_words[2].to_le_bytes();
+ *refs.3 = state_words[3].to_le_bytes();
+ *refs.4 = state_words[4].to_le_bytes();
+ *refs.5 = state_words[5].to_le_bytes();
+ *refs.6 = state_words[6].to_le_bytes();
+ *refs.7 = state_words[7].to_le_bytes();
+ }
+ bytes
+}
+
+#[cfg(feature = "std")]
+impl std::io::Write for State {
+ fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+ self.update(buf);
+ Ok(buf.len())
+ }
+
+ fn flush(&mut self) -> std::io::Result<()> {
+ Ok(())
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ // NB: Don't print the words. Leaking them would allow length extension.
+ write!(
+ f,
+ "State {{ count: {}, hash_length: {}, last_node: {} }}",
+ self.count(),
+ self.hash_length,
+ self.last_node.yes(),
+ )
+ }
+}
+
+impl Default for State {
+ fn default() -> Self {
+ Self::with_params(&Params::default())
+ }
+}
+
+type HexString = arrayvec::ArrayString<[u8; 2 * OUTBYTES]>;
+
+/// A finalized BLAKE2 hash, with constant-time equality.
+#[derive(Clone, Copy)]
+pub struct Hash {
+ bytes: [u8; OUTBYTES],
+ len: u8,
+}
+
+impl Hash {
+ /// Convert the hash to a byte slice. Note that if you're using BLAKE2 as a MAC, you need
+ /// constant time equality, which `&[u8]` doesn't provide.
+ pub fn as_bytes(&self) -> &[u8] {
+ &self.bytes[..self.len as usize]
+ }
+
+ /// Convert the hash to a byte array. Note that if you're using BLAKE2 as a
+ /// MAC, you need constant time equality, which arrays don't provide. This
+ /// panics in debug mode if the length of the hash isn't `OUTBYTES`.
+ #[inline]
+ pub fn as_array(&self) -> &[u8; OUTBYTES] {
+ debug_assert_eq!(self.len as usize, OUTBYTES);
+ &self.bytes
+ }
+
+ /// Convert the hash to a lowercase hexadecimal
+ /// [`ArrayString`](https://docs.rs/arrayvec/0.4/arrayvec/struct.ArrayString.html).
+ pub fn to_hex(&self) -> HexString {
+ bytes_to_hex(self.as_bytes())
+ }
+}
+
+fn bytes_to_hex(bytes: &[u8]) -> HexString {
+ let mut s = arrayvec::ArrayString::new();
+ let table = b"0123456789abcdef";
+ for &b in bytes {
+ s.push(table[(b >> 4) as usize] as char);
+ s.push(table[(b & 0xf) as usize] as char);
+ }
+ s
+}
+
+/// This implementation is constant time, if the two hashes are the same length.
+impl PartialEq for Hash {
+ fn eq(&self, other: &Hash) -> bool {
+ constant_time_eq::constant_time_eq(&self.as_bytes(), &other.as_bytes())
+ }
+}
+
+/// This implementation is constant time, if the slice is the same length as the hash.
+impl PartialEq<[u8]> for Hash {
+ fn eq(&self, other: &[u8]) -> bool {
+ constant_time_eq::constant_time_eq(&self.as_bytes(), other)
+ }
+}
+
+impl Eq for Hash {}
+
+impl AsRef<[u8]> for Hash {
+ fn as_ref(&self) -> &[u8] {
+ self.as_bytes()
+ }
+}
+
+impl fmt::Debug for Hash {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "Hash(0x{})", self.to_hex())
+ }
+}
+
+// Paint a byte pattern that won't repeat, so that we don't accidentally miss
+// buffer offset bugs. This is the same as what Bao uses in its tests.
+#[cfg(test)]
+fn paint_test_input(buf: &mut [u8]) {
+ let mut offset = 0;
+ let mut counter: u32 = 1;
+ while offset < buf.len() {
+ let bytes = counter.to_le_bytes();
+ let take = cmp::min(bytes.len(), buf.len() - offset);
+ buf[offset..][..take].copy_from_slice(&bytes[..take]);
+ counter += 1;
+ offset += take;
+ }
+}
+
+// This module is pub for internal benchmarks only. Please don't use it.
+#[doc(hidden)]
+pub mod benchmarks {
+ use super::*;
+
+ pub fn force_portable(params: &mut Params) {
+ params.implementation = guts::Implementation::portable();
+ }
+
+ pub fn force_portable_blake2bp(params: &mut blake2bp::Params) {
+ blake2bp::force_portable(params);
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/many.rs b/third_party/rust/blake2b_simd/src/many.rs
new file mode 100644
index 0000000000..1588a3fb92
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/many.rs
@@ -0,0 +1,529 @@
+//! Interfaces for hashing multiple inputs at once, using SIMD more
+//! efficiently.
+//!
+//! The throughput of these interfaces is comparable to BLAKE2bp, about twice
+//! the throughput of regular BLAKE2b when AVX2 is available.
+//!
+//! These interfaces can accept any number of inputs, and the implementation
+//! does its best to parallelize them. In general, the more inputs you can pass
+//! in at once the better. If you need to batch your inputs in smaller groups,
+//! see the [`degree`](fn.degree.html) function for a good batch size.
+//!
+//! The implementation keeps working in parallel even when inputs are of
+//! different lengths, by managing a working set of jobs whose input isn't yet
+//! exhausted. However, if one or two inputs are much longer than the others,
+//! and they're encountered only at the end, there might not be any remaining
+//! work to parallelize them with. In this case, sorting the inputs
+//! longest-first can improve parallelism.
+//!
+//! # Example
+//!
+//! ```
+//! use blake2b_simd::{blake2b, State, many::update_many};
+//!
+//! let mut states = [
+//! State::new(),
+//! State::new(),
+//! State::new(),
+//! State::new(),
+//! ];
+//!
+//! let inputs = [
+//! &b"foo"[..],
+//! &b"bar"[..],
+//! &b"baz"[..],
+//! &b"bing"[..],
+//! ];
+//!
+//! update_many(states.iter_mut().zip(inputs.iter()));
+//!
+//! for (state, input) in states.iter_mut().zip(inputs.iter()) {
+//! assert_eq!(blake2b(input), state.finalize());
+//! }
+//! ```
+
+use crate::guts::{self, Finalize, Implementation, Job, LastNode, Stride};
+use crate::state_words_to_bytes;
+use crate::Count;
+use crate::Hash;
+use crate::Params;
+use crate::State;
+use crate::Word;
+use crate::BLOCKBYTES;
+use arrayref::array_mut_ref;
+use arrayvec::ArrayVec;
+use core::fmt;
+
+/// The largest possible value of [`degree`](fn.degree.html) on the target
+/// platform.
+///
+/// Note that this constant reflects the parallelism degree supported by this
+/// crate, so it will change over time as support is added or removed. For
+/// example, when Rust stabilizes AVX-512 support and this crate adds an
+/// AVX-512 implementation, this constant will double on x86 targets. If that
+/// implementation is an optional feature (e.g. because it's nightly-only), the
+/// value of this constant will depend on that optional feature also.
+pub const MAX_DEGREE: usize = guts::MAX_DEGREE;
+
+/// The parallelism degree of the implementation, detected at runtime. If you
+/// hash your inputs in small batches, making the batch size a multiple of
+/// `degree` will generally give good performance.
+///
+/// For example, an x86 processor that supports AVX2 can compute four BLAKE2b
+/// hashes in parallel, so `degree` returns 4 on that machine. If you call
+/// [`hash_many`] with only three inputs, that's not enough to use the AVX2
+/// implementation, and your average throughput will be lower. Likewise if you
+/// call it with five inputs of equal length, the first four will be hashed in
+/// parallel with AVX2, but the last one will have to be hashed by itself, and
+/// again your average throughput will be lower.
+///
+/// As noted in the module level docs, performance is more complicated if your
+/// inputs are of different lengths. When parallelizing long and short inputs
+/// together, the longer ones will have bytes left over, and the implementation
+/// will try to parallelize those leftover bytes with subsequent inputs. The
+/// more inputs available in that case, the more the implementation will be
+/// able to parallelize.
+///
+/// If you need a constant batch size, for example to collect inputs in an
+/// array, see [`MAX_DEGREE`].
+///
+/// [`hash_many`]: fn.hash_many.html
+/// [`MAX_DEGREE`]: constant.MAX_DEGREE.html
+pub fn degree() -> usize {
+ guts::Implementation::detect().degree()
+}
+
+type JobsVec<'a, 'b> = ArrayVec<[Job<'a, 'b>; guts::MAX_DEGREE]>;
+
+#[inline(always)]
+fn fill_jobs_vec<'a, 'b>(
+ jobs_iter: &mut impl Iterator<Item = Job<'a, 'b>>,
+ vec: &mut JobsVec<'a, 'b>,
+ target_len: usize,
+) {
+ while vec.len() < target_len {
+ if let Some(job) = jobs_iter.next() {
+ vec.push(job);
+ } else {
+ break;
+ }
+ }
+}
+
+#[inline(always)]
+fn evict_finished<'a, 'b>(vec: &mut JobsVec<'a, 'b>, num_jobs: usize) {
+ // Iterate backwards so that removal doesn't cause an out-of-bounds panic.
+ for i in (0..num_jobs).rev() {
+ // Note that is_empty() is only valid because we know all these jobs
+ // have been run at least once. Otherwise we could confuse the empty
+ // input for a finished job, which would be incorrect.
+ //
+ // Avoid a panic branch here in release mode.
+ debug_assert!(vec.len() > i);
+ if vec.len() > i && vec[i].input.is_empty() {
+ // Note that calling pop_at() repeatedly has some overhead, because
+ // later elements need to be shifted up. However, the JobsVec is
+ // small, and this approach guarantees that jobs are encountered in
+ // order.
+ vec.pop_at(i);
+ }
+ }
+}
+
+pub(crate) fn compress_many<'a, 'b, I>(
+ jobs: I,
+ imp: Implementation,
+ finalize: Finalize,
+ stride: Stride,
+) where
+ I: IntoIterator<Item = Job<'a, 'b>>,
+{
+ // Fuse is important for correctness, since each of these blocks tries to
+ // advance the iterator, even if a previous block emptied it.
+ let mut jobs_iter = jobs.into_iter().fuse();
+ let mut jobs_vec = JobsVec::new();
+
+ if imp.degree() >= 4 {
+ loop {
+ fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 4);
+ if jobs_vec.len() < 4 {
+ break;
+ }
+ let jobs_array = array_mut_ref!(jobs_vec, 0, 4);
+ imp.compress4_loop(jobs_array, finalize, stride);
+ evict_finished(&mut jobs_vec, 4);
+ }
+ }
+
+ if imp.degree() >= 2 {
+ loop {
+ fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 2);
+ if jobs_vec.len() < 2 {
+ break;
+ }
+ let jobs_array = array_mut_ref!(jobs_vec, 0, 2);
+ imp.compress2_loop(jobs_array, finalize, stride);
+ evict_finished(&mut jobs_vec, 2);
+ }
+ }
+
+ for job in jobs_vec.into_iter().chain(jobs_iter) {
+ let Job {
+ input,
+ words,
+ count,
+ last_node,
+ } = job;
+ imp.compress1_loop(input, words, count, last_node, finalize, stride);
+ }
+}
+
+/// Update any number of `State` objects at once.
+///
+/// # Example
+///
+/// ```
+/// use blake2b_simd::{blake2b, State, many::update_many};
+///
+/// let mut states = [
+/// State::new(),
+/// State::new(),
+/// State::new(),
+/// State::new(),
+/// ];
+///
+/// let inputs = [
+/// &b"foo"[..],
+/// &b"bar"[..],
+/// &b"baz"[..],
+/// &b"bing"[..],
+/// ];
+///
+/// update_many(states.iter_mut().zip(inputs.iter()));
+///
+/// for (state, input) in states.iter_mut().zip(inputs.iter()) {
+/// assert_eq!(blake2b(input), state.finalize());
+/// }
+/// ```
+pub fn update_many<'a, 'b, I, T>(pairs: I)
+where
+ I: IntoIterator<Item = (&'a mut State, &'b T)>,
+ T: 'b + AsRef<[u8]> + ?Sized,
+{
+ // Get the guts::Implementation from the first state, if any.
+ let mut peekable_pairs = pairs.into_iter().peekable();
+ let implementation = if let Some((state, _)) = peekable_pairs.peek() {
+ state.implementation
+ } else {
+ // No work items, just short circuit.
+ return;
+ };
+
+ // Adapt the pairs iterator into a Jobs iterator, but skip over the Jobs
+ // where there's not actually any work to do (e.g. because there's not much
+ // input and it's all just going in the State buffer).
+ let jobs = peekable_pairs.flat_map(|(state, input_t)| {
+ let mut input = input_t.as_ref();
+ // For each pair, if the State has some input in its buffer, try to
+ // finish that buffer. If there wasn't enough input to do that --
+ // or if the input was empty to begin with -- skip this pair.
+ state.compress_buffer_if_possible(&mut input);
+ if input.is_empty() {
+ return None;
+ }
+ // Now we know the buffer is empty and there's more input. Make sure we
+ // buffer the final block, because update() doesn't finalize.
+ let mut last_block_start = input.len() - 1;
+ last_block_start -= last_block_start % BLOCKBYTES;
+ let (blocks, last_block) = input.split_at(last_block_start);
+ state.buf[..last_block.len()].copy_from_slice(last_block);
+ state.buflen = last_block.len() as u8;
+ // Finally, if the full blocks slice is non-empty, prepare that job for
+ // compression, and bump the State count.
+ if blocks.is_empty() {
+ None
+ } else {
+ let count = state.count;
+ state.count = state.count.wrapping_add(blocks.len() as Count);
+ Some(Job {
+ input: blocks,
+ words: &mut state.words,
+ count,
+ last_node: state.last_node,
+ })
+ }
+ });
+
+ // Run all the Jobs in the iterator.
+ compress_many(jobs, implementation, Finalize::No, Stride::Serial);
+}
+
+/// A job for the [`hash_many`] function. After calling [`hash_many`] on a
+/// collection of `HashManyJob` objects, you can call [`to_hash`] on each job
+/// to get the result.
+///
+/// [`hash_many`]: fn.hash_many.html
+/// [`to_hash`]: struct.HashManyJob.html#method.to_hash
+#[derive(Clone)]
+pub struct HashManyJob<'a> {
+ words: [Word; 8],
+ count: Count,
+ last_node: LastNode,
+ hash_length: u8,
+ input: &'a [u8],
+ finished: bool,
+ implementation: guts::Implementation,
+}
+
+impl<'a> HashManyJob<'a> {
+ /// Construct a new `HashManyJob` from a set of hashing parameters and an
+ /// input.
+ #[inline]
+ pub fn new(params: &Params, input: &'a [u8]) -> Self {
+ let mut words = params.to_words();
+ let mut count = 0;
+ let mut finished = false;
+ // If we have key bytes, compress them into the state words. If there's
+ // no additional input, this compression needs to finalize and set
+ // finished=true.
+ if params.key_length > 0 {
+ let mut finalization = Finalize::No;
+ if input.is_empty() {
+ finalization = Finalize::Yes;
+ finished = true;
+ }
+ params.implementation.compress1_loop(
+ &params.key_block,
+ &mut words,
+ 0,
+ params.last_node,
+ finalization,
+ Stride::Serial,
+ );
+ count = BLOCKBYTES as Count;
+ }
+ Self {
+ words,
+ count,
+ last_node: params.last_node,
+ hash_length: params.hash_length,
+ input,
+ finished,
+ implementation: params.implementation,
+ }
+ }
+
+ /// Get the hash from a finished job. If you call this before calling
+ /// [`hash_many`], it will panic in debug mode.
+ ///
+ /// [`hash_many`]: fn.hash_many.html
+ #[inline]
+ pub fn to_hash(&self) -> Hash {
+ debug_assert!(self.finished, "job hasn't been run yet");
+ Hash {
+ bytes: state_words_to_bytes(&self.words),
+ len: self.hash_length,
+ }
+ }
+}
+
+impl<'a> fmt::Debug for HashManyJob<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ // NB: Don't print the words. Leaking them would allow length extension.
+ write!(
+ f,
+ "HashManyJob {{ count: {}, hash_length: {}, last_node: {}, input_len: {} }}",
+ self.count,
+ self.hash_length,
+ self.last_node.yes(),
+ self.input.len(),
+ )
+ }
+}
+
+/// Hash any number of complete inputs all at once.
+///
+/// This is slightly more efficient than using `update_many` with `State`
+/// objects, because it doesn't need to do any buffering.
+///
+/// Running `hash_many` on the same `HashManyJob` object more than once has no
+/// effect.
+///
+/// # Example
+///
+/// ```
+/// use blake2b_simd::{blake2b, Params, many::{HashManyJob, hash_many}};
+///
+/// let inputs = [
+/// &b"foo"[..],
+/// &b"bar"[..],
+/// &b"baz"[..],
+/// &b"bing"[..],
+/// ];
+///
+/// let mut params = Params::new();
+/// params.hash_length(16);
+///
+/// let mut jobs = [
+/// HashManyJob::new(&params, inputs[0]),
+/// HashManyJob::new(&params, inputs[1]),
+/// HashManyJob::new(&params, inputs[2]),
+/// HashManyJob::new(&params, inputs[3]),
+/// ];
+///
+/// hash_many(jobs.iter_mut());
+///
+/// for (input, job) in inputs.iter().zip(jobs.iter()) {
+/// let expected = params.hash(input);
+/// assert_eq!(expected, job.to_hash());
+/// }
+/// ```
+pub fn hash_many<'a, 'b, I>(hash_many_jobs: I)
+where
+ 'b: 'a,
+ I: IntoIterator<Item = &'a mut HashManyJob<'b>>,
+{
+ // Get the guts::Implementation from the first job, if any.
+ let mut peekable_jobs = hash_many_jobs.into_iter().peekable();
+ let implementation = if let Some(job) = peekable_jobs.peek() {
+ job.implementation
+ } else {
+ // No work items, just short circuit.
+ return;
+ };
+
+ // In the jobs iterator, skip HashManyJobs that have already been run. This
+ // is less because we actually expect callers to call hash_many twice
+ // (though they're allowed to if they want), and more because
+ // HashManyJob::new might need to finalize if there are key bytes but no
+ // input. Tying the job lifetime to the Params reference is an alternative,
+ // but I've found it too constraining in practice. We could also put key
+ // bytes in every HashManyJob, but that would add unnecessary storage and
+ // zeroing for all callers.
+ let unfinished_jobs = peekable_jobs.into_iter().filter(|j| !j.finished);
+ let jobs = unfinished_jobs.map(|j| {
+ j.finished = true;
+ Job {
+ input: j.input,
+ words: &mut j.words,
+ count: j.count,
+ last_node: j.last_node,
+ }
+ });
+ compress_many(jobs, implementation, Finalize::Yes, Stride::Serial);
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+ use crate::guts;
+ use crate::paint_test_input;
+ use crate::BLOCKBYTES;
+ use arrayvec::ArrayVec;
+
+ #[test]
+ fn test_degree() {
+ assert!(degree() <= MAX_DEGREE);
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("avx2") {
+ assert!(degree() >= 4);
+ }
+ if is_x86_feature_detected!("sse4.1") {
+ assert!(degree() >= 2);
+ }
+ }
+ }
+
+ #[test]
+ fn test_hash_many() {
+ // Use a length of inputs that will exercise all of the power-of-two loops.
+ const LEN: usize = 2 * guts::MAX_DEGREE - 1;
+
+ // Rerun LEN inputs LEN different times, with the empty input starting in a
+ // different spot each time.
+ let mut input = [0; LEN * BLOCKBYTES];
+ paint_test_input(&mut input);
+ for start_offset in 0..LEN {
+ let mut inputs: [&[u8]; LEN] = [&[]; LEN];
+ for i in 0..LEN {
+ let chunks = (i + start_offset) % LEN;
+ inputs[i] = &input[..chunks * BLOCKBYTES];
+ }
+
+ let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new();
+ for i in 0..LEN {
+ let mut p = Params::new();
+ p.node_offset(i as u64);
+ if i % 2 == 1 {
+ p.last_node(true);
+ p.key(b"foo");
+ }
+ params.push(p);
+ }
+
+ let mut jobs: ArrayVec<[HashManyJob; LEN]> = ArrayVec::new();
+ for i in 0..LEN {
+ jobs.push(HashManyJob::new(&params[i], inputs[i]));
+ }
+
+ hash_many(&mut jobs);
+
+ // Check the outputs.
+ for i in 0..LEN {
+ let expected = params[i].hash(inputs[i]);
+ assert_eq!(expected, jobs[i].to_hash());
+ }
+ }
+ }
+
+ #[test]
+ fn test_update_many() {
+ // Use a length of inputs that will exercise all of the power-of-two loops.
+ const LEN: usize = 2 * guts::MAX_DEGREE - 1;
+
+ // Rerun LEN inputs LEN different times, with the empty input starting in a
+ // different spot each time.
+ let mut input = [0; LEN * BLOCKBYTES];
+ paint_test_input(&mut input);
+ for start_offset in 0..LEN {
+ let mut inputs: [&[u8]; LEN] = [&[]; LEN];
+ for i in 0..LEN {
+ let chunks = (i + start_offset) % LEN;
+ inputs[i] = &input[..chunks * BLOCKBYTES];
+ }
+
+ let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new();
+ for i in 0..LEN {
+ let mut p = Params::new();
+ p.node_offset(i as u64);
+ if i % 2 == 1 {
+ p.last_node(true);
+ p.key(b"foo");
+ }
+ params.push(p);
+ }
+
+ let mut states: ArrayVec<[State; LEN]> = ArrayVec::new();
+ for i in 0..LEN {
+ states.push(params[i].to_state());
+ }
+
+ // Run each input twice through, to exercise buffering.
+ update_many(states.iter_mut().zip(inputs.iter()));
+ update_many(states.iter_mut().zip(inputs.iter()));
+
+ // Check the outputs.
+ for i in 0..LEN {
+ let mut reference_state = params[i].to_state();
+ // Again, run the input twice.
+ reference_state.update(inputs[i]);
+ reference_state.update(inputs[i]);
+ assert_eq!(reference_state.finalize(), states[i].finalize());
+ assert_eq!(2 * inputs[i].len() as Count, states[i].count());
+ }
+ }
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/portable.rs b/third_party/rust/blake2b_simd/src/portable.rs
new file mode 100644
index 0000000000..5a667ef2a8
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/portable.rs
@@ -0,0 +1,166 @@
+use arrayref::{array_ref, array_refs};
+
+use super::*;
+use crate::guts::{
+ count_high, count_low, final_block, flag_word, input_debug_asserts, Finalize, LastNode, Stride,
+};
+
+// G is the mixing function, called eight times per round in the compression
+// function. V is the 16-word state vector of the compression function, usually
+// described as a 4x4 matrix. A, B, C, and D are the mixing indices, set by the
+// caller first to the four columns of V, and then to its four diagonals. X and
+// Y are words of input, chosen by the caller according to the message
+// schedule, SIGMA.
+#[inline(always)]
+fn g(v: &mut [Word; 16], a: usize, b: usize, c: usize, d: usize, x: Word, y: Word) {
+ v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
+ v[d] = (v[d] ^ v[a]).rotate_right(32);
+ v[c] = v[c].wrapping_add(v[d]);
+ v[b] = (v[b] ^ v[c]).rotate_right(24);
+ v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
+ v[d] = (v[d] ^ v[a]).rotate_right(16);
+ v[c] = v[c].wrapping_add(v[d]);
+ v[b] = (v[b] ^ v[c]).rotate_right(63);
+}
+
+#[inline(always)]
+fn round(r: usize, m: &[Word; 16], v: &mut [Word; 16]) {
+ // Select the message schedule based on the round.
+ let s = SIGMA[r];
+
+ // Mix the columns.
+ g(v, 0, 4, 8, 12, m[s[0] as usize], m[s[1] as usize]);
+ g(v, 1, 5, 9, 13, m[s[2] as usize], m[s[3] as usize]);
+ g(v, 2, 6, 10, 14, m[s[4] as usize], m[s[5] as usize]);
+ g(v, 3, 7, 11, 15, m[s[6] as usize], m[s[7] as usize]);
+
+ // Mix the rows.
+ g(v, 0, 5, 10, 15, m[s[8] as usize], m[s[9] as usize]);
+ g(v, 1, 6, 11, 12, m[s[10] as usize], m[s[11] as usize]);
+ g(v, 2, 7, 8, 13, m[s[12] as usize], m[s[13] as usize]);
+ g(v, 3, 4, 9, 14, m[s[14] as usize], m[s[15] as usize]);
+}
+
+#[inline(always)]
+fn compress_block(
+ block: &[u8; BLOCKBYTES],
+ words: &mut [Word; 8],
+ count: Count,
+ last_block: Word,
+ last_node: Word,
+) {
+ // Initialize the compression state.
+ let mut v = [
+ words[0],
+ words[1],
+ words[2],
+ words[3],
+ words[4],
+ words[5],
+ words[6],
+ words[7],
+ IV[0],
+ IV[1],
+ IV[2],
+ IV[3],
+ IV[4] ^ count_low(count),
+ IV[5] ^ count_high(count),
+ IV[6] ^ last_block,
+ IV[7] ^ last_node,
+ ];
+
+ // Parse the message bytes as ints in little endian order.
+ const W: usize = size_of::<Word>();
+ let msg_refs = array_refs!(block, W, W, W, W, W, W, W, W, W, W, W, W, W, W, W, W);
+ let m = [
+ Word::from_le_bytes(*msg_refs.0),
+ Word::from_le_bytes(*msg_refs.1),
+ Word::from_le_bytes(*msg_refs.2),
+ Word::from_le_bytes(*msg_refs.3),
+ Word::from_le_bytes(*msg_refs.4),
+ Word::from_le_bytes(*msg_refs.5),
+ Word::from_le_bytes(*msg_refs.6),
+ Word::from_le_bytes(*msg_refs.7),
+ Word::from_le_bytes(*msg_refs.8),
+ Word::from_le_bytes(*msg_refs.9),
+ Word::from_le_bytes(*msg_refs.10),
+ Word::from_le_bytes(*msg_refs.11),
+ Word::from_le_bytes(*msg_refs.12),
+ Word::from_le_bytes(*msg_refs.13),
+ Word::from_le_bytes(*msg_refs.14),
+ Word::from_le_bytes(*msg_refs.15),
+ ];
+
+ round(0, &m, &mut v);
+ round(1, &m, &mut v);
+ round(2, &m, &mut v);
+ round(3, &m, &mut v);
+ round(4, &m, &mut v);
+ round(5, &m, &mut v);
+ round(6, &m, &mut v);
+ round(7, &m, &mut v);
+ round(8, &m, &mut v);
+ round(9, &m, &mut v);
+ round(10, &m, &mut v);
+ round(11, &m, &mut v);
+
+ words[0] ^= v[0] ^ v[8];
+ words[1] ^= v[1] ^ v[9];
+ words[2] ^= v[2] ^ v[10];
+ words[3] ^= v[3] ^ v[11];
+ words[4] ^= v[4] ^ v[12];
+ words[5] ^= v[5] ^ v[13];
+ words[6] ^= v[6] ^ v[14];
+ words[7] ^= v[7] ^ v[15];
+}
+
+pub fn compress1_loop(
+ input: &[u8],
+ words: &mut [Word; 8],
+ mut count: Count,
+ last_node: LastNode,
+ finalize: Finalize,
+ stride: Stride,
+) {
+ input_debug_asserts(input, finalize);
+
+ let mut local_words = *words;
+
+ let mut fin_offset = input.len().saturating_sub(1);
+ fin_offset -= fin_offset % stride.padded_blockbytes();
+ let mut buf = [0; BLOCKBYTES];
+ let (fin_block, fin_len, _) = final_block(input, fin_offset, &mut buf, stride);
+ let fin_last_block = flag_word(finalize.yes());
+ let fin_last_node = flag_word(finalize.yes() && last_node.yes());
+
+ let mut offset = 0;
+ loop {
+ let block;
+ let count_delta;
+ let last_block;
+ let last_node;
+ if offset == fin_offset {
+ block = fin_block;
+ count_delta = fin_len;
+ last_block = fin_last_block;
+ last_node = fin_last_node;
+ } else {
+ block = array_ref!(input, offset, BLOCKBYTES);
+ count_delta = BLOCKBYTES;
+ last_block = flag_word(false);
+ last_node = flag_word(false);
+ };
+
+ count = count.wrapping_add(count_delta as Count);
+ compress_block(block, &mut local_words, count, last_block, last_node);
+
+ // Check for termination before bumping the offset, to avoid overflow.
+ if offset == fin_offset {
+ break;
+ }
+
+ offset += stride.padded_blockbytes();
+ }
+
+ *words = local_words;
+}
diff --git a/third_party/rust/blake2b_simd/src/sse41.rs b/third_party/rust/blake2b_simd/src/sse41.rs
new file mode 100644
index 0000000000..3a55dc8c35
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/sse41.rs
@@ -0,0 +1,454 @@
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+use crate::guts::{
+ assemble_count, count_high, count_low, final_block, flag_word, input_debug_asserts, Finalize,
+ Job, Stride,
+};
+use crate::{Word, BLOCKBYTES, IV, SIGMA};
+use arrayref::{array_refs, mut_array_refs};
+use core::cmp;
+use core::mem;
+
+pub const DEGREE: usize = 2;
+
+#[inline(always)]
+unsafe fn loadu(src: *const [Word; DEGREE]) -> __m128i {
+ // This is an unaligned load, so the pointer cast is allowed.
+ _mm_loadu_si128(src as *const __m128i)
+}
+
+#[inline(always)]
+unsafe fn storeu(src: __m128i, dest: *mut [Word; DEGREE]) {
+ // This is an unaligned store, so the pointer cast is allowed.
+ _mm_storeu_si128(dest as *mut __m128i, src)
+}
+
+#[inline(always)]
+unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
+ _mm_add_epi64(a, b)
+}
+
+#[inline(always)]
+unsafe fn eq(a: __m128i, b: __m128i) -> __m128i {
+ _mm_cmpeq_epi64(a, b)
+}
+
+#[inline(always)]
+unsafe fn and(a: __m128i, b: __m128i) -> __m128i {
+ _mm_and_si128(a, b)
+}
+
+#[inline(always)]
+unsafe fn negate_and(a: __m128i, b: __m128i) -> __m128i {
+ // Note that "and not" implies the reverse of the actual arg order.
+ _mm_andnot_si128(a, b)
+}
+
+#[inline(always)]
+unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
+ _mm_xor_si128(a, b)
+}
+
+#[inline(always)]
+unsafe fn set1(x: u64) -> __m128i {
+ _mm_set1_epi64x(x as i64)
+}
+
+#[inline(always)]
+unsafe fn set2(a: u64, b: u64) -> __m128i {
+ // There's no _mm_setr_epi64x, so note the arg order is backwards.
+ _mm_set_epi64x(b as i64, a as i64)
+}
+
+// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
+macro_rules! _MM_SHUFFLE {
+ ($z:expr, $y:expr, $x:expr, $w:expr) => {
+ ($z << 6) | ($y << 4) | ($x << 2) | $w
+ };
+}
+
+#[inline(always)]
+unsafe fn rot32(x: __m128i) -> __m128i {
+ _mm_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1))
+}
+
+#[inline(always)]
+unsafe fn rot24(x: __m128i) -> __m128i {
+ let rotate24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
+ _mm_shuffle_epi8(x, rotate24)
+}
+
+#[inline(always)]
+unsafe fn rot16(x: __m128i) -> __m128i {
+ let rotate16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
+ _mm_shuffle_epi8(x, rotate16)
+}
+
+#[inline(always)]
+unsafe fn rot63(x: __m128i) -> __m128i {
+ _mm_or_si128(_mm_srli_epi64(x, 63), add(x, x))
+}
+
+#[inline(always)]
+unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
+ v[0] = add(v[0], m[SIGMA[r][0] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][2] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][4] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][6] as usize]);
+ v[0] = add(v[0], v[4]);
+ v[1] = add(v[1], v[5]);
+ v[2] = add(v[2], v[6]);
+ v[3] = add(v[3], v[7]);
+ v[12] = xor(v[12], v[0]);
+ v[13] = xor(v[13], v[1]);
+ v[14] = xor(v[14], v[2]);
+ v[15] = xor(v[15], v[3]);
+ v[12] = rot32(v[12]);
+ v[13] = rot32(v[13]);
+ v[14] = rot32(v[14]);
+ v[15] = rot32(v[15]);
+ v[8] = add(v[8], v[12]);
+ v[9] = add(v[9], v[13]);
+ v[10] = add(v[10], v[14]);
+ v[11] = add(v[11], v[15]);
+ v[4] = xor(v[4], v[8]);
+ v[5] = xor(v[5], v[9]);
+ v[6] = xor(v[6], v[10]);
+ v[7] = xor(v[7], v[11]);
+ v[4] = rot24(v[4]);
+ v[5] = rot24(v[5]);
+ v[6] = rot24(v[6]);
+ v[7] = rot24(v[7]);
+ v[0] = add(v[0], m[SIGMA[r][1] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][3] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][5] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][7] as usize]);
+ v[0] = add(v[0], v[4]);
+ v[1] = add(v[1], v[5]);
+ v[2] = add(v[2], v[6]);
+ v[3] = add(v[3], v[7]);
+ v[12] = xor(v[12], v[0]);
+ v[13] = xor(v[13], v[1]);
+ v[14] = xor(v[14], v[2]);
+ v[15] = xor(v[15], v[3]);
+ v[12] = rot16(v[12]);
+ v[13] = rot16(v[13]);
+ v[14] = rot16(v[14]);
+ v[15] = rot16(v[15]);
+ v[8] = add(v[8], v[12]);
+ v[9] = add(v[9], v[13]);
+ v[10] = add(v[10], v[14]);
+ v[11] = add(v[11], v[15]);
+ v[4] = xor(v[4], v[8]);
+ v[5] = xor(v[5], v[9]);
+ v[6] = xor(v[6], v[10]);
+ v[7] = xor(v[7], v[11]);
+ v[4] = rot63(v[4]);
+ v[5] = rot63(v[5]);
+ v[6] = rot63(v[6]);
+ v[7] = rot63(v[7]);
+
+ v[0] = add(v[0], m[SIGMA[r][8] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][10] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][12] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][14] as usize]);
+ v[0] = add(v[0], v[5]);
+ v[1] = add(v[1], v[6]);
+ v[2] = add(v[2], v[7]);
+ v[3] = add(v[3], v[4]);
+ v[15] = xor(v[15], v[0]);
+ v[12] = xor(v[12], v[1]);
+ v[13] = xor(v[13], v[2]);
+ v[14] = xor(v[14], v[3]);
+ v[15] = rot32(v[15]);
+ v[12] = rot32(v[12]);
+ v[13] = rot32(v[13]);
+ v[14] = rot32(v[14]);
+ v[10] = add(v[10], v[15]);
+ v[11] = add(v[11], v[12]);
+ v[8] = add(v[8], v[13]);
+ v[9] = add(v[9], v[14]);
+ v[5] = xor(v[5], v[10]);
+ v[6] = xor(v[6], v[11]);
+ v[7] = xor(v[7], v[8]);
+ v[4] = xor(v[4], v[9]);
+ v[5] = rot24(v[5]);
+ v[6] = rot24(v[6]);
+ v[7] = rot24(v[7]);
+ v[4] = rot24(v[4]);
+ v[0] = add(v[0], m[SIGMA[r][9] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][11] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][13] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][15] as usize]);
+ v[0] = add(v[0], v[5]);
+ v[1] = add(v[1], v[6]);
+ v[2] = add(v[2], v[7]);
+ v[3] = add(v[3], v[4]);
+ v[15] = xor(v[15], v[0]);
+ v[12] = xor(v[12], v[1]);
+ v[13] = xor(v[13], v[2]);
+ v[14] = xor(v[14], v[3]);
+ v[15] = rot16(v[15]);
+ v[12] = rot16(v[12]);
+ v[13] = rot16(v[13]);
+ v[14] = rot16(v[14]);
+ v[10] = add(v[10], v[15]);
+ v[11] = add(v[11], v[12]);
+ v[8] = add(v[8], v[13]);
+ v[9] = add(v[9], v[14]);
+ v[5] = xor(v[5], v[10]);
+ v[6] = xor(v[6], v[11]);
+ v[7] = xor(v[7], v[8]);
+ v[4] = xor(v[4], v[9]);
+ v[5] = rot63(v[5]);
+ v[6] = rot63(v[6]);
+ v[7] = rot63(v[7]);
+ v[4] = rot63(v[4]);
+}
+
+// We'd rather make this a regular function with #[inline(always)], but for
+// some reason that blows up compile times by about 10 seconds, at least in
+// some cases (BLAKE2b avx2.rs). This macro seems to get the same performance
+// result, without the compile time issue.
+macro_rules! compress2_transposed {
+ (
+ $h_vecs:expr,
+ $msg_vecs:expr,
+ $count_low:expr,
+ $count_high:expr,
+ $lastblock:expr,
+ $lastnode:expr,
+ ) => {
+ let h_vecs: &mut [__m128i; 8] = $h_vecs;
+ let msg_vecs: &[__m128i; 16] = $msg_vecs;
+ let count_low: __m128i = $count_low;
+ let count_high: __m128i = $count_high;
+ let lastblock: __m128i = $lastblock;
+ let lastnode: __m128i = $lastnode;
+ let mut v = [
+ h_vecs[0],
+ h_vecs[1],
+ h_vecs[2],
+ h_vecs[3],
+ h_vecs[4],
+ h_vecs[5],
+ h_vecs[6],
+ h_vecs[7],
+ set1(IV[0]),
+ set1(IV[1]),
+ set1(IV[2]),
+ set1(IV[3]),
+ xor(set1(IV[4]), count_low),
+ xor(set1(IV[5]), count_high),
+ xor(set1(IV[6]), lastblock),
+ xor(set1(IV[7]), lastnode),
+ ];
+
+ round(&mut v, &msg_vecs, 0);
+ round(&mut v, &msg_vecs, 1);
+ round(&mut v, &msg_vecs, 2);
+ round(&mut v, &msg_vecs, 3);
+ round(&mut v, &msg_vecs, 4);
+ round(&mut v, &msg_vecs, 5);
+ round(&mut v, &msg_vecs, 6);
+ round(&mut v, &msg_vecs, 7);
+ round(&mut v, &msg_vecs, 8);
+ round(&mut v, &msg_vecs, 9);
+ round(&mut v, &msg_vecs, 10);
+ round(&mut v, &msg_vecs, 11);
+
+ h_vecs[0] = xor(xor(h_vecs[0], v[0]), v[8]);
+ h_vecs[1] = xor(xor(h_vecs[1], v[1]), v[9]);
+ h_vecs[2] = xor(xor(h_vecs[2], v[2]), v[10]);
+ h_vecs[3] = xor(xor(h_vecs[3], v[3]), v[11]);
+ h_vecs[4] = xor(xor(h_vecs[4], v[4]), v[12]);
+ h_vecs[5] = xor(xor(h_vecs[5], v[5]), v[13]);
+ h_vecs[6] = xor(xor(h_vecs[6], v[6]), v[14]);
+ h_vecs[7] = xor(xor(h_vecs[7], v[7]), v[15]);
+ };
+}
+
+#[inline(always)]
+unsafe fn transpose_vecs(a: __m128i, b: __m128i) -> [__m128i; DEGREE] {
+ let a_words: [Word; DEGREE] = mem::transmute(a);
+ let b_words: [Word; DEGREE] = mem::transmute(b);
+ [set2(a_words[0], b_words[0]), set2(a_words[1], b_words[1])]
+}
+
+#[inline(always)]
+unsafe fn transpose_state_vecs(jobs: &[Job; DEGREE]) -> [__m128i; 8] {
+ // Load all the state words into transposed vectors, where the first vector
+ // has the first word of each state, etc. Transposing once at the beginning
+ // and once at the end is more efficient that repeating it for each block.
+ let words0 = array_refs!(&jobs[0].words, DEGREE, DEGREE, DEGREE, DEGREE);
+ let words1 = array_refs!(&jobs[1].words, DEGREE, DEGREE, DEGREE, DEGREE);
+ let [h0, h1] = transpose_vecs(loadu(words0.0), loadu(words1.0));
+ let [h2, h3] = transpose_vecs(loadu(words0.1), loadu(words1.1));
+ let [h4, h5] = transpose_vecs(loadu(words0.2), loadu(words1.2));
+ let [h6, h7] = transpose_vecs(loadu(words0.3), loadu(words1.3));
+ [h0, h1, h2, h3, h4, h5, h6, h7]
+}
+
+#[inline(always)]
+unsafe fn untranspose_state_vecs(h_vecs: &[__m128i; 8], jobs: &mut [Job; DEGREE]) {
+ // Un-transpose the updated state vectors back into the caller's arrays.
+ let [job0, job1] = jobs;
+ let words0 = mut_array_refs!(&mut job0.words, DEGREE, DEGREE, DEGREE, DEGREE);
+ let words1 = mut_array_refs!(&mut job1.words, DEGREE, DEGREE, DEGREE, DEGREE);
+
+ let out = transpose_vecs(h_vecs[0], h_vecs[1]);
+ storeu(out[0], words0.0);
+ storeu(out[1], words1.0);
+ let out = transpose_vecs(h_vecs[2], h_vecs[3]);
+ storeu(out[0], words0.1);
+ storeu(out[1], words1.1);
+ let out = transpose_vecs(h_vecs[4], h_vecs[5]);
+ storeu(out[0], words0.2);
+ storeu(out[1], words1.2);
+ let out = transpose_vecs(h_vecs[6], h_vecs[7]);
+ storeu(out[0], words0.3);
+ storeu(out[1], words1.3);
+}
+
+#[inline(always)]
+unsafe fn transpose_msg_vecs(blocks: [*const [u8; BLOCKBYTES]; DEGREE]) -> [__m128i; 16] {
+ // These input arrays have no particular alignment, so we use unaligned
+ // loads to read from them.
+ let block0 = blocks[0] as *const [Word; DEGREE];
+ let block1 = blocks[1] as *const [Word; DEGREE];
+ let [m0, m1] = transpose_vecs(loadu(block0.add(0)), loadu(block1.add(0)));
+ let [m2, m3] = transpose_vecs(loadu(block0.add(1)), loadu(block1.add(1)));
+ let [m4, m5] = transpose_vecs(loadu(block0.add(2)), loadu(block1.add(2)));
+ let [m6, m7] = transpose_vecs(loadu(block0.add(3)), loadu(block1.add(3)));
+ let [m8, m9] = transpose_vecs(loadu(block0.add(4)), loadu(block1.add(4)));
+ let [m10, m11] = transpose_vecs(loadu(block0.add(5)), loadu(block1.add(5)));
+ let [m12, m13] = transpose_vecs(loadu(block0.add(6)), loadu(block1.add(6)));
+ let [m14, m15] = transpose_vecs(loadu(block0.add(7)), loadu(block1.add(7)));
+ [
+ m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15,
+ ]
+}
+
+#[inline(always)]
+unsafe fn load_counts(jobs: &[Job; DEGREE]) -> (__m128i, __m128i) {
+ (
+ set2(count_low(jobs[0].count), count_low(jobs[1].count)),
+ set2(count_high(jobs[0].count), count_high(jobs[1].count)),
+ )
+}
+
+#[inline(always)]
+unsafe fn store_counts(jobs: &mut [Job; DEGREE], low: __m128i, high: __m128i) {
+ let low_ints: [Word; DEGREE] = mem::transmute(low);
+ let high_ints: [Word; DEGREE] = mem::transmute(high);
+ for i in 0..DEGREE {
+ jobs[i].count = assemble_count(low_ints[i], high_ints[i]);
+ }
+}
+
+#[inline(always)]
+unsafe fn add_to_counts(lo: &mut __m128i, hi: &mut __m128i, delta: __m128i) {
+ // If the low counts reach zero, that means they wrapped, unless the delta
+ // was also zero.
+ *lo = add(*lo, delta);
+ let lo_reached_zero = eq(*lo, set1(0));
+ let delta_was_zero = eq(delta, set1(0));
+ let hi_inc = and(set1(1), negate_and(delta_was_zero, lo_reached_zero));
+ *hi = add(*hi, hi_inc);
+}
+
+#[inline(always)]
+unsafe fn flags_vec(flags: [bool; DEGREE]) -> __m128i {
+ set2(flag_word(flags[0]), flag_word(flags[1]))
+}
+
+#[target_feature(enable = "sse4.1")]
+pub unsafe fn compress2_loop(jobs: &mut [Job; DEGREE], finalize: Finalize, stride: Stride) {
+ // If we're not finalizing, there can't be a partial block at the end.
+ for job in jobs.iter() {
+ input_debug_asserts(job.input, finalize);
+ }
+
+ let msg_ptrs = [jobs[0].input.as_ptr(), jobs[1].input.as_ptr()];
+ let mut h_vecs = transpose_state_vecs(&jobs);
+ let (mut counts_lo, mut counts_hi) = load_counts(&jobs);
+
+ // Prepare the final blocks (note, which could be empty if the input is
+ // empty). Do all this before entering the main loop.
+ let min_len = jobs.iter().map(|job| job.input.len()).min().unwrap();
+ let mut fin_offset = min_len.saturating_sub(1);
+ fin_offset -= fin_offset % stride.padded_blockbytes();
+ // Performance note, making these buffers mem::uninitialized() seems to
+ // cause problems in the optimizer.
+ let mut buf0: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let mut buf1: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let (block0, len0, finalize0) = final_block(jobs[0].input, fin_offset, &mut buf0, stride);
+ let (block1, len1, finalize1) = final_block(jobs[1].input, fin_offset, &mut buf1, stride);
+ let fin_blocks: [*const [u8; BLOCKBYTES]; DEGREE] = [block0, block1];
+ let fin_counts_delta = set2(len0 as Word, len1 as Word);
+ let fin_last_block;
+ let fin_last_node;
+ if finalize.yes() {
+ fin_last_block = flags_vec([finalize0, finalize1]);
+ fin_last_node = flags_vec([
+ finalize0 && jobs[0].last_node.yes(),
+ finalize1 && jobs[1].last_node.yes(),
+ ]);
+ } else {
+ fin_last_block = set1(0);
+ fin_last_node = set1(0);
+ }
+
+ // The main loop.
+ let mut offset = 0;
+ loop {
+ let blocks;
+ let counts_delta;
+ let last_block;
+ let last_node;
+ if offset == fin_offset {
+ blocks = fin_blocks;
+ counts_delta = fin_counts_delta;
+ last_block = fin_last_block;
+ last_node = fin_last_node;
+ } else {
+ blocks = [
+ msg_ptrs[0].add(offset) as *const [u8; BLOCKBYTES],
+ msg_ptrs[1].add(offset) as *const [u8; BLOCKBYTES],
+ ];
+ counts_delta = set1(BLOCKBYTES as Word);
+ last_block = set1(0);
+ last_node = set1(0);
+ };
+
+ let m_vecs = transpose_msg_vecs(blocks);
+ add_to_counts(&mut counts_lo, &mut counts_hi, counts_delta);
+ compress2_transposed!(
+ &mut h_vecs,
+ &m_vecs,
+ counts_lo,
+ counts_hi,
+ last_block,
+ last_node,
+ );
+
+ // Check for termination before bumping the offset, to avoid overflow.
+ if offset == fin_offset {
+ break;
+ }
+
+ offset += stride.padded_blockbytes();
+ }
+
+ // Write out the results.
+ untranspose_state_vecs(&h_vecs, &mut *jobs);
+ store_counts(&mut *jobs, counts_lo, counts_hi);
+ let max_consumed = offset.saturating_add(stride.padded_blockbytes());
+ for job in jobs.iter_mut() {
+ let consumed = cmp::min(max_consumed, job.input.len());
+ job.input = &job.input[consumed..];
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/test.rs b/third_party/rust/blake2b_simd/src/test.rs
new file mode 100644
index 0000000000..9ca8e87751
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/test.rs
@@ -0,0 +1,201 @@
+use super::*;
+
+const EMPTY_HASH: &str = "786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419\
+ d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce";
+const ABC_HASH: &str = "ba80a53f981c4d0d6a2797b69f12f6e94c212f14685ac4b74b12bb6fdbffa2d1\
+ 7d87c5392aab792dc252d5de4533cc9518d38aa8dbf1925ab92386edd4009923";
+const ONE_BLOCK_HASH: &str = "865939e120e6805438478841afb739ae4250cf372653078a065cdcfffca4caf7\
+ 98e6d462b65d658fc165782640eded70963449ae1500fb0f24981d7727e22c41";
+const THOUSAND_HASH: &str = "1ee4e51ecab5210a518f26150e882627ec839967f19d763e1508b12cfefed148\
+ 58f6a1c9d1f969bc224dc9440f5a6955277e755b9c513f9ba4421c5e50c8d787";
+
+#[test]
+fn test_update_state() {
+ let io = &[
+ (&b""[..], EMPTY_HASH),
+ (&b"abc"[..], ABC_HASH),
+ (&[0; BLOCKBYTES], ONE_BLOCK_HASH),
+ (&[0; 1000], THOUSAND_HASH),
+ ];
+ // Test each input all at once.
+ for &(input, output) in io {
+ let hash = blake2b(input);
+ assert_eq!(&hash.to_hex(), output, "hash mismatch");
+ }
+ // Now in two chunks. This is especially important for the ONE_BLOCK case, because it would be
+ // a mistake for update() to call compress, even though the buffer is full.
+ for &(input, output) in io {
+ let mut state = State::new();
+ let split = input.len() / 2;
+ state.update(&input[..split]);
+ assert_eq!(split as Count, state.count());
+ state.update(&input[split..]);
+ assert_eq!(input.len() as Count, state.count());
+ let hash = state.finalize();
+ assert_eq!(&hash.to_hex(), output, "hash mismatch");
+ }
+ // Now one byte at a time.
+ for &(input, output) in io {
+ let mut state = State::new();
+ let mut count = 0;
+ for &b in input {
+ state.update(&[b]);
+ count += 1;
+ assert_eq!(count, state.count());
+ }
+ let hash = state.finalize();
+ assert_eq!(&hash.to_hex(), output, "hash mismatch");
+ }
+}
+
+#[test]
+fn test_multiple_finalizes() {
+ let mut state = State::new();
+ assert_eq!(&state.finalize().to_hex(), EMPTY_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), EMPTY_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), EMPTY_HASH, "hash mismatch");
+ state.update(b"abc");
+ assert_eq!(&state.finalize().to_hex(), ABC_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), ABC_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), ABC_HASH, "hash mismatch");
+}
+
+#[cfg(feature = "std")]
+#[test]
+fn test_write() {
+ use std::io::prelude::*;
+
+ let mut state = State::new();
+ state.write_all(&[0; 1000]).unwrap();
+ let hash = state.finalize();
+ assert_eq!(&hash.to_hex(), THOUSAND_HASH, "hash mismatch");
+}
+
+// You can check this case against the equivalent Python:
+//
+// import hashlib
+// hashlib.blake2b(
+// b'foo',
+// digest_size=18,
+// key=b"bar",
+// salt=b"bazbazbazbazbazb",
+// person=b"bing bing bing b",
+// fanout=2,
+// depth=3,
+// leaf_size=0x04050607,
+// node_offset=0x08090a0b0c0d0e0f,
+// node_depth=16,
+// inner_size=17,
+// last_node=True,
+// ).hexdigest()
+#[test]
+fn test_all_parameters() {
+ let mut params = Params::new();
+ params
+ .hash_length(18)
+ // Make sure a shorter key properly overwrites a longer one.
+ .key(b"not the real key")
+ .key(b"bar")
+ .salt(b"bazbazbazbazbazb")
+ .personal(b"bing bing bing b")
+ .fanout(2)
+ .max_depth(3)
+ .max_leaf_length(0x04050607)
+ .node_offset(0x08090a0b0c0d0e0f)
+ .node_depth(16)
+ .inner_hash_length(17)
+ .last_node(true);
+
+ // Check the State API.
+ assert_eq!(
+ "ec0f59cb65f92e7fcca1280ba859a6925ded",
+ &params.to_state().update(b"foo").finalize().to_hex()
+ );
+
+ // Check the all-at-once API.
+ assert_eq!(
+ "ec0f59cb65f92e7fcca1280ba859a6925ded",
+ &params.hash(b"foo").to_hex()
+ );
+}
+
+#[test]
+fn test_all_parameters_blake2bp() {
+ let mut params = blake2bp::Params::new();
+ params
+ .hash_length(18)
+ // Make sure a shorter key properly overwrites a longer one.
+ .key(b"not the real key")
+ .key(b"bar");
+
+ // Check the State API.
+ assert_eq!(
+ "8c54e888a8a01c63da6585c058fe54ea81df",
+ &params.to_state().update(b"foo").finalize().to_hex()
+ );
+
+ // Check the all-at-once API.
+ assert_eq!(
+ "8c54e888a8a01c63da6585c058fe54ea81df",
+ &params.hash(b"foo").to_hex()
+ );
+}
+
+#[test]
+#[should_panic]
+fn test_short_hash_length_panics() {
+ Params::new().hash_length(0);
+}
+
+#[test]
+#[should_panic]
+fn test_long_hash_length_panics() {
+ Params::new().hash_length(OUTBYTES + 1);
+}
+
+#[test]
+#[should_panic]
+fn test_long_key_panics() {
+ Params::new().key(&[0; KEYBYTES + 1]);
+}
+
+#[test]
+#[should_panic]
+fn test_long_salt_panics() {
+ Params::new().salt(&[0; SALTBYTES + 1]);
+}
+
+#[test]
+#[should_panic]
+fn test_long_personal_panics() {
+ Params::new().personal(&[0; PERSONALBYTES + 1]);
+}
+
+#[test]
+fn test_zero_max_depth_supported() {
+ Params::new().max_depth(0);
+}
+
+#[test]
+#[should_panic]
+fn test_long_inner_hash_length_panics() {
+ Params::new().inner_hash_length(OUTBYTES + 1);
+}
+
+#[test]
+#[should_panic]
+fn test_blake2bp_short_hash_length_panics() {
+ blake2bp::Params::new().hash_length(0);
+}
+
+#[test]
+#[should_panic]
+fn test_blake2bp_long_hash_length_panics() {
+ blake2bp::Params::new().hash_length(OUTBYTES + 1);
+}
+
+#[test]
+#[should_panic]
+fn test_blake2bp_long_key_panics() {
+ blake2bp::Params::new().key(&[0; KEYBYTES + 1]);
+}