path: root/third_party/rust/blake2b_simd/src
author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
commit    2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree      b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/blake2b_simd/src
parent    Initial commit. (diff)
Adding upstream version 86.0.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/blake2b_simd/src')
-rw-r--r--  third_party/rust/blake2b_simd/src/avx2.rs      928
-rw-r--r--  third_party/rust/blake2b_simd/src/blake2bp.rs  570
-rw-r--r--  third_party/rust/blake2b_simd/src/guts.rs      565
-rw-r--r--  third_party/rust/blake2b_simd/src/lib.rs       674
-rw-r--r--  third_party/rust/blake2b_simd/src/many.rs      529
-rw-r--r--  third_party/rust/blake2b_simd/src/portable.rs  166
-rw-r--r--  third_party/rust/blake2b_simd/src/sse41.rs     454
-rw-r--r--  third_party/rust/blake2b_simd/src/test.rs      201
8 files changed, 4087 insertions, 0 deletions
diff --git a/third_party/rust/blake2b_simd/src/avx2.rs b/third_party/rust/blake2b_simd/src/avx2.rs
new file mode 100644
index 0000000000..268dd82d36
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/avx2.rs
@@ -0,0 +1,928 @@
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+use crate::guts::{
+ assemble_count, count_high, count_low, final_block, flag_word, input_debug_asserts, Finalize,
+ Job, LastNode, Stride,
+};
+use crate::{Count, Word, BLOCKBYTES, IV, SIGMA};
+use arrayref::{array_refs, mut_array_refs};
+use core::cmp;
+use core::mem;
+
+pub const DEGREE: usize = 4;
+
+#[inline(always)]
+unsafe fn loadu(src: *const [Word; DEGREE]) -> __m256i {
+ // This is an unaligned load, so the pointer cast is allowed.
+ _mm256_loadu_si256(src as *const __m256i)
+}
+
+#[inline(always)]
+unsafe fn storeu(src: __m256i, dest: *mut [Word; DEGREE]) {
+ // This is an unaligned store, so the pointer cast is allowed.
+ _mm256_storeu_si256(dest as *mut __m256i, src)
+}
+
+#[inline(always)]
+unsafe fn loadu_128(mem_addr: &[u8; 16]) -> __m128i {
+ _mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
+}
+
+#[inline(always)]
+unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
+ _mm256_add_epi64(a, b)
+}
+
+#[inline(always)]
+unsafe fn eq(a: __m256i, b: __m256i) -> __m256i {
+ _mm256_cmpeq_epi64(a, b)
+}
+
+#[inline(always)]
+unsafe fn and(a: __m256i, b: __m256i) -> __m256i {
+ _mm256_and_si256(a, b)
+}
+
+#[inline(always)]
+unsafe fn negate_and(a: __m256i, b: __m256i) -> __m256i {
+ // Note that "and not" implies the reverse of the actual arg order.
+ _mm256_andnot_si256(a, b)
+}
+
+#[inline(always)]
+unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
+ _mm256_xor_si256(a, b)
+}
+
+#[inline(always)]
+unsafe fn set1(x: u64) -> __m256i {
+ _mm256_set1_epi64x(x as i64)
+}
+
+#[inline(always)]
+unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
+ _mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
+}
+
+// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
+macro_rules! _MM_SHUFFLE {
+ ($z:expr, $y:expr, $x:expr, $w:expr) => {
+ ($z << 6) | ($y << 4) | ($x << 2) | $w
+ };
+}
+
+#[inline(always)]
+unsafe fn rot32(x: __m256i) -> __m256i {
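+ // Swapping the two 32-bit halves of each 64-bit lane rotates that lane by 32 bits.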
+ _mm256_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1))
+}
+
+#[inline(always)]
+unsafe fn rot24(x: __m256i) -> __m256i {
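+ // A rotation by a multiple of 8 bits is a byte permutation, so it can be done with
+ // one byte shuffle per 128-bit lane. The same trick applies to rot16 below.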
+ let rotate24 = _mm256_setr_epi8(
+ 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13,
+ 14, 15, 8, 9, 10,
+ );
+ _mm256_shuffle_epi8(x, rotate24)
+}
+
+#[inline(always)]
+unsafe fn rot16(x: __m256i) -> __m256i {
+ let rotate16 = _mm256_setr_epi8(
+ 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12,
+ 13, 14, 15, 8, 9,
+ );
+ _mm256_shuffle_epi8(x, rotate16)
+}
+
+#[inline(always)]
+unsafe fn rot63(x: __m256i) -> __m256i {
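+ // AVX2 has no 64-bit rotate instruction, so rotating right by 63 is built from a
+ // shift right by 63 OR'd with a shift left by 1 (computed here as x + x).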
+ _mm256_or_si256(_mm256_srli_epi64(x, 63), add(x, x))
+}
+
+#[inline(always)]
+unsafe fn g1(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
+ *a = add(*a, *m);
+ *a = add(*a, *b);
+ *d = xor(*d, *a);
+ *d = rot32(*d);
+ *c = add(*c, *d);
+ *b = xor(*b, *c);
+ *b = rot24(*b);
+}
+
+#[inline(always)]
+unsafe fn g2(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
+ *a = add(*a, *m);
+ *a = add(*a, *b);
+ *d = xor(*d, *a);
+ *d = rot16(*d);
+ *c = add(*c, *d);
+ *b = xor(*b, *c);
+ *b = rot63(*b);
+}
+
+// Note the optimization here of leaving b as the unrotated row, rather than a.
+// All the message loads below are adjusted to compensate for this. See
+// discussion at https://github.com/sneves/blake2-avx2/pull/4
+#[inline(always)]
+unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
+ *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
+ *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
+ *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
+}
+
+#[inline(always)]
+unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
+ *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
+ *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
+ *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
+}
+
+#[inline(always)]
+unsafe fn compress_block(
+ block: &[u8; BLOCKBYTES],
+ words: &mut [Word; 8],
+ count: Count,
+ last_block: Word,
+ last_node: Word,
+) {
+ let (words_low, words_high) = mut_array_refs!(words, DEGREE, DEGREE);
+ let (iv_low, iv_high) = array_refs!(&IV, DEGREE, DEGREE);
+ let mut a = loadu(words_low);
+ let mut b = loadu(words_high);
+ let mut c = loadu(iv_low);
+ let flags = set4(count_low(count), count_high(count), last_block, last_node);
+ let mut d = xor(loadu(iv_high), flags);
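+ // At this point the four rows hold: a = h[0..4], b = h[4..8], c = IV[0..4], and
+ // d = IV[4..8] ^ (count_low, count_high, last_block, last_node).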
+
+ let msg_chunks = array_refs!(block, 16, 16, 16, 16, 16, 16, 16, 16);
+ let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
+ let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
+ let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
+ let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
+ let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
+ let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
+ let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
+ let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));
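+ // Each 16-byte message chunk is broadcast into both 128-bit lanes, so the
+ // blend/unpack/alignr sequences in each round can assemble the scheduled words.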
+
+ let iv0 = a;
+ let iv1 = b;
+ let mut t0;
+ let mut t1;
+ let mut b0;
+
+ // round 1
+ t0 = _mm256_unpacklo_epi64(m0, m1);
+ t1 = _mm256_unpacklo_epi64(m2, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m0, m1);
+ t1 = _mm256_unpackhi_epi64(m2, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpacklo_epi64(m7, m4);
+ t1 = _mm256_unpacklo_epi64(m5, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m7, m4);
+ t1 = _mm256_unpackhi_epi64(m5, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 2
+ t0 = _mm256_unpacklo_epi64(m7, m2);
+ t1 = _mm256_unpackhi_epi64(m4, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m5, m4);
+ t1 = _mm256_alignr_epi8(m3, m7, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpackhi_epi64(m2, m0);
+ t1 = _mm256_blend_epi32(m5, m0, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_alignr_epi8(m6, m1, 8);
+ t1 = _mm256_blend_epi32(m3, m1, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 3
+ t0 = _mm256_alignr_epi8(m6, m5, 8);
+ t1 = _mm256_unpackhi_epi64(m2, m7);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m4, m0);
+ t1 = _mm256_blend_epi32(m6, m1, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m5, m4, 8);
+ t1 = _mm256_unpackhi_epi64(m1, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m2, m7);
+ t1 = _mm256_blend_epi32(m0, m3, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 4
+ t0 = _mm256_unpackhi_epi64(m3, m1);
+ t1 = _mm256_unpackhi_epi64(m6, m5);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m4, m0);
+ t1 = _mm256_unpacklo_epi64(m6, m7);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m1, m7, 8);
+ t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m4, m3);
+ t1 = _mm256_unpacklo_epi64(m5, m0);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 5
+ t0 = _mm256_unpackhi_epi64(m4, m2);
+ t1 = _mm256_unpacklo_epi64(m1, m5);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_blend_epi32(m3, m0, 0x33);
+ t1 = _mm256_blend_epi32(m7, m2, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m7, m1, 8);
+ t1 = _mm256_alignr_epi8(m3, m5, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m6, m0);
+ t1 = _mm256_unpacklo_epi64(m6, m4);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 6
+ t0 = _mm256_unpacklo_epi64(m1, m3);
+ t1 = _mm256_unpacklo_epi64(m0, m4);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m6, m5);
+ t1 = _mm256_unpackhi_epi64(m5, m1);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m2, m0, 8);
+ t1 = _mm256_unpackhi_epi64(m3, m7);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m4, m6);
+ t1 = _mm256_alignr_epi8(m7, m2, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 7
+ t0 = _mm256_blend_epi32(m0, m6, 0x33);
+ t1 = _mm256_unpacklo_epi64(m7, m2);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m2, m7);
+ t1 = _mm256_alignr_epi8(m5, m6, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpacklo_epi64(m4, m0);
+ t1 = _mm256_blend_epi32(m4, m3, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m5, m3);
+ t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 8
+ t0 = _mm256_unpackhi_epi64(m6, m3);
+ t1 = _mm256_blend_epi32(m1, m6, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_alignr_epi8(m7, m5, 8);
+ t1 = _mm256_unpackhi_epi64(m0, m4);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_blend_epi32(m2, m1, 0x33);
+ t1 = _mm256_alignr_epi8(m4, m7, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m5, m0);
+ t1 = _mm256_unpacklo_epi64(m2, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 9
+ t0 = _mm256_unpacklo_epi64(m3, m7);
+ t1 = _mm256_alignr_epi8(m0, m5, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m7, m4);
+ t1 = _mm256_alignr_epi8(m4, m1, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpacklo_epi64(m5, m6);
+ t1 = _mm256_unpackhi_epi64(m6, m0);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_alignr_epi8(m1, m2, 8);
+ t1 = _mm256_alignr_epi8(m2, m3, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 10
+ t0 = _mm256_unpacklo_epi64(m5, m4);
+ t1 = _mm256_unpackhi_epi64(m3, m0);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m1, m2);
+ t1 = _mm256_blend_epi32(m2, m3, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpackhi_epi64(m6, m7);
+ t1 = _mm256_unpackhi_epi64(m4, m1);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_blend_epi32(m5, m0, 0x33);
+ t1 = _mm256_unpacklo_epi64(m7, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 11
+ t0 = _mm256_unpacklo_epi64(m0, m1);
+ t1 = _mm256_unpacklo_epi64(m2, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m0, m1);
+ t1 = _mm256_unpackhi_epi64(m2, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpacklo_epi64(m7, m4);
+ t1 = _mm256_unpacklo_epi64(m5, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m7, m4);
+ t1 = _mm256_unpackhi_epi64(m5, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ // round 12
+ t0 = _mm256_unpacklo_epi64(m7, m2);
+ t1 = _mm256_unpackhi_epi64(m4, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m5, m4);
+ t1 = _mm256_alignr_epi8(m3, m7, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpackhi_epi64(m2, m0);
+ t1 = _mm256_blend_epi32(m5, m0, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_alignr_epi8(m6, m1, 8);
+ t1 = _mm256_blend_epi32(m3, m1, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+
+ a = xor(a, c);
+ b = xor(b, d);
+ a = xor(a, iv0);
+ b = xor(b, iv1);
+
+ storeu(a, words_low);
+ storeu(b, words_high);
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn compress1_loop(
+ input: &[u8],
+ words: &mut [Word; 8],
+ mut count: Count,
+ last_node: LastNode,
+ finalize: Finalize,
+ stride: Stride,
+) {
+ input_debug_asserts(input, finalize);
+
+ let mut local_words = *words;
+
+ let mut fin_offset = input.len().saturating_sub(1);
+ fin_offset -= fin_offset % stride.padded_blockbytes();
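+ // fin_offset is where the final (possibly partial) block starts: saturating_sub(1)
+ // handles empty input, and rounding down keeps it a multiple of the stride.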
+ let mut buf = [0; BLOCKBYTES];
+ let (fin_block, fin_len, _) = final_block(input, fin_offset, &mut buf, stride);
+ let fin_last_block = flag_word(finalize.yes());
+ let fin_last_node = flag_word(finalize.yes() && last_node.yes());
+
+ let mut offset = 0;
+ loop {
+ let block;
+ let count_delta;
+ let last_block;
+ let last_node;
+ if offset == fin_offset {
+ block = fin_block;
+ count_delta = fin_len;
+ last_block = fin_last_block;
+ last_node = fin_last_node;
+ } else {
+ // This unsafe cast avoids bounds checks. There's guaranteed to be
+ // enough input because `offset < fin_offset`.
+ block = &*(input.as_ptr().add(offset) as *const [u8; BLOCKBYTES]);
+ count_delta = BLOCKBYTES;
+ last_block = flag_word(false);
+ last_node = flag_word(false);
+ };
+
+ count = count.wrapping_add(count_delta as Count);
+ compress_block(block, &mut local_words, count, last_block, last_node);
+
+ // Check for termination before bumping the offset, to avoid overflow.
+ if offset == fin_offset {
+ break;
+ }
+
+ offset += stride.padded_blockbytes();
+ }
+
+ *words = local_words;
+}
+
+// Performance note: Factoring out a G function here doesn't hurt performance,
+// unlike in the case of BLAKE2s where it hurts substantially. In fact, on my
+// machine, it helps a tiny bit. But the difference is tiny, so I'm going to
+// stick to the approach used by https://github.com/sneves/blake2-avx2
+// until/unless I can be sure the (tiny) improvement is consistent across
+// different Intel microarchitectures. Smaller code size is nice, but a
+// divergence between the BLAKE2b and BLAKE2s implementations is less nice.
+#[inline(always)]
+unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) {
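+ // Column step: apply G to the four columns of the 4x4 state. Each vector lane
+ // carries an independent hash instance.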
+ v[0] = add(v[0], m[SIGMA[r][0] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][2] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][4] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][6] as usize]);
+ v[0] = add(v[0], v[4]);
+ v[1] = add(v[1], v[5]);
+ v[2] = add(v[2], v[6]);
+ v[3] = add(v[3], v[7]);
+ v[12] = xor(v[12], v[0]);
+ v[13] = xor(v[13], v[1]);
+ v[14] = xor(v[14], v[2]);
+ v[15] = xor(v[15], v[3]);
+ v[12] = rot32(v[12]);
+ v[13] = rot32(v[13]);
+ v[14] = rot32(v[14]);
+ v[15] = rot32(v[15]);
+ v[8] = add(v[8], v[12]);
+ v[9] = add(v[9], v[13]);
+ v[10] = add(v[10], v[14]);
+ v[11] = add(v[11], v[15]);
+ v[4] = xor(v[4], v[8]);
+ v[5] = xor(v[5], v[9]);
+ v[6] = xor(v[6], v[10]);
+ v[7] = xor(v[7], v[11]);
+ v[4] = rot24(v[4]);
+ v[5] = rot24(v[5]);
+ v[6] = rot24(v[6]);
+ v[7] = rot24(v[7]);
+ v[0] = add(v[0], m[SIGMA[r][1] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][3] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][5] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][7] as usize]);
+ v[0] = add(v[0], v[4]);
+ v[1] = add(v[1], v[5]);
+ v[2] = add(v[2], v[6]);
+ v[3] = add(v[3], v[7]);
+ v[12] = xor(v[12], v[0]);
+ v[13] = xor(v[13], v[1]);
+ v[14] = xor(v[14], v[2]);
+ v[15] = xor(v[15], v[3]);
+ v[12] = rot16(v[12]);
+ v[13] = rot16(v[13]);
+ v[14] = rot16(v[14]);
+ v[15] = rot16(v[15]);
+ v[8] = add(v[8], v[12]);
+ v[9] = add(v[9], v[13]);
+ v[10] = add(v[10], v[14]);
+ v[11] = add(v[11], v[15]);
+ v[4] = xor(v[4], v[8]);
+ v[5] = xor(v[5], v[9]);
+ v[6] = xor(v[6], v[10]);
+ v[7] = xor(v[7], v[11]);
+ v[4] = rot63(v[4]);
+ v[5] = rot63(v[5]);
+ v[6] = rot63(v[6]);
+ v[7] = rot63(v[7]);
+
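+ // Diagonal step: apply G to the four diagonals of the 4x4 state.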
+ v[0] = add(v[0], m[SIGMA[r][8] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][10] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][12] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][14] as usize]);
+ v[0] = add(v[0], v[5]);
+ v[1] = add(v[1], v[6]);
+ v[2] = add(v[2], v[7]);
+ v[3] = add(v[3], v[4]);
+ v[15] = xor(v[15], v[0]);
+ v[12] = xor(v[12], v[1]);
+ v[13] = xor(v[13], v[2]);
+ v[14] = xor(v[14], v[3]);
+ v[15] = rot32(v[15]);
+ v[12] = rot32(v[12]);
+ v[13] = rot32(v[13]);
+ v[14] = rot32(v[14]);
+ v[10] = add(v[10], v[15]);
+ v[11] = add(v[11], v[12]);
+ v[8] = add(v[8], v[13]);
+ v[9] = add(v[9], v[14]);
+ v[5] = xor(v[5], v[10]);
+ v[6] = xor(v[6], v[11]);
+ v[7] = xor(v[7], v[8]);
+ v[4] = xor(v[4], v[9]);
+ v[5] = rot24(v[5]);
+ v[6] = rot24(v[6]);
+ v[7] = rot24(v[7]);
+ v[4] = rot24(v[4]);
+ v[0] = add(v[0], m[SIGMA[r][9] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][11] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][13] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][15] as usize]);
+ v[0] = add(v[0], v[5]);
+ v[1] = add(v[1], v[6]);
+ v[2] = add(v[2], v[7]);
+ v[3] = add(v[3], v[4]);
+ v[15] = xor(v[15], v[0]);
+ v[12] = xor(v[12], v[1]);
+ v[13] = xor(v[13], v[2]);
+ v[14] = xor(v[14], v[3]);
+ v[15] = rot16(v[15]);
+ v[12] = rot16(v[12]);
+ v[13] = rot16(v[13]);
+ v[14] = rot16(v[14]);
+ v[10] = add(v[10], v[15]);
+ v[11] = add(v[11], v[12]);
+ v[8] = add(v[8], v[13]);
+ v[9] = add(v[9], v[14]);
+ v[5] = xor(v[5], v[10]);
+ v[6] = xor(v[6], v[11]);
+ v[7] = xor(v[7], v[8]);
+ v[4] = xor(v[4], v[9]);
+ v[5] = rot63(v[5]);
+ v[6] = rot63(v[6]);
+ v[7] = rot63(v[7]);
+ v[4] = rot63(v[4]);
+}
+
+// We'd rather make this a regular function with #[inline(always)], but for
+// some reason that blows up compile times by about 10 seconds, at least in
+// some cases (BLAKE2b avx2.rs). This macro seems to get the same performance
+// result, without the compile time issue.
+macro_rules! compress4_transposed {
+ (
+ $h_vecs:expr,
+ $msg_vecs:expr,
+ $count_low:expr,
+ $count_high:expr,
+ $lastblock:expr,
+ $lastnode:expr,
+ ) => {
+ let h_vecs: &mut [__m256i; 8] = $h_vecs;
+ let msg_vecs: &[__m256i; 16] = $msg_vecs;
+ let count_low: __m256i = $count_low;
+ let count_high: __m256i = $count_high;
+ let lastblock: __m256i = $lastblock;
+ let lastnode: __m256i = $lastnode;
+
+ let mut v = [
+ h_vecs[0],
+ h_vecs[1],
+ h_vecs[2],
+ h_vecs[3],
+ h_vecs[4],
+ h_vecs[5],
+ h_vecs[6],
+ h_vecs[7],
+ set1(IV[0]),
+ set1(IV[1]),
+ set1(IV[2]),
+ set1(IV[3]),
+ xor(set1(IV[4]), count_low),
+ xor(set1(IV[5]), count_high),
+ xor(set1(IV[6]), lastblock),
+ xor(set1(IV[7]), lastnode),
+ ];
+
+ round(&mut v, &msg_vecs, 0);
+ round(&mut v, &msg_vecs, 1);
+ round(&mut v, &msg_vecs, 2);
+ round(&mut v, &msg_vecs, 3);
+ round(&mut v, &msg_vecs, 4);
+ round(&mut v, &msg_vecs, 5);
+ round(&mut v, &msg_vecs, 6);
+ round(&mut v, &msg_vecs, 7);
+ round(&mut v, &msg_vecs, 8);
+ round(&mut v, &msg_vecs, 9);
+ round(&mut v, &msg_vecs, 10);
+ round(&mut v, &msg_vecs, 11);
+
+ h_vecs[0] = xor(xor(h_vecs[0], v[0]), v[8]);
+ h_vecs[1] = xor(xor(h_vecs[1], v[1]), v[9]);
+ h_vecs[2] = xor(xor(h_vecs[2], v[2]), v[10]);
+ h_vecs[3] = xor(xor(h_vecs[3], v[3]), v[11]);
+ h_vecs[4] = xor(xor(h_vecs[4], v[4]), v[12]);
+ h_vecs[5] = xor(xor(h_vecs[5], v[5]), v[13]);
+ h_vecs[6] = xor(xor(h_vecs[6], v[6]), v[14]);
+ h_vecs[7] = xor(xor(h_vecs[7], v[7]), v[15]);
+ };
+}
+
+#[inline(always)]
+unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
+ (
+ _mm256_permute2x128_si256(a, b, 0x20),
+ _mm256_permute2x128_si256(a, b, 0x31),
+ )
+}
+
+// There are several ways to do a transposition. We could do it naively, with 8 separate
+// _mm256_set_epi64x instructions, referencing each of the 64 words explicitly. Or we could copy
+// the vecs into contiguous storage and then use gather instructions. The third approach, used here, is
+// to interleave the vectors with a series of unpack instructions. In my benchmarks, interleaving is the
+// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_4` in the
+// https://github.com/oconnor663/bao_experiments repo.
+#[inline(always)]
+unsafe fn transpose_vecs(
+ vec_a: __m256i,
+ vec_b: __m256i,
+ vec_c: __m256i,
+ vec_d: __m256i,
+) -> [__m256i; DEGREE] {
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is 11/33.
+ let ab_02 = _mm256_unpacklo_epi64(vec_a, vec_b);
+ let ab_13 = _mm256_unpackhi_epi64(vec_a, vec_b);
+ let cd_02 = _mm256_unpacklo_epi64(vec_c, vec_d);
+ let cd_13 = _mm256_unpackhi_epi64(vec_c, vec_d);
+
+ // Interleave 128-bit lanes.
+ let (abcd_0, abcd_2) = interleave128(ab_02, cd_02);
+ let (abcd_1, abcd_3) = interleave128(ab_13, cd_13);
+
+ [abcd_0, abcd_1, abcd_2, abcd_3]
+}
+
+#[inline(always)]
+unsafe fn transpose_state_vecs(jobs: &[Job; DEGREE]) -> [__m256i; 8] {
+ // Load all the state words into transposed vectors, where the first vector
+ // has the first word of each state, etc. Transposing once at the beginning
+ // and once at the end is more efficient than repeating it for each block.
+ let words0 = array_refs!(&jobs[0].words, DEGREE, DEGREE);
+ let words1 = array_refs!(&jobs[1].words, DEGREE, DEGREE);
+ let words2 = array_refs!(&jobs[2].words, DEGREE, DEGREE);
+ let words3 = array_refs!(&jobs[3].words, DEGREE, DEGREE);
+ let [h0, h1, h2, h3] = transpose_vecs(
+ loadu(words0.0),
+ loadu(words1.0),
+ loadu(words2.0),
+ loadu(words3.0),
+ );
+ let [h4, h5, h6, h7] = transpose_vecs(
+ loadu(words0.1),
+ loadu(words1.1),
+ loadu(words2.1),
+ loadu(words3.1),
+ );
+ [h0, h1, h2, h3, h4, h5, h6, h7]
+}
+
+#[inline(always)]
+unsafe fn untranspose_state_vecs(h_vecs: &[__m256i; 8], jobs: &mut [Job; DEGREE]) {
+ // Un-transpose the updated state vectors back into the caller's arrays.
+ let [job0, job1, job2, job3] = jobs;
+ let words0 = mut_array_refs!(&mut job0.words, DEGREE, DEGREE);
+ let words1 = mut_array_refs!(&mut job1.words, DEGREE, DEGREE);
+ let words2 = mut_array_refs!(&mut job2.words, DEGREE, DEGREE);
+ let words3 = mut_array_refs!(&mut job3.words, DEGREE, DEGREE);
+ let out = transpose_vecs(h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3]);
+ storeu(out[0], words0.0);
+ storeu(out[1], words1.0);
+ storeu(out[2], words2.0);
+ storeu(out[3], words3.0);
+ let out = transpose_vecs(h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7]);
+ storeu(out[0], words0.1);
+ storeu(out[1], words1.1);
+ storeu(out[2], words2.1);
+ storeu(out[3], words3.1);
+}
+
+#[inline(always)]
+unsafe fn transpose_msg_vecs(blocks: [*const [u8; BLOCKBYTES]; DEGREE]) -> [__m256i; 16] {
+ // These input arrays have no particular alignment, so we use unaligned
+ // loads to read from them.
+ let block0 = blocks[0] as *const [Word; DEGREE];
+ let block1 = blocks[1] as *const [Word; DEGREE];
+ let block2 = blocks[2] as *const [Word; DEGREE];
+ let block3 = blocks[3] as *const [Word; DEGREE];
+ let [m0, m1, m2, m3] = transpose_vecs(
+ loadu(block0.add(0)),
+ loadu(block1.add(0)),
+ loadu(block2.add(0)),
+ loadu(block3.add(0)),
+ );
+ let [m4, m5, m6, m7] = transpose_vecs(
+ loadu(block0.add(1)),
+ loadu(block1.add(1)),
+ loadu(block2.add(1)),
+ loadu(block3.add(1)),
+ );
+ let [m8, m9, m10, m11] = transpose_vecs(
+ loadu(block0.add(2)),
+ loadu(block1.add(2)),
+ loadu(block2.add(2)),
+ loadu(block3.add(2)),
+ );
+ let [m12, m13, m14, m15] = transpose_vecs(
+ loadu(block0.add(3)),
+ loadu(block1.add(3)),
+ loadu(block2.add(3)),
+ loadu(block3.add(3)),
+ );
+ [
+ m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15,
+ ]
+}
+
+#[inline(always)]
+unsafe fn load_counts(jobs: &[Job; DEGREE]) -> (__m256i, __m256i) {
+ (
+ set4(
+ count_low(jobs[0].count),
+ count_low(jobs[1].count),
+ count_low(jobs[2].count),
+ count_low(jobs[3].count),
+ ),
+ set4(
+ count_high(jobs[0].count),
+ count_high(jobs[1].count),
+ count_high(jobs[2].count),
+ count_high(jobs[3].count),
+ ),
+ )
+}
+
+#[inline(always)]
+unsafe fn store_counts(jobs: &mut [Job; DEGREE], low: __m256i, high: __m256i) {
+ let low_ints: [Word; DEGREE] = mem::transmute(low);
+ let high_ints: [Word; DEGREE] = mem::transmute(high);
+ for i in 0..DEGREE {
+ jobs[i].count = assemble_count(low_ints[i], high_ints[i]);
+ }
+}
+
+#[inline(always)]
+unsafe fn add_to_counts(lo: &mut __m256i, hi: &mut __m256i, delta: __m256i) {
+ // If the low counts reach zero, that means they wrapped, unless the delta
+ // was also zero.
+ *lo = add(*lo, delta);
+ let lo_reached_zero = eq(*lo, set1(0));
+ let delta_was_zero = eq(delta, set1(0));
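+ // Carry into the high word only where the low count wrapped and the delta was nonzero.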
+ let hi_inc = and(set1(1), negate_and(delta_was_zero, lo_reached_zero));
+ *hi = add(*hi, hi_inc);
+}
+
+#[inline(always)]
+unsafe fn flags_vec(flags: [bool; DEGREE]) -> __m256i {
+ set4(
+ flag_word(flags[0]),
+ flag_word(flags[1]),
+ flag_word(flags[2]),
+ flag_word(flags[3]),
+ )
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn compress4_loop(jobs: &mut [Job; DEGREE], finalize: Finalize, stride: Stride) {
+ // If we're not finalizing, there can't be a partial block at the end.
+ for job in jobs.iter() {
+ input_debug_asserts(job.input, finalize);
+ }
+
+ let msg_ptrs = [
+ jobs[0].input.as_ptr(),
+ jobs[1].input.as_ptr(),
+ jobs[2].input.as_ptr(),
+ jobs[3].input.as_ptr(),
+ ];
+ let mut h_vecs = transpose_state_vecs(&jobs);
+ let (mut counts_lo, mut counts_hi) = load_counts(&jobs);
+
+ // Prepare the final blocks (note that they could be empty if the input is
+ // empty). Do all this before entering the main loop.
+ let min_len = jobs.iter().map(|job| job.input.len()).min().unwrap();
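+ // All four jobs advance in lockstep, so the shortest input determines where the
+ // final (possibly partial) blocks begin; leftover input stays in each job for the caller.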
+ let mut fin_offset = min_len.saturating_sub(1);
+ fin_offset -= fin_offset % stride.padded_blockbytes();
+ // Performance note: making these buffers mem::uninitialized() seems to
+ // cause problems in the optimizer.
+ let mut buf0: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let mut buf1: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let mut buf2: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let mut buf3: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let (block0, len0, finalize0) = final_block(jobs[0].input, fin_offset, &mut buf0, stride);
+ let (block1, len1, finalize1) = final_block(jobs[1].input, fin_offset, &mut buf1, stride);
+ let (block2, len2, finalize2) = final_block(jobs[2].input, fin_offset, &mut buf2, stride);
+ let (block3, len3, finalize3) = final_block(jobs[3].input, fin_offset, &mut buf3, stride);
+ let fin_blocks: [*const [u8; BLOCKBYTES]; DEGREE] = [block0, block1, block2, block3];
+ let fin_counts_delta = set4(len0 as Word, len1 as Word, len2 as Word, len3 as Word);
+ let fin_last_block;
+ let fin_last_node;
+ if finalize.yes() {
+ fin_last_block = flags_vec([finalize0, finalize1, finalize2, finalize3]);
+ fin_last_node = flags_vec([
+ finalize0 && jobs[0].last_node.yes(),
+ finalize1 && jobs[1].last_node.yes(),
+ finalize2 && jobs[2].last_node.yes(),
+ finalize3 && jobs[3].last_node.yes(),
+ ]);
+ } else {
+ fin_last_block = set1(0);
+ fin_last_node = set1(0);
+ }
+
+ // The main loop.
+ let mut offset = 0;
+ loop {
+ let blocks;
+ let counts_delta;
+ let last_block;
+ let last_node;
+ if offset == fin_offset {
+ blocks = fin_blocks;
+ counts_delta = fin_counts_delta;
+ last_block = fin_last_block;
+ last_node = fin_last_node;
+ } else {
+ blocks = [
+ msg_ptrs[0].add(offset) as *const [u8; BLOCKBYTES],
+ msg_ptrs[1].add(offset) as *const [u8; BLOCKBYTES],
+ msg_ptrs[2].add(offset) as *const [u8; BLOCKBYTES],
+ msg_ptrs[3].add(offset) as *const [u8; BLOCKBYTES],
+ ];
+ counts_delta = set1(BLOCKBYTES as Word);
+ last_block = set1(0);
+ last_node = set1(0);
+ };
+
+ let m_vecs = transpose_msg_vecs(blocks);
+ add_to_counts(&mut counts_lo, &mut counts_hi, counts_delta);
+ compress4_transposed!(
+ &mut h_vecs,
+ &m_vecs,
+ counts_lo,
+ counts_hi,
+ last_block,
+ last_node,
+ );
+
+ // Check for termination before bumping the offset, to avoid overflow.
+ if offset == fin_offset {
+ break;
+ }
+
+ offset += stride.padded_blockbytes();
+ }
+
+ // Write out the results.
+ untranspose_state_vecs(&h_vecs, &mut *jobs);
+ store_counts(&mut *jobs, counts_lo, counts_hi);
+ let max_consumed = offset.saturating_add(stride.padded_blockbytes());
+ for job in jobs.iter_mut() {
+ let consumed = cmp::min(max_consumed, job.input.len());
+ job.input = &job.input[consumed..];
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/blake2bp.rs b/third_party/rust/blake2b_simd/src/blake2bp.rs
new file mode 100644
index 0000000000..7bdfad64f6
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/blake2bp.rs
@@ -0,0 +1,570 @@
+//! BLAKE2bp, a variant of BLAKE2b that uses SIMD more efficiently.
+//!
+//! The AVX2 implementation of BLAKE2bp is about twice as fast as that of BLAKE2b.
+//! However, note that it's a different hash function, and it gives a different
+//! hash from BLAKE2b for the same input.
+//!
+//! # Example
+//!
+//! ```
+//! use blake2b_simd::blake2bp;
+//!
+//! let hash = blake2bp::Params::new()
+//! .hash_length(16)
+//! .key(b"The Magic Words are Squeamish Ossifrage")
+//! .to_state()
+//! .update(b"foo")
+//! .update(b"bar")
+//! .update(b"baz")
+//! .finalize();
+//! assert_eq!("e69c7d2c42a5ac14948772231c68c552", &hash.to_hex());
+//! ```
+
+use crate::guts::{Finalize, Implementation, Job, LastNode, Stride};
+use crate::many;
+use crate::Count;
+use crate::Hash;
+use crate::Word;
+use crate::BLOCKBYTES;
+use crate::KEYBYTES;
+use crate::OUTBYTES;
+use core::cmp;
+use core::fmt;
+use core::mem::size_of;
+
+#[cfg(feature = "std")]
+use std;
+
+pub(crate) const DEGREE: usize = 4;
+
+/// Compute the BLAKE2bp hash of a slice of bytes all at once, using default
+/// parameters.
+///
+/// # Example
+///
+/// ```
+/// # use blake2b_simd::blake2bp::blake2bp;
+/// let expected = "8ca9ccee7946afcb686fe7556628b5ba1bf9a691da37ca58cd049354d99f3704\
+/// 2c007427e5f219b9ab5063707ec6823872dee413ee014b4d02f2ebb6abb5f643";
+/// let hash = blake2bp(b"foo");
+/// assert_eq!(expected, &hash.to_hex());
+/// ```
+pub fn blake2bp(input: &[u8]) -> Hash {
+ Params::new().hash(input)
+}
+
+/// A parameter builder for BLAKE2bp, just like the [`Params`](../struct.Params.html) type for
+/// BLAKE2b.
+///
+/// This builder only supports configuring the hash length and a secret key. This matches the
+/// options provided by the [reference
+/// implementation](https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2.h#L162-L165).
+///
+/// # Example
+///
+/// ```
+/// use blake2b_simd::blake2bp;
+/// let mut state = blake2bp::Params::new().hash_length(32).to_state();
+/// ```
+#[derive(Clone)]
+pub struct Params {
+ hash_length: u8,
+ key_length: u8,
+ key: [u8; KEYBYTES],
+ implementation: Implementation,
+}
+
+impl Params {
+ /// Equivalent to `Params::default()`.
+ pub fn new() -> Self {
+ Self {
+ hash_length: OUTBYTES as u8,
+ key_length: 0,
+ key: [0; KEYBYTES],
+ implementation: Implementation::detect(),
+ }
+ }
+
+ fn to_words(&self) -> ([[Word; 8]; DEGREE], [Word; 8]) {
+ let mut base_params = crate::Params::new();
+ base_params
+ .hash_length(self.hash_length as usize)
+ .key(&self.key[..self.key_length as usize])
+ .fanout(DEGREE as u8)
+ .max_depth(2)
+ .max_leaf_length(0)
+ // Note that inner_hash_length is always OUTBYTES, regardless of the hash_length
+ // parameter. This isn't documented in the spec, but it matches the behavior of the
+ // reference implementation: https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2bp-ref.c#L55
+ .inner_hash_length(OUTBYTES);
+ let leaf_words = |worker_index| {
+ base_params
+ .clone()
+ .node_offset(worker_index)
+ .node_depth(0)
+ // Note that setting the last_node flag here has no effect,
+ // because it isn't included in the state words.
+ .to_words()
+ };
+ let leaf_words = [leaf_words(0), leaf_words(1), leaf_words(2), leaf_words(3)];
+ let root_words = base_params
+ .clone()
+ .node_offset(0)
+ .node_depth(1)
+ // Note that setting the last_node flag here has no effect, because
+ // it isn't included in the state words. Also note that because
+ // we're only preserving its state words, the root node won't hash
+ // any key bytes.
+ .to_words();
+ (leaf_words, root_words)
+ }
+
+ /// Hash an input all at once with these parameters.
+ pub fn hash(&self, input: &[u8]) -> Hash {
+ // If there's a key, just fall back to using the State.
+ if self.key_length > 0 {
+ return self.to_state().update(input).finalize();
+ }
+ let (mut leaf_words, mut root_words) = self.to_words();
+ // Hash each leaf in parallel.
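+ // Blocks are distributed round-robin: leaf i starts at byte i * BLOCKBYTES and,
+ // with Stride::Parallel, consumes every DEGREE-th block of the input.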
+ let jobs = leaf_words.iter_mut().enumerate().map(|(i, words)| {
+ let input_start = cmp::min(input.len(), i * BLOCKBYTES);
+ Job {
+ input: &input[input_start..],
+ words,
+ count: 0,
+ last_node: if i == DEGREE - 1 {
+ LastNode::Yes
+ } else {
+ LastNode::No
+ },
+ }
+ });
+ many::compress_many(jobs, self.implementation, Finalize::Yes, Stride::Parallel);
+ // Hash each leaf into the root.
+ finalize_root_words(
+ &leaf_words,
+ &mut root_words,
+ self.hash_length,
+ self.implementation,
+ )
+ }
+
+ /// Construct a BLAKE2bp `State` object based on these parameters.
+ pub fn to_state(&self) -> State {
+ State::with_params(self)
+ }
+
+ /// Set the length of the final hash, from 1 to `OUTBYTES` (64). Apart from controlling the
+ /// length of the final `Hash`, this is also associated data, and changing it will result in a
+ /// totally different hash.
+ pub fn hash_length(&mut self, length: usize) -> &mut Self {
+ assert!(
+ 1 <= length && length <= OUTBYTES,
+ "Bad hash length: {}",
+ length
+ );
+ self.hash_length = length as u8;
+ self
+ }
+
+ /// Use a secret key, so that BLAKE2bp acts as a MAC. The maximum key length is `KEYBYTES`
+ /// (64). An empty key is equivalent to having no key at all.
+ pub fn key(&mut self, key: &[u8]) -> &mut Self {
+ assert!(key.len() <= KEYBYTES, "Bad key length: {}", key.len());
+ self.key_length = key.len() as u8;
+ self.key = [0; KEYBYTES];
+ self.key[..key.len()].copy_from_slice(key);
+ self
+ }
+}
+
+impl Default for Params {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl fmt::Debug for Params {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(
+ f,
+ "Params {{ hash_length: {}, key_length: {} }}",
+ self.hash_length,
+ // NB: Don't print the key itself. Debug shouldn't leak secrets.
+ self.key_length,
+ )
+ }
+}
+
+/// An incremental hasher for BLAKE2bp, just like the [`State`](../struct.State.html) type for
+/// BLAKE2b.
+///
+/// # Example
+///
+/// ```
+/// use blake2b_simd::blake2bp;
+///
+/// let mut state = blake2bp::State::new();
+/// state.update(b"foo");
+/// state.update(b"bar");
+/// let hash = state.finalize();
+///
+/// let expected = "e654427b6ef02949471712263e59071abbb6aa94855674c1daeed6cfaf127c33\
+/// dfa3205f7f7f71e4f0673d25fa82a368488911f446bccd323af3ab03f53e56e5";
+/// assert_eq!(expected, &hash.to_hex());
+/// ```
+#[derive(Clone)]
+pub struct State {
+ leaf_words: [[Word; 8]; DEGREE],
+ root_words: [Word; 8],
+ // Note that this buffer is twice as large as what compress4 needs. That guarantees that we
+ // have enough input when we compress to know we don't need to finalize any of the leaves.
+ buf: [u8; 2 * DEGREE * BLOCKBYTES],
+ buf_len: u16,
+ // Note that this is the *per-leaf* count.
+ count: Count,
+ hash_length: u8,
+ implementation: Implementation,
+ is_keyed: bool,
+}
+
+impl State {
+ /// Equivalent to `State::default()` or `Params::default().to_state()`.
+ pub fn new() -> Self {
+ Self::with_params(&Params::default())
+ }
+
+ fn with_params(params: &Params) -> Self {
+ let (leaf_words, root_words) = params.to_words();
+
+ // If a key is set, initialize the buffer to contain the key bytes. Note
+ // that only the leaves hash key bytes. The root doesn't, even though
+ // the key length is still set in its parameters. Again, this isn't
+ // documented in the spec, but it matches the behavior of the reference
+ // implementation:
+ // https://github.com/BLAKE2/BLAKE2/blob/320c325437539ae91091ce62efec1913cd8093c2/ref/blake2bp-ref.c#L128
+ // This particular behavior (though not the inner hash length behavior
+ // above) is also corroborated by the official test vectors; see
+ // tests/vector_tests.rs.
+ let mut buf = [0; 2 * DEGREE * BLOCKBYTES];
+ let mut buf_len = 0;
+ if params.key_length > 0 {
+ for i in 0..DEGREE {
+ let keybytes = &params.key[..params.key_length as usize];
+ buf[i * BLOCKBYTES..][..keybytes.len()].copy_from_slice(keybytes);
+ buf_len = BLOCKBYTES * DEGREE;
+ }
+ }
+
+ Self {
+ leaf_words,
+ root_words,
+ buf,
+ buf_len: buf_len as u16,
+ count: 0, // count gets updated in self.compress()
+ hash_length: params.hash_length,
+ implementation: params.implementation,
+ is_keyed: params.key_length > 0,
+ }
+ }
+
+ fn fill_buf(&mut self, input: &mut &[u8]) {
+ let take = cmp::min(self.buf.len() - self.buf_len as usize, input.len());
+ self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]);
+ self.buf_len += take as u16;
+ *input = &input[take..];
+ }
+
+ fn compress_to_leaves(
+ leaves: &mut [[Word; 8]; DEGREE],
+ input: &[u8],
+ count: &mut Count,
+ implementation: Implementation,
+ ) {
+ // Input is assumed to be an even number of blocks for each leaf. Since
+ // we're not finalizing, debug asserts will fire otherwise.
+ let jobs = leaves.iter_mut().enumerate().map(|(i, words)| {
+ Job {
+ input: &input[i * BLOCKBYTES..],
+ words,
+ count: *count,
+ last_node: LastNode::No, // irrelevant when not finalizing
+ }
+ });
+ many::compress_many(jobs, implementation, Finalize::No, Stride::Parallel);
+ // Note that count is the bytes input *per-leaf*.
+ *count = count.wrapping_add((input.len() / DEGREE) as Count);
+ }
+
+ /// Add input to the hash. You can call `update` any number of times.
+ pub fn update(&mut self, mut input: &[u8]) -> &mut Self {
+ // If we have a partial buffer, try to complete it. If we complete it and there's more
+ // input waiting, we need to compress to make more room. However, because we need to be
+ // sure that *none* of the leaves would need to be finalized as part of this round of
+ // compression, we need to buffer more than we would for BLAKE2b.
+ if self.buf_len > 0 {
+ self.fill_buf(&mut input);
+ // The buffer is large enough for two compressions. If we've filled
+ // the buffer and there's still more input coming, then we have to
+ // do at least one compression. If there's enough input still
+ // coming that all the leaves are guaranteed to get more, do both
+ // compressions in the buffer. Otherwise, do just one and shift the
+ // back half of the buffer to the front.
+ if !input.is_empty() {
+ if input.len() > (DEGREE - 1) * BLOCKBYTES {
+ // Enough input coming to do both compressions.
+ Self::compress_to_leaves(
+ &mut self.leaf_words,
+ &self.buf,
+ &mut self.count,
+ self.implementation,
+ );
+ self.buf_len = 0;
+ } else {
+ // Only enough input coming for one compression.
+ Self::compress_to_leaves(
+ &mut self.leaf_words,
+ &self.buf[..DEGREE * BLOCKBYTES],
+ &mut self.count,
+ self.implementation,
+ );
+ self.buf_len = (DEGREE * BLOCKBYTES) as u16;
+ let (buf_front, buf_back) = self.buf.split_at_mut(DEGREE * BLOCKBYTES);
+ buf_front.copy_from_slice(buf_back);
+ }
+ }
+ }
+
+ // Now we directly compress as much input as possible, without copying
+ // it into the buffer. We need to make sure we buffer at least one byte
+ // for each of the leaves, so that we know we don't need to finalize
+ // them.
+ let needed_tail = (DEGREE - 1) * BLOCKBYTES + 1;
+ let mut bulk_bytes = input.len().saturating_sub(needed_tail);
+ bulk_bytes -= bulk_bytes % (DEGREE * BLOCKBYTES);
+ if bulk_bytes > 0 {
+ Self::compress_to_leaves(
+ &mut self.leaf_words,
+ &input[..bulk_bytes],
+ &mut self.count,
+ self.implementation,
+ );
+ input = &input[bulk_bytes..];
+ }
+
+ // Buffer any remaining input, to be either compressed or finalized in
+ // a subsequent call.
+ self.fill_buf(&mut input);
+ debug_assert_eq!(0, input.len());
+ self
+ }
+
+ /// Finalize the state and return a `Hash`. This method is idempotent, and calling it multiple
+ /// times will give the same result. It's also possible to `update` with more input in between.
+ pub fn finalize(&self) -> Hash {
+ // Hash whatever's remaining in the buffer and finalize the leaves.
+ let buf_len = self.buf_len as usize;
+ let mut leaves_copy = self.leaf_words;
+ let jobs = leaves_copy
+ .iter_mut()
+ .enumerate()
+ .map(|(leaf_index, leaf_words)| {
+ let input = &self.buf[cmp::min(leaf_index * BLOCKBYTES, buf_len)..buf_len];
+ Job {
+ input,
+ words: leaf_words,
+ count: self.count,
+ last_node: if leaf_index == DEGREE - 1 {
+ LastNode::Yes
+ } else {
+ LastNode::No
+ },
+ }
+ });
+ many::compress_many(jobs, self.implementation, Finalize::Yes, Stride::Parallel);
+
+ // Concatenate each leaf into the root and hash that.
+ let mut root_words_copy = self.root_words;
+ finalize_root_words(
+ &leaves_copy,
+ &mut root_words_copy,
+ self.hash_length,
+ self.implementation,
+ )
+ }
+
+ /// Return the total number of bytes input so far.
+ ///
+ /// Note that `count` doesn't include the bytes of the key block, if any.
+ /// It's exactly the total number of input bytes fed to `update`.
+ pub fn count(&self) -> Count {
+ // Remember that self.count is *per-leaf*.
+ let mut ret = self
+ .count
+ .wrapping_mul(DEGREE as Count)
+ .wrapping_add(self.buf_len as Count);
+ if self.is_keyed {
+ ret -= (DEGREE * BLOCKBYTES) as Count;
+ }
+ ret
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::io::Write for State {
+ fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+ self.update(buf);
+ Ok(buf.len())
+ }
+
+ fn flush(&mut self) -> std::io::Result<()> {
+ Ok(())
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(
+ f,
+ "State {{ count: {}, hash_length: {} }}",
+ self.count(),
+ self.hash_length,
+ )
+ }
+}
+
+impl Default for State {
+ fn default() -> Self {
+ Self::with_params(&Params::default())
+ }
+}
+
+// Compress the four finalized leaf hashes into the root words as input, using
+// two compressions (their concatenated outputs fill exactly two blocks). Note
+// that even if a future version of this
+// implementation supports the hash_length parameter and sets it as associated
+// data for all nodes, this step must still use the untruncated output of each
+// leaf. Note also that, as mentioned above, the root node doesn't hash any key
+// bytes.
+fn finalize_root_words(
+ leaf_words: &[[Word; 8]; DEGREE],
+ root_words: &mut [Word; 8],
+ hash_length: u8,
+ imp: Implementation,
+) -> Hash {
+ debug_assert_eq!(OUTBYTES, 8 * size_of::<Word>());
+ let mut block = [0; DEGREE * OUTBYTES];
+ for (word, chunk) in leaf_words
+ .iter()
+ .flat_map(|words| words.iter())
+ .zip(block.chunks_exact_mut(size_of::<Word>()))
+ {
+ chunk.copy_from_slice(&word.to_le_bytes());
+ }
+ imp.compress1_loop(
+ &block,
+ root_words,
+ 0,
+ LastNode::Yes,
+ Finalize::Yes,
+ Stride::Serial,
+ );
+ Hash {
+ bytes: crate::state_words_to_bytes(&root_words),
+ len: hash_length,
+ }
+}
+
+pub(crate) fn force_portable(params: &mut Params) {
+ params.implementation = Implementation::portable();
+}
+
+#[cfg(test)]
+pub(crate) mod test {
+ use super::*;
+ use crate::paint_test_input;
+
+ // This is a simple reference implementation without the complicated buffering or parameter
+ // support of the real implementation. We need this because the official test vectors don't
+ // include any inputs large enough to exercise all the branches in the buffering logic.
+ fn blake2bp_reference(input: &[u8]) -> Hash {
+ let mut leaves = arrayvec::ArrayVec::<[_; DEGREE]>::new();
+ for leaf_index in 0..DEGREE {
+ leaves.push(
+ crate::Params::new()
+ .fanout(DEGREE as u8)
+ .max_depth(2)
+ .node_offset(leaf_index as u64)
+ .inner_hash_length(OUTBYTES)
+ .to_state(),
+ );
+ }
+ leaves[DEGREE - 1].set_last_node(true);
+ for (i, chunk) in input.chunks(BLOCKBYTES).enumerate() {
+ leaves[i % DEGREE].update(chunk);
+ }
+ let mut root = crate::Params::new()
+ .fanout(DEGREE as u8)
+ .max_depth(2)
+ .node_depth(1)
+ .inner_hash_length(OUTBYTES)
+ .last_node(true)
+ .to_state();
+ for leaf in &mut leaves {
+ root.update(leaf.finalize().as_bytes());
+ }
+ root.finalize()
+ }
+
+ #[test]
+ fn test_against_reference() {
+ let mut buf = [0; 21 * BLOCKBYTES];
+ paint_test_input(&mut buf);
+ // - 8 blocks is just enough to fill the double buffer.
+ // - 9 blocks triggers the "perform one compression on the double buffer" case.
+ // - 11 blocks is the largest input where only one compression may be performed, on the
+ // first half of the buffer, because there's not enough input to avoid needing to
+ // finalize the second half.
+ // - 12 blocks triggers the "perform both compressions in the double buffer" case.
+ // - 15 blocks is the largest input where, after compressing 8 blocks from the buffer,
+ // there's not enough input to hash directly from memory.
+ // - 16 blocks triggers "after emptying the buffer, hash directly from memory".
+ for num_blocks in 0..=20 {
+ for &extra in &[0, 1, BLOCKBYTES - 1] {
+ for &portable in &[false, true] {
+ // eprintln!("\ncase -----");
+ // dbg!(num_blocks);
+ // dbg!(extra);
+ // dbg!(portable);
+
+ // First hash the input all at once, as a sanity check.
+ let mut params = Params::new();
+ if portable {
+ force_portable(&mut params);
+ }
+ let input = &buf[..num_blocks * BLOCKBYTES + extra];
+ let expected = blake2bp_reference(&input);
+ let mut state = params.to_state();
+ let found = state.update(input).finalize();
+ assert_eq!(expected, found);
+
+ // Then, do it again, but buffer 1 byte of input first. That causes the buffering
+ // branch to trigger.
+ let mut state = params.to_state();
+ let maybe_one = cmp::min(1, input.len());
+ state.update(&input[..maybe_one]);
+ assert_eq!(maybe_one as Count, state.count());
+ // Do a throwaway finalize here to check for idempotency.
+ state.finalize();
+ state.update(&input[maybe_one..]);
+ assert_eq!(input.len() as Count, state.count());
+ let found = state.finalize();
+ assert_eq!(expected, found);
+
+ // Finally, do it again with the all-at-once interface.
+ assert_eq!(expected, blake2bp(input));
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/guts.rs b/third_party/rust/blake2b_simd/src/guts.rs
new file mode 100644
index 0000000000..9fcacf319c
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/guts.rs
@@ -0,0 +1,565 @@
+use crate::*;
+use arrayref::array_ref;
+use core::cmp;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub const MAX_DEGREE: usize = 4;
+
+#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+pub const MAX_DEGREE: usize = 1;
+
+// Variants other than Portable are unreachable in no_std, unless CPU features
+// are explicitly enabled for the build with e.g. RUSTFLAGS="-C target-feature=avx2".
+// This might change in the future if is_x86_feature_detected moves into libcore.
+#[allow(dead_code)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum Platform {
+ Portable,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ SSE41,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ AVX2,
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct Implementation(Platform);
+
+impl Implementation {
+ pub fn detect() -> Self {
+ // Try the different implementations in order of how fast/modern they
+ // are. Currently on non-x86, everything just uses portable.
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if let Some(avx2_impl) = Self::avx2_if_supported() {
+ return avx2_impl;
+ }
+ }
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if let Some(sse41_impl) = Self::sse41_if_supported() {
+ return sse41_impl;
+ }
+ }
+ Self::portable()
+ }
+
+ pub fn portable() -> Self {
+ Implementation(Platform::Portable)
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ #[allow(unreachable_code)]
+ pub fn sse41_if_supported() -> Option<Self> {
+ // Check whether SSE4.1 support is assumed by the build.
+ #[cfg(target_feature = "sse4.1")]
+ {
+ return Some(Implementation(Platform::SSE41));
+ }
+ // Otherwise dynamically check for support if we can.
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("sse4.1") {
+ return Some(Implementation(Platform::SSE41));
+ }
+ }
+ None
+ }
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ #[allow(unreachable_code)]
+ pub fn avx2_if_supported() -> Option<Self> {
+ // Check whether AVX2 support is assumed by the build.
+ #[cfg(target_feature = "avx2")]
+ {
+ return Some(Implementation(Platform::AVX2));
+ }
+ // Otherwise dynamically check for support if we can.
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("avx2") {
+ return Some(Implementation(Platform::AVX2));
+ }
+ }
+ None
+ }
+
+ pub fn degree(&self) -> usize {
+ match self.0 {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX2 => avx2::DEGREE,
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::SSE41 => sse41::DEGREE,
+ Platform::Portable => 1,
+ }
+ }
+
+ pub fn compress1_loop(
+ &self,
+ input: &[u8],
+ words: &mut [Word; 8],
+ count: Count,
+ last_node: LastNode,
+ finalize: Finalize,
+ stride: Stride,
+ ) {
+ match self.0 {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX2 => unsafe {
+ avx2::compress1_loop(input, words, count, last_node, finalize, stride);
+ },
+ // Note that there's an SSE version of compress1 in the official C
+ // implementation, but I haven't ported it yet.
+ _ => {
+ portable::compress1_loop(input, words, count, last_node, finalize, stride);
+ }
+ }
+ }
+
+ pub fn compress2_loop(&self, jobs: &mut [Job; 2], finalize: Finalize, stride: Stride) {
+ match self.0 {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX2 | Platform::SSE41 => unsafe {
+ sse41::compress2_loop(jobs, finalize, stride)
+ },
+ _ => panic!("unsupported"),
+ }
+ }
+
+ pub fn compress4_loop(&self, jobs: &mut [Job; 4], finalize: Finalize, stride: Stride) {
+ match self.0 {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::AVX2 => unsafe { avx2::compress4_loop(jobs, finalize, stride) },
+ _ => panic!("unsupported"),
+ }
+ }
+}
+
+pub struct Job<'a, 'b> {
+ pub input: &'a [u8],
+ pub words: &'b mut [Word; 8],
+ pub count: Count,
+ pub last_node: LastNode,
+}
+
+impl<'a, 'b> core::fmt::Debug for Job<'a, 'b> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ // NB: Don't print the words. Leaking them would allow length extension.
+ write!(
+ f,
+ "Job {{ input_len: {}, count: {}, last_node: {} }}",
+ self.input.len(),
+ self.count,
+ self.last_node.yes(),
+ )
+ }
+}
+
+// Finalize could just be a bool, but this is easier to read at callsites.
+#[derive(Clone, Copy, Debug)]
+pub enum Finalize {
+ Yes,
+ No,
+}
+
+impl Finalize {
+ pub fn yes(&self) -> bool {
+ match self {
+ Finalize::Yes => true,
+ Finalize::No => false,
+ }
+ }
+}
+
+// Like Finalize, this is easier to read at callsites.
+#[derive(Clone, Copy, Debug)]
+pub enum LastNode {
+ Yes,
+ No,
+}
+
+impl LastNode {
+ pub fn yes(&self) -> bool {
+ match self {
+ LastNode::Yes => true,
+ LastNode::No => false,
+ }
+ }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum Stride {
+ Serial, // BLAKE2b/BLAKE2s
+ Parallel, // BLAKE2bp/BLAKE2sp
+}
+
+impl Stride {
+ pub fn padded_blockbytes(&self) -> usize {
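+ // In Parallel (BLAKE2bp) mode each leaf hashes every DEGREE-th block of the shared
+ // input, so stepping one block within a single job spans DEGREE blocks.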
+ match self {
+ Stride::Serial => BLOCKBYTES,
+ Stride::Parallel => blake2bp::DEGREE * BLOCKBYTES,
+ }
+ }
+}
+
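+// The BLAKE2b counter is twice the word size; these helpers split it into (low, high)
+// word pairs for the compression function and reassemble it afterwards.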
+pub(crate) fn count_low(count: Count) -> Word {
+ count as Word
+}
+
+pub(crate) fn count_high(count: Count) -> Word {
+ (count >> 8 * size_of::<Word>()) as Word
+}
+
+pub(crate) fn assemble_count(low: Word, high: Word) -> Count {
+ low as Count + ((high as Count) << 8 * size_of::<Word>())
+}
+
+pub(crate) fn flag_word(flag: bool) -> Word {
+ if flag {
+ !0
+ } else {
+ 0
+ }
+}
+
+// Pull an array reference at the given offset straight from the input, if
+// there's a full block of input available. If there's only a partial block,
+// copy it into the provided buffer, and return an array reference to that. Along
+// with the array, return the number of bytes of real input, and whether the
+// input can be finalized (i.e. whether there aren't any more bytes after this
+// block). Note that this is written so that the optimizer can elide bounds
+// checks, see: https://godbolt.org/z/0hH2bC
+pub fn final_block<'a>(
+ input: &'a [u8],
+ offset: usize,
+ buffer: &'a mut [u8; BLOCKBYTES],
+ stride: Stride,
+) -> (&'a [u8; BLOCKBYTES], usize, bool) {
+ let capped_offset = cmp::min(offset, input.len());
+ let offset_slice = &input[capped_offset..];
+ if offset_slice.len() >= BLOCKBYTES {
+ let block = array_ref!(offset_slice, 0, BLOCKBYTES);
+ let should_finalize = offset_slice.len() <= stride.padded_blockbytes();
+ (block, BLOCKBYTES, should_finalize)
+ } else {
+ // Copy the final block to the front of the block buffer. The rest of
+ // the buffer is assumed to be initialized to zero.
+ buffer[..offset_slice.len()].copy_from_slice(offset_slice);
+ (buffer, offset_slice.len(), true)
+ }
+}
+
+pub fn input_debug_asserts(input: &[u8], finalize: Finalize) {
+ // If we're not finalizing, the input must not be empty, and it must be an
+ // even multiple of the block size.
+ if !finalize.yes() {
+ debug_assert!(!input.is_empty());
+ debug_assert_eq!(0, input.len() % BLOCKBYTES);
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+ use arrayvec::ArrayVec;
+ use core::mem::size_of;
+
+ #[test]
+ fn test_detection() {
+ assert_eq!(Platform::Portable, Implementation::portable().0);
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("avx2") {
+ assert_eq!(Platform::AVX2, Implementation::detect().0);
+ assert_eq!(
+ Platform::AVX2,
+ Implementation::avx2_if_supported().unwrap().0
+ );
+ assert_eq!(
+ Platform::SSE41,
+ Implementation::sse41_if_supported().unwrap().0
+ );
+ } else if is_x86_feature_detected!("sse4.1") {
+ assert_eq!(Platform::SSE41, Implementation::detect().0);
+ assert!(Implementation::avx2_if_supported().is_none());
+ assert_eq!(
+ Platform::SSE41,
+ Implementation::sse41_if_supported().unwrap().0
+ );
+ } else {
+ assert_eq!(Platform::Portable, Implementation::detect().0);
+ assert!(Implementation::avx2_if_supported().is_none());
+ assert!(Implementation::sse41_if_supported().is_none());
+ }
+ }
+ }
+
+ // TODO: Move all of these case tests into the implementation files.
+ fn exercise_cases<F>(mut f: F)
+ where
+ F: FnMut(Stride, usize, LastNode, Finalize, Count),
+ {
+ // Choose counts to hit the relevant overflow cases.
+ let counts = &[
+ (0 as Count),
+ ((1 as Count) << (8 * size_of::<Word>())) - BLOCKBYTES as Count,
+ (0 as Count).wrapping_sub(BLOCKBYTES as Count),
+ ];
+ for &stride in &[Stride::Serial, Stride::Parallel] {
+ let lengths = [
+ 0,
+ 1,
+ BLOCKBYTES - 1,
+ BLOCKBYTES,
+ BLOCKBYTES + 1,
+ 2 * BLOCKBYTES - 1,
+ 2 * BLOCKBYTES,
+ 2 * BLOCKBYTES + 1,
+ stride.padded_blockbytes() - 1,
+ stride.padded_blockbytes(),
+ stride.padded_blockbytes() + 1,
+ 2 * stride.padded_blockbytes() - 1,
+ 2 * stride.padded_blockbytes(),
+ 2 * stride.padded_blockbytes() + 1,
+ ];
+ for &length in &lengths {
+ for &last_node in &[LastNode::No, LastNode::Yes] {
+ for &finalize in &[Finalize::No, Finalize::Yes] {
+ if !finalize.yes() && (length == 0 || length % BLOCKBYTES != 0) {
+ // Skip these cases, they're invalid.
+ continue;
+ }
+ for &count in counts {
+ // eprintln!("\ncase -----");
+ // dbg!(stride);
+ // dbg!(length);
+ // dbg!(last_node);
+ // dbg!(finalize);
+ // dbg!(count);
+
+ f(stride, length, last_node, finalize, count);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ fn initial_test_words(input_index: usize) -> [Word; 8] {
+ crate::Params::new()
+ .node_offset(input_index as u64)
+ .to_words()
+ }
+
+ // Use the portable implementation, one block at a time, to compute the
+ // final state words expected for a given test case.
+ fn reference_compression(
+ input: &[u8],
+ stride: Stride,
+ last_node: LastNode,
+ finalize: Finalize,
+ mut count: Count,
+ input_index: usize,
+ ) -> [Word; 8] {
+ let mut words = initial_test_words(input_index);
+ let mut offset = 0;
+ while offset == 0 || offset < input.len() {
+ let block_size = cmp::min(BLOCKBYTES, input.len() - offset);
+ let maybe_finalize = if offset + stride.padded_blockbytes() < input.len() {
+ Finalize::No
+ } else {
+ finalize
+ };
+ portable::compress1_loop(
+ &input[offset..][..block_size],
+ &mut words,
+ count,
+ last_node,
+ maybe_finalize,
+ Stride::Serial,
+ );
+ offset += stride.padded_blockbytes();
+ count = count.wrapping_add(BLOCKBYTES as Count);
+ }
+ words
+ }
+
+ // For various loop lengths and finalization parameters, make sure that the
+ // implementation gives the same answer as the portable implementation does
+ // when invoked one block at a time. (So even the portable implementation
+ // itself is being tested here, to make sure its loop is correct.) Note
+ // that this doesn't include any fixed test vectors; those are taken from
+ // the blake2-kat.json file (copied from upstream) and tested elsewhere.
+ fn exercise_compress1_loop(implementation: Implementation) {
+ let mut input = [0; 100 * BLOCKBYTES];
+ paint_test_input(&mut input);
+
+ exercise_cases(|stride, length, last_node, finalize, count| {
+ let reference_words =
+ reference_compression(&input[..length], stride, last_node, finalize, count, 0);
+
+ let mut test_words = initial_test_words(0);
+ implementation.compress1_loop(
+ &input[..length],
+ &mut test_words,
+ count,
+ last_node,
+ finalize,
+ stride,
+ );
+ assert_eq!(reference_words, test_words);
+ });
+ }
+
+ #[test]
+ fn test_compress1_loop_portable() {
+ exercise_compress1_loop(Implementation::portable());
+ }
+
+ #[test]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn test_compress1_loop_sse41() {
+ // Currently this just falls back to portable, but we test it anyway.
+ if let Some(imp) = Implementation::sse41_if_supported() {
+ exercise_compress1_loop(imp);
+ }
+ }
+
+ #[test]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn test_compress1_loop_avx2() {
+ if let Some(imp) = Implementation::avx2_if_supported() {
+ exercise_compress1_loop(imp);
+ }
+ }
+
+ // I use ArrayVec everywhere in here because currently these tests pass
+ // under no_std. I might decide that's not worth maintaining at some point,
+ // since really all we care about with no_std is that the library builds,
+ // but for now it's here. Everything is keyed off of this N constant so
+ // that it's easy to copy the code to exercise_compress4_loop.
+ fn exercise_compress2_loop(implementation: Implementation) {
+ const N: usize = 2;
+
+ let mut input_buffer = [0; 100 * BLOCKBYTES];
+ paint_test_input(&mut input_buffer);
+ let mut inputs = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ inputs.push(&input_buffer[i..]);
+ }
+
+ exercise_cases(|stride, length, last_node, finalize, count| {
+ let mut reference_words = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ let words = reference_compression(
+ &inputs[i][..length],
+ stride,
+ last_node,
+ finalize,
+ count.wrapping_add((i * BLOCKBYTES) as Count),
+ i,
+ );
+ reference_words.push(words);
+ }
+
+ let mut test_words = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ test_words.push(initial_test_words(i));
+ }
+ let mut jobs = ArrayVec::<[_; N]>::new();
+ for (i, words) in test_words.iter_mut().enumerate() {
+ jobs.push(Job {
+ input: &inputs[i][..length],
+ words,
+ count: count.wrapping_add((i * BLOCKBYTES) as Count),
+ last_node,
+ });
+ }
+ let mut jobs = jobs.into_inner().expect("full");
+ implementation.compress2_loop(&mut jobs, finalize, stride);
+
+ for i in 0..N {
+ assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
+ }
+ });
+ }
+
+ #[test]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn test_compress2_loop_sse41() {
+ if let Some(imp) = Implementation::sse41_if_supported() {
+ exercise_compress2_loop(imp);
+ }
+ }
+
+ #[test]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn test_compress2_loop_avx2() {
+ // Currently this just falls back to SSE4.1, but we test it anyway.
+ if let Some(imp) = Implementation::avx2_if_supported() {
+ exercise_compress2_loop(imp);
+ }
+ }
+
+ // Copied from exercise_compress2_loop, with a different value of N and an
+ // interior call to compress4_loop.
+ fn exercise_compress4_loop(implementation: Implementation) {
+ const N: usize = 4;
+
+ let mut input_buffer = [0; 100 * BLOCKBYTES];
+ paint_test_input(&mut input_buffer);
+ let mut inputs = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ inputs.push(&input_buffer[i..]);
+ }
+
+ exercise_cases(|stride, length, last_node, finalize, count| {
+ let mut reference_words = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ let words = reference_compression(
+ &inputs[i][..length],
+ stride,
+ last_node,
+ finalize,
+ count.wrapping_add((i * BLOCKBYTES) as Count),
+ i,
+ );
+ reference_words.push(words);
+ }
+
+ let mut test_words = ArrayVec::<[_; N]>::new();
+ for i in 0..N {
+ test_words.push(initial_test_words(i));
+ }
+ let mut jobs = ArrayVec::<[_; N]>::new();
+ for (i, words) in test_words.iter_mut().enumerate() {
+ jobs.push(Job {
+ input: &inputs[i][..length],
+ words,
+ count: count.wrapping_add((i * BLOCKBYTES) as Count),
+ last_node,
+ });
+ }
+ let mut jobs = jobs.into_inner().expect("full");
+ implementation.compress4_loop(&mut jobs, finalize, stride);
+
+ for i in 0..N {
+ assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
+ }
+ });
+ }
+
+ #[test]
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ fn test_compress4_loop_avx2() {
+ if let Some(imp) = Implementation::avx2_if_supported() {
+ exercise_compress4_loop(imp);
+ }
+ }
+
+ #[test]
+ fn sanity_check_count_size() {
+ assert_eq!(size_of::<Count>(), 2 * size_of::<Word>());
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/lib.rs b/third_party/rust/blake2b_simd/src/lib.rs
new file mode 100644
index 0000000000..cadef7f9a9
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/lib.rs
@@ -0,0 +1,674 @@
+//! [![GitHub](https://img.shields.io/github/tag/oconnor663/blake2_simd.svg?label=GitHub)](https://github.com/oconnor663/blake2_simd) [![crates.io](https://img.shields.io/crates/v/blake2b_simd.svg)](https://crates.io/crates/blake2b_simd) [![Build Status](https://travis-ci.org/oconnor663/blake2_simd.svg?branch=master)](https://travis-ci.org/oconnor663/blake2_simd)
+//!
+//! An implementation of the BLAKE2b and BLAKE2bp hash functions. See also
+//! [`blake2s_simd`](https://docs.rs/blake2s_simd).
+//!
+//! This crate includes:
+//!
+//! - 100% stable Rust.
+//! - SIMD implementations based on Samuel Neves' [`blake2-avx2`](https://github.com/sneves/blake2-avx2).
+//! These are very fast. For benchmarks, see [the Performance section of the
+//! README](https://github.com/oconnor663/blake2_simd#performance).
+//! - Portable, safe implementations for other platforms.
+//! - Dynamic CPU feature detection. Binaries include multiple implementations by default and
+//! choose the fastest one the processor supports at runtime.
+//! - All the features from the [the BLAKE2 spec](https://blake2.net/blake2.pdf), like adjustable
+//! length, keying, and associated data for tree hashing.
+//! - `no_std` support. The `std` Cargo feature is on by default, for CPU feature detection and
+//! for implementing `std::io::Write`.
+//! - Support for computing multiple BLAKE2b hashes in parallel, matching the efficiency of
+//! BLAKE2bp. See the [`many`](many/index.html) module.
+//!
+//! # Example
+//!
+//! ```
+//! use blake2b_simd::{blake2b, Params};
+//!
+//! let expected = "ca002330e69d3e6b84a46a56a6533fd79d51d97a3bb7cad6c2ff43b354185d6d\
+//! c1e723fb3db4ae0737e120378424c714bb982d9dc5bbd7a0ab318240ddd18f8d";
+//! let hash = blake2b(b"foo");
+//! assert_eq!(expected, &hash.to_hex());
+//!
+//! let hash = Params::new()
+//! .hash_length(16)
+//! .key(b"The Magic Words are Squeamish Ossifrage")
+//! .personal(b"L. P. Waterhouse")
+//! .to_state()
+//! .update(b"foo")
+//! .update(b"bar")
+//! .update(b"baz")
+//! .finalize();
+//! assert_eq!("ee8ff4e9be887297cf79348dc35dab56", &hash.to_hex());
+//! ```
+
+#![cfg_attr(not(feature = "std"), no_std)]
+
+use arrayref::{array_refs, mut_array_refs};
+use core::cmp;
+use core::fmt;
+use core::mem::size_of;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod avx2;
+mod portable;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod sse41;
+
+pub mod blake2bp;
+mod guts;
+pub mod many;
+
+#[cfg(test)]
+mod test;
+
+type Word = u64;
+type Count = u128;
+
+/// The max hash length.
+pub const OUTBYTES: usize = 8 * size_of::<Word>();
+/// The max key length.
+pub const KEYBYTES: usize = 8 * size_of::<Word>();
+/// The max salt length.
+pub const SALTBYTES: usize = 2 * size_of::<Word>();
+/// The max personalization length.
+pub const PERSONALBYTES: usize = 2 * size_of::<Word>();
+/// The number of input bytes passed to each call to the compression function. Small benchmarks need
+/// to use an even multiple of `BLOCKBYTES`, or else their apparent throughput will be low.
+pub const BLOCKBYTES: usize = 16 * size_of::<Word>();
+
+const IV: [Word; 8] = [
+ 0x6A09E667F3BCC908,
+ 0xBB67AE8584CAA73B,
+ 0x3C6EF372FE94F82B,
+ 0xA54FF53A5F1D36F1,
+ 0x510E527FADE682D1,
+ 0x9B05688C2B3E6C1F,
+ 0x1F83D9ABFB41BD6B,
+ 0x5BE0CD19137E2179,
+];
+
+const SIGMA: [[u8; 16]; 12] = [
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
+ [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
+ [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
+ [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
+ [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
+ [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
+ [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
+ [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
+ [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
+];
+
+/// Compute the BLAKE2b hash of a slice of bytes all at once, using default
+/// parameters.
+///
+/// # Example
+///
+/// ```
+/// # use blake2b_simd::{blake2b, Params};
+/// let expected = "ca002330e69d3e6b84a46a56a6533fd79d51d97a3bb7cad6c2ff43b354185d6d\
+/// c1e723fb3db4ae0737e120378424c714bb982d9dc5bbd7a0ab318240ddd18f8d";
+/// let hash = blake2b(b"foo");
+/// assert_eq!(expected, &hash.to_hex());
+/// ```
+pub fn blake2b(input: &[u8]) -> Hash {
+ Params::new().hash(input)
+}
+
+/// A parameter builder that exposes all the non-default BLAKE2 features.
+///
+/// Apart from `hash_length`, which controls the length of the final `Hash`,
+/// all of these parameters are just associated data that gets mixed with the
+/// input. For more details, see [the BLAKE2 spec](https://blake2.net/blake2.pdf).
+///
+/// Several of the parameters have a valid range defined in the spec and
+/// documented below. Trying to set an invalid parameter will panic.
+///
+/// # Example
+///
+/// ```
+/// # use blake2b_simd::Params;
+/// // Create a Params object with a secret key and a non-default length.
+/// let mut params = Params::new();
+/// params.key(b"my secret key");
+/// params.hash_length(16);
+///
+/// // Use those params to hash an input all at once.
+/// let hash = params.hash(b"my input");
+///
+/// // Or use those params to build an incremental State.
+/// let mut state = params.to_state();
+/// ```
+#[derive(Clone)]
+pub struct Params {
+ hash_length: u8,
+ key_length: u8,
+ key_block: [u8; BLOCKBYTES],
+ salt: [u8; SALTBYTES],
+ personal: [u8; PERSONALBYTES],
+ fanout: u8,
+ max_depth: u8,
+ max_leaf_length: u32,
+ node_offset: u64,
+ node_depth: u8,
+ inner_hash_length: u8,
+ last_node: guts::LastNode,
+ implementation: guts::Implementation,
+}
+
+impl Params {
+ /// Equivalent to `Params::default()`.
+ #[inline]
+ pub fn new() -> Self {
+ Self {
+ hash_length: OUTBYTES as u8,
+ key_length: 0,
+ key_block: [0; BLOCKBYTES],
+ salt: [0; SALTBYTES],
+ personal: [0; PERSONALBYTES],
+ // NOTE: fanout and max_depth don't default to zero!
+ fanout: 1,
+ max_depth: 1,
+ max_leaf_length: 0,
+ node_offset: 0,
+ node_depth: 0,
+ inner_hash_length: 0,
+ last_node: guts::LastNode::No,
+ implementation: guts::Implementation::detect(),
+ }
+ }
+
+ #[inline(always)]
+ fn to_words(&self) -> [Word; 8] {
+ let (salt_left, salt_right) = array_refs!(&self.salt, SALTBYTES / 2, SALTBYTES / 2);
+ let (personal_left, personal_right) =
+ array_refs!(&self.personal, PERSONALBYTES / 2, PERSONALBYTES / 2);
+ [
+ IV[0]
+ ^ self.hash_length as u64
+ ^ (self.key_length as u64) << 8
+ ^ (self.fanout as u64) << 16
+ ^ (self.max_depth as u64) << 24
+ ^ (self.max_leaf_length as u64) << 32,
+ IV[1] ^ self.node_offset,
+ IV[2] ^ self.node_depth as u64 ^ (self.inner_hash_length as u64) << 8,
+ IV[3],
+ IV[4] ^ Word::from_le_bytes(*salt_left),
+ IV[5] ^ Word::from_le_bytes(*salt_right),
+ IV[6] ^ Word::from_le_bytes(*personal_left),
+ IV[7] ^ Word::from_le_bytes(*personal_right),
+ ]
+ }
+
+ /// Hash an input all at once with these parameters.
+ #[inline]
+ pub fn hash(&self, input: &[u8]) -> Hash {
+ // If there's a key, just fall back to using the State.
+ if self.key_length > 0 {
+ return self.to_state().update(input).finalize();
+ }
+ let mut words = self.to_words();
+ self.implementation.compress1_loop(
+ input,
+ &mut words,
+ 0,
+ self.last_node,
+ guts::Finalize::Yes,
+ guts::Stride::Serial,
+ );
+ Hash {
+ bytes: state_words_to_bytes(&words),
+ len: self.hash_length,
+ }
+ }
+
+ /// Construct a `State` object based on these parameters, for hashing input
+ /// incrementally.
+ pub fn to_state(&self) -> State {
+ State::with_params(self)
+ }
+
+ /// Set the length of the final hash in bytes, from 1 to `OUTBYTES` (64). Apart from
+ /// controlling the length of the final `Hash`, this is also associated data, and changing it
+ /// will result in a totally different hash.
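+ ///
+ /// # Example
+ ///
+ /// A small sketch of the point above (the input is illustrative): a 16-byte
+ /// hash is not a truncation of the default 64-byte hash.
+ ///
+ /// ```
+ /// # use blake2b_simd::Params;
+ /// let short = Params::new().hash_length(16).hash(b"foo");
+ /// let full = Params::new().hash(b"foo");
+ /// assert_ne!(short.as_bytes(), &full.as_bytes()[..16]);
+ /// ```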
+ #[inline]
+ pub fn hash_length(&mut self, length: usize) -> &mut Self {
+ assert!(
+ 1 <= length && length <= OUTBYTES,
+ "Bad hash length: {}",
+ length
+ );
+ self.hash_length = length as u8;
+ self
+ }
+
+ /// Use a secret key, so that BLAKE2 acts as a MAC. The maximum key length is `KEYBYTES` (64).
+ /// An empty key is equivalent to having no key at all.
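+ ///
+ /// # Example
+ ///
+ /// A minimal sketch (the key and message bytes are illustrative only):
+ ///
+ /// ```
+ /// # use blake2b_simd::Params;
+ /// let mac = Params::new().key(b"my secret key").hash(b"my message");
+ /// // The same key and message always produce the same MAC.
+ /// assert_eq!(mac, Params::new().key(b"my secret key").hash(b"my message"));
+ /// // A different key produces an unrelated MAC.
+ /// assert_ne!(mac, Params::new().key(b"other key").hash(b"my message"));
+ /// ```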
+ #[inline]
+ pub fn key(&mut self, key: &[u8]) -> &mut Self {
+ assert!(key.len() <= KEYBYTES, "Bad key length: {}", key.len());
+ self.key_length = key.len() as u8;
+ self.key_block = [0; BLOCKBYTES];
+ self.key_block[..key.len()].copy_from_slice(key);
+ self
+ }
+
+ /// At most `SALTBYTES` (16). Shorter salts are padded with null bytes. An empty salt is
+ /// equivalent to having no salt at all.
+ #[inline]
+ pub fn salt(&mut self, salt: &[u8]) -> &mut Self {
+ assert!(salt.len() <= SALTBYTES, "Bad salt length: {}", salt.len());
+ self.salt = [0; SALTBYTES];
+ self.salt[..salt.len()].copy_from_slice(salt);
+ self
+ }
+
+ /// At most `PERSONALBYTES` (16). Shorter personalizations are padded with null bytes. An empty
+ /// personalization is equivalent to having no personalization at all.
+ #[inline]
+ pub fn personal(&mut self, personalization: &[u8]) -> &mut Self {
+ assert!(
+ personalization.len() <= PERSONALBYTES,
+ "Bad personalization length: {}",
+ personalization.len()
+ );
+ self.personal = [0; PERSONALBYTES];
+ self.personal[..personalization.len()].copy_from_slice(personalization);
+ self
+ }
+
+ /// From 0 (meaning unlimited) to 255. The default is 1 (meaning sequential).
+ #[inline]
+ pub fn fanout(&mut self, fanout: u8) -> &mut Self {
+ self.fanout = fanout;
+ self
+ }
+
+ /// From 0 (meaning BLAKE2X B2 hashes), through 1 (the default, meaning sequential) to 255 (meaning unlimited).
+ #[inline]
+ pub fn max_depth(&mut self, depth: u8) -> &mut Self {
+ self.max_depth = depth;
+ self
+ }
+
+ /// From 0 (the default, meaning unlimited or sequential) to `2^32 - 1`.
+ #[inline]
+ pub fn max_leaf_length(&mut self, length: u32) -> &mut Self {
+ self.max_leaf_length = length;
+ self
+ }
+
+ /// From 0 (the default, meaning first, leftmost, leaf, or sequential) to `2^64 - 1`.
+ #[inline]
+ pub fn node_offset(&mut self, offset: u64) -> &mut Self {
+ self.node_offset = offset;
+ self
+ }
+
+ /// From 0 (the default, meaning leaf or sequential) to 255.
+ #[inline]
+ pub fn node_depth(&mut self, depth: u8) -> &mut Self {
+ self.node_depth = depth;
+ self
+ }
+
+ /// From 0 (the default, meaning sequential) to `OUTBYTES` (64).
+ #[inline]
+ pub fn inner_hash_length(&mut self, length: usize) -> &mut Self {
+ assert!(length <= OUTBYTES, "Bad inner hash length: {}", length);
+ self.inner_hash_length = length as u8;
+ self
+ }
+
+ /// Indicates the rightmost node in a row. This can also be changed on the
+ /// `State` object, potentially after hashing has begun. See
+ /// [`State::set_last_node`].
+ ///
+ /// [`State::set_last_node`]: struct.State.html#method.set_last_node
+ #[inline]
+ pub fn last_node(&mut self, last_node: bool) -> &mut Self {
+ self.last_node = if last_node {
+ guts::LastNode::Yes
+ } else {
+ guts::LastNode::No
+ };
+ self
+ }
+}
+
+impl Default for Params {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl fmt::Debug for Params {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(
+ f,
+ "Params {{ hash_length: {}, key_length: {}, salt: {:?}, personal: {:?}, fanout: {}, \
+ max_depth: {}, max_leaf_length: {}, node_offset: {}, node_depth: {}, \
+ inner_hash_length: {}, last_node: {} }}",
+ self.hash_length,
+ // NB: Don't print the key itself. Debug shouldn't leak secrets.
+ self.key_length,
+ &self.salt,
+ &self.personal,
+ self.fanout,
+ self.max_depth,
+ self.max_leaf_length,
+ self.node_offset,
+ self.node_depth,
+ self.inner_hash_length,
+ self.last_node.yes(),
+ )
+ }
+}
+
+/// An incremental hasher for BLAKE2b.
+///
+/// To construct a `State` with non-default parameters, see `Params::to_state`.
+///
+/// # Example
+///
+/// ```
+/// use blake2b_simd::{State, blake2b};
+///
+/// let mut state = blake2b_simd::State::new();
+///
+/// state.update(b"foo");
+/// assert_eq!(blake2b(b"foo"), state.finalize());
+///
+/// state.update(b"bar");
+/// assert_eq!(blake2b(b"foobar"), state.finalize());
+/// ```
+#[derive(Clone)]
+pub struct State {
+ words: [Word; 8],
+ count: Count,
+ buf: [u8; BLOCKBYTES],
+ buflen: u8,
+ last_node: guts::LastNode,
+ hash_length: u8,
+ implementation: guts::Implementation,
+ is_keyed: bool,
+}
+
+impl State {
+ /// Equivalent to `State::default()` or `Params::default().to_state()`.
+ pub fn new() -> Self {
+ Self::with_params(&Params::default())
+ }
+
+ fn with_params(params: &Params) -> Self {
+ let mut state = Self {
+ words: params.to_words(),
+ count: 0,
+ buf: [0; BLOCKBYTES],
+ buflen: 0,
+ last_node: params.last_node,
+ hash_length: params.hash_length,
+ implementation: params.implementation,
+ is_keyed: params.key_length > 0,
+ };
+ if state.is_keyed {
+ state.buf = params.key_block;
+ state.buflen = state.buf.len() as u8;
+ }
+ state
+ }
+
+ fn fill_buf(&mut self, input: &mut &[u8]) {
+ let take = cmp::min(BLOCKBYTES - self.buflen as usize, input.len());
+ self.buf[self.buflen as usize..self.buflen as usize + take].copy_from_slice(&input[..take]);
+ self.buflen += take as u8;
+ *input = &input[take..];
+ }
+
+ // If the state already has some input in its buffer, try to fill the buffer and perform a
+ // compression. However, only do the compression if there's more input coming, otherwise it
+ // will give the wrong hash if the caller finalizes immediately after.
+ fn compress_buffer_if_possible(&mut self, input: &mut &[u8]) {
+ if self.buflen > 0 {
+ self.fill_buf(input);
+ if !input.is_empty() {
+ self.implementation.compress1_loop(
+ &self.buf,
+ &mut self.words,
+ self.count,
+ self.last_node,
+ guts::Finalize::No,
+ guts::Stride::Serial,
+ );
+ self.count = self.count.wrapping_add(BLOCKBYTES as Count);
+ self.buflen = 0;
+ }
+ }
+ }
+
+ /// Add input to the hash. You can call `update` any number of times.
+ pub fn update(&mut self, mut input: &[u8]) -> &mut Self {
+ // If we have a partial buffer, try to complete it.
+ self.compress_buffer_if_possible(&mut input);
+ // While there's more than a block of input left (which also means we cleared the buffer
+ // above), compress blocks directly without copying.
+ let mut end = input.len().saturating_sub(1);
+ end -= end % BLOCKBYTES;
+ if end > 0 {
+ self.implementation.compress1_loop(
+ &input[..end],
+ &mut self.words,
+ self.count,
+ self.last_node,
+ guts::Finalize::No,
+ guts::Stride::Serial,
+ );
+ self.count = self.count.wrapping_add(end as Count);
+ input = &input[end..];
+ }
+ // Buffer any remaining input, to be either compressed or finalized in a subsequent call.
+ // Note that this represents some copying overhead, which in theory we could avoid in
+ // the all-at-once setting. A function hardcoded for exactly BLOCKBYTES input bytes is about 10%
+ // faster than using this implementation for the same input.
+ self.fill_buf(&mut input);
+ self
+ }
+
+ /// Finalize the state and return a `Hash`. This method is idempotent, and calling it multiple
+ /// times will give the same result. It's also possible to `update` with more input in between.
+ pub fn finalize(&self) -> Hash {
+ let mut words_copy = self.words;
+ self.implementation.compress1_loop(
+ &self.buf[..self.buflen as usize],
+ &mut words_copy,
+ self.count,
+ self.last_node,
+ guts::Finalize::Yes,
+ guts::Stride::Serial,
+ );
+ Hash {
+ bytes: state_words_to_bytes(&words_copy),
+ len: self.hash_length,
+ }
+ }
+
+ /// Set a flag indicating that this is the last node of its level in a tree hash. This is
+ /// equivalent to [`Params::last_node`], except that it can be set at any time before calling
+ /// `finalize`. That allows callers to begin hashing a node without knowing ahead of time
+ /// whether it's the last in its level. For more details about the intended use of this flag,
+ /// see [the BLAKE2 spec].
+ ///
+ /// [`Params::last_node`]: struct.Params.html#method.last_node
+ /// [the BLAKE2 spec]: https://blake2.net/blake2.pdf
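+ ///
+ /// # Example
+ ///
+ /// A sketch of the equivalence with `Params::last_node` (the input is
+ /// illustrative):
+ ///
+ /// ```
+ /// # use blake2b_simd::Params;
+ /// let expected = Params::new().last_node(true).hash(b"foo");
+ /// let mut state = Params::new().to_state();
+ /// state.update(b"foo");
+ /// // The flag can be set after hashing has already begun.
+ /// state.set_last_node(true);
+ /// assert_eq!(expected, state.finalize());
+ /// ```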
+ pub fn set_last_node(&mut self, last_node: bool) -> &mut Self {
+ self.last_node = if last_node {
+ guts::LastNode::Yes
+ } else {
+ guts::LastNode::No
+ };
+ self
+ }
+
+ /// Return the total number of bytes input so far.
+ ///
+ /// Note that `count` doesn't include the bytes of the key block, if any.
+ /// It's exactly the total number of input bytes fed to `update`.
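+ ///
+ /// # Example
+ ///
+ /// A small sketch of the behavior described above (the key and input are
+ /// illustrative): the key block isn't counted, only the bytes fed to
+ /// `update`.
+ ///
+ /// ```
+ /// # use blake2b_simd::Params;
+ /// let mut state = Params::new().key(b"some key").to_state();
+ /// assert_eq!(0, state.count());
+ /// state.update(b"abc");
+ /// assert_eq!(3, state.count());
+ /// ```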
+ pub fn count(&self) -> Count {
+ let mut ret = self.count.wrapping_add(self.buflen as Count);
+ if self.is_keyed {
+ ret -= BLOCKBYTES as Count;
+ }
+ ret
+ }
+}
+
+#[inline(always)]
+fn state_words_to_bytes(state_words: &[Word; 8]) -> [u8; OUTBYTES] {
+ let mut bytes = [0; OUTBYTES];
+ {
+ const W: usize = size_of::<Word>();
+ let refs = mut_array_refs!(&mut bytes, W, W, W, W, W, W, W, W);
+ *refs.0 = state_words[0].to_le_bytes();
+ *refs.1 = state_words[1].to_le_bytes();
+ *refs.2 = state_words[2].to_le_bytes();
+ *refs.3 = state_words[3].to_le_bytes();
+ *refs.4 = state_words[4].to_le_bytes();
+ *refs.5 = state_words[5].to_le_bytes();
+ *refs.6 = state_words[6].to_le_bytes();
+ *refs.7 = state_words[7].to_le_bytes();
+ }
+ bytes
+}
+
+#[cfg(feature = "std")]
+impl std::io::Write for State {
+ fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+ self.update(buf);
+ Ok(buf.len())
+ }
+
+ fn flush(&mut self) -> std::io::Result<()> {
+ Ok(())
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ // NB: Don't print the words. Leaking them would allow length extension.
+ write!(
+ f,
+ "State {{ count: {}, hash_length: {}, last_node: {} }}",
+ self.count(),
+ self.hash_length,
+ self.last_node.yes(),
+ )
+ }
+}
+
+impl Default for State {
+ fn default() -> Self {
+ Self::with_params(&Params::default())
+ }
+}
+
+type HexString = arrayvec::ArrayString<[u8; 2 * OUTBYTES]>;
+
+/// A finalized BLAKE2 hash, with constant-time equality.
+#[derive(Clone, Copy)]
+pub struct Hash {
+ bytes: [u8; OUTBYTES],
+ len: u8,
+}
+
+impl Hash {
+ /// Convert the hash to a byte slice. Note that if you're using BLAKE2 as a MAC, you need
+ /// constant time equality, which `&[u8]` doesn't provide.
+ pub fn as_bytes(&self) -> &[u8] {
+ &self.bytes[..self.len as usize]
+ }
+
+ /// Convert the hash to a byte array. Note that if you're using BLAKE2 as a
+ /// MAC, you need constant time equality, which arrays don't provide. This
+ /// panics in debug mode if the length of the hash isn't `OUTBYTES`.
+ #[inline]
+ pub fn as_array(&self) -> &[u8; OUTBYTES] {
+ debug_assert_eq!(self.len as usize, OUTBYTES);
+ &self.bytes
+ }
+
+ /// Convert the hash to a lowercase hexadecimal
+ /// [`ArrayString`](https://docs.rs/arrayvec/0.4/arrayvec/struct.ArrayString.html).
+ pub fn to_hex(&self) -> HexString {
+ bytes_to_hex(self.as_bytes())
+ }
+}
+
+fn bytes_to_hex(bytes: &[u8]) -> HexString {
+ let mut s = arrayvec::ArrayString::new();
+ let table = b"0123456789abcdef";
+ for &b in bytes {
+ s.push(table[(b >> 4) as usize] as char);
+ s.push(table[(b & 0xf) as usize] as char);
+ }
+ s
+}
+
+/// This implementation is constant time, if the two hashes are the same length.
+impl PartialEq for Hash {
+ fn eq(&self, other: &Hash) -> bool {
+ constant_time_eq::constant_time_eq(&self.as_bytes(), &other.as_bytes())
+ }
+}
+
+/// This implementation is constant time, if the slice is the same length as the hash.
+impl PartialEq<[u8]> for Hash {
+ fn eq(&self, other: &[u8]) -> bool {
+ constant_time_eq::constant_time_eq(&self.as_bytes(), other)
+ }
+}
+
+impl Eq for Hash {}
+
+impl AsRef<[u8]> for Hash {
+ fn as_ref(&self) -> &[u8] {
+ self.as_bytes()
+ }
+}
+
+impl fmt::Debug for Hash {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "Hash(0x{})", self.to_hex())
+ }
+}
+
+// Paint a byte pattern that won't repeat, so that we don't accidentally miss
+// buffer offset bugs. This is the same as what Bao uses in its tests.
+#[cfg(test)]
+fn paint_test_input(buf: &mut [u8]) {
+ let mut offset = 0;
+ let mut counter: u32 = 1;
+ while offset < buf.len() {
+ let bytes = counter.to_le_bytes();
+ let take = cmp::min(bytes.len(), buf.len() - offset);
+ buf[offset..][..take].copy_from_slice(&bytes[..take]);
+ counter += 1;
+ offset += take;
+ }
+}
+
+// This module is pub for internal benchmarks only. Please don't use it.
+#[doc(hidden)]
+pub mod benchmarks {
+ use super::*;
+
+ pub fn force_portable(params: &mut Params) {
+ params.implementation = guts::Implementation::portable();
+ }
+
+ pub fn force_portable_blake2bp(params: &mut blake2bp::Params) {
+ blake2bp::force_portable(params);
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/many.rs b/third_party/rust/blake2b_simd/src/many.rs
new file mode 100644
index 0000000000..1588a3fb92
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/many.rs
@@ -0,0 +1,529 @@
+//! Interfaces for hashing multiple inputs at once, using SIMD more
+//! efficiently.
+//!
+//! The throughput of these interfaces is comparable to BLAKE2bp, about twice
+//! the throughput of regular BLAKE2b when AVX2 is available.
+//!
+//! These interfaces can accept any number of inputs, and the implementation
+//! does its best to parallelize them. In general, the more inputs you can pass
+//! in at once, the better. If you need to batch your inputs in smaller groups,
+//! see the [`degree`](fn.degree.html) function for a good batch size.
+//!
+//! The implementation keeps working in parallel even when inputs are of
+//! different lengths, by managing a working set of jobs whose input isn't yet
+//! exhausted. However, if one or two inputs are much longer than the others,
+//! and they're encountered only at the end, there might not be any remaining
+//! work to parallelize them with. In this case, sorting the inputs
+//! longest-first can improve parallelism.
+//!
+//! # Example
+//!
+//! ```
+//! use blake2b_simd::{blake2b, State, many::update_many};
+//!
+//! let mut states = [
+//! State::new(),
+//! State::new(),
+//! State::new(),
+//! State::new(),
+//! ];
+//!
+//! let inputs = [
+//! &b"foo"[..],
+//! &b"bar"[..],
+//! &b"baz"[..],
+//! &b"bing"[..],
+//! ];
+//!
+//! update_many(states.iter_mut().zip(inputs.iter()));
+//!
+//! for (state, input) in states.iter_mut().zip(inputs.iter()) {
+//! assert_eq!(blake2b(input), state.finalize());
+//! }
+//! ```
+
+use crate::guts::{self, Finalize, Implementation, Job, LastNode, Stride};
+use crate::state_words_to_bytes;
+use crate::Count;
+use crate::Hash;
+use crate::Params;
+use crate::State;
+use crate::Word;
+use crate::BLOCKBYTES;
+use arrayref::array_mut_ref;
+use arrayvec::ArrayVec;
+use core::fmt;
+
+/// The largest possible value of [`degree`](fn.degree.html) on the target
+/// platform.
+///
+/// Note that this constant reflects the parallelism degree supported by this
+/// crate, so it will change over time as support is added or removed. For
+/// example, when Rust stabilizes AVX-512 support and this crate adds an
+/// AVX-512 implementation, this constant will double on x86 targets. If that
+/// implementation is an optional feature (e.g. because it's nightly-only), the
+/// value of this constant will depend on that optional feature also.
+pub const MAX_DEGREE: usize = guts::MAX_DEGREE;
+
+/// The parallelism degree of the implementation, detected at runtime. If you
+/// hash your inputs in small batches, making the batch size a multiple of
+/// `degree` will generally give good performance.
+///
+/// For example, an x86 processor that supports AVX2 can compute four BLAKE2b
+/// hashes in parallel, so `degree` returns 4 on that machine. If you call
+/// [`hash_many`] with only three inputs, that's not enough to use the AVX2
+/// implementation, and your average throughput will be lower. Likewise if you
+/// call it with five inputs of equal length, the first four will be hashed in
+/// parallel with AVX2, but the last one will have to be hashed by itself, and
+/// again your average throughput will be lower.
+///
+/// As noted in the module level docs, performance is more complicated if your
+/// inputs are of different lengths. When parallelizing long and short inputs
+/// together, the longer ones will have bytes left over, and the implementation
+/// will try to parallelize those leftover bytes with subsequent inputs. The
+/// more inputs available in that case, the more the implementation will be
+/// able to parallelize.
+///
+/// If you need a constant batch size, for example to collect inputs in an
+/// array, see [`MAX_DEGREE`].
+///
+/// [`hash_many`]: fn.hash_many.html
+/// [`MAX_DEGREE`]: constant.MAX_DEGREE.html
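+///
+/// # Example
+///
+/// A minimal sketch of batching inputs in groups of `degree()` (the inputs
+/// here are illustrative):
+///
+/// ```
+/// use blake2b_simd::{Params, many::{degree, hash_many, HashManyJob}};
+///
+/// let params = Params::new();
+/// let inputs = [&b"foo"[..], &b"bar"[..], &b"baz"[..], &b"bing"[..], &b"boom"[..]];
+/// // Process the inputs in batches of degree() at a time.
+/// for batch in inputs.chunks(degree()) {
+///     let mut jobs: Vec<HashManyJob> = batch
+///         .iter()
+///         .map(|input| HashManyJob::new(&params, input))
+///         .collect();
+///     hash_many(jobs.iter_mut());
+///     for (job, input) in jobs.iter().zip(batch) {
+///         assert_eq!(params.hash(input), job.to_hash());
+///     }
+/// }
+/// ```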
+pub fn degree() -> usize {
+ guts::Implementation::detect().degree()
+}
+
+type JobsVec<'a, 'b> = ArrayVec<[Job<'a, 'b>; guts::MAX_DEGREE]>;
+
+#[inline(always)]
+fn fill_jobs_vec<'a, 'b>(
+ jobs_iter: &mut impl Iterator<Item = Job<'a, 'b>>,
+ vec: &mut JobsVec<'a, 'b>,
+ target_len: usize,
+) {
+ while vec.len() < target_len {
+ if let Some(job) = jobs_iter.next() {
+ vec.push(job);
+ } else {
+ break;
+ }
+ }
+}
+
+#[inline(always)]
+fn evict_finished<'a, 'b>(vec: &mut JobsVec<'a, 'b>, num_jobs: usize) {
+ // Iterate backwards so that removal doesn't cause an out-of-bounds panic.
+ for i in (0..num_jobs).rev() {
+ // Note that is_empty() is only valid because we know all these jobs
+ // have been run at least once. Otherwise we could confuse the empty
+ // input for a finished job, which would be incorrect.
+ //
+ // Avoid a panic branch here in release mode.
+ debug_assert!(vec.len() > i);
+ if vec.len() > i && vec[i].input.is_empty() {
+ // Note that calling pop_at() repeatedly has some overhead, because
+ // later elements need to be shifted up. However, the JobsVec is
+ // small, and this approach guarantees that jobs are encountered in
+ // order.
+ vec.pop_at(i);
+ }
+ }
+}
+
+pub(crate) fn compress_many<'a, 'b, I>(
+ jobs: I,
+ imp: Implementation,
+ finalize: Finalize,
+ stride: Stride,
+) where
+ I: IntoIterator<Item = Job<'a, 'b>>,
+{
+ // Fuse is important for correctness, since each of these blocks tries to
+ // advance the iterator, even if a previous block emptied it.
+ let mut jobs_iter = jobs.into_iter().fuse();
+ let mut jobs_vec = JobsVec::new();
+
+ if imp.degree() >= 4 {
+ loop {
+ fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 4);
+ if jobs_vec.len() < 4 {
+ break;
+ }
+ let jobs_array = array_mut_ref!(jobs_vec, 0, 4);
+ imp.compress4_loop(jobs_array, finalize, stride);
+ evict_finished(&mut jobs_vec, 4);
+ }
+ }
+
+ if imp.degree() >= 2 {
+ loop {
+ fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 2);
+ if jobs_vec.len() < 2 {
+ break;
+ }
+ let jobs_array = array_mut_ref!(jobs_vec, 0, 2);
+ imp.compress2_loop(jobs_array, finalize, stride);
+ evict_finished(&mut jobs_vec, 2);
+ }
+ }
+
+ for job in jobs_vec.into_iter().chain(jobs_iter) {
+ let Job {
+ input,
+ words,
+ count,
+ last_node,
+ } = job;
+ imp.compress1_loop(input, words, count, last_node, finalize, stride);
+ }
+}
+
+/// Update any number of `State` objects at once.
+///
+/// # Example
+///
+/// ```
+/// use blake2b_simd::{blake2b, State, many::update_many};
+///
+/// let mut states = [
+/// State::new(),
+/// State::new(),
+/// State::new(),
+/// State::new(),
+/// ];
+///
+/// let inputs = [
+/// &b"foo"[..],
+/// &b"bar"[..],
+/// &b"baz"[..],
+/// &b"bing"[..],
+/// ];
+///
+/// update_many(states.iter_mut().zip(inputs.iter()));
+///
+/// for (state, input) in states.iter_mut().zip(inputs.iter()) {
+/// assert_eq!(blake2b(input), state.finalize());
+/// }
+/// ```
+pub fn update_many<'a, 'b, I, T>(pairs: I)
+where
+ I: IntoIterator<Item = (&'a mut State, &'b T)>,
+ T: 'b + AsRef<[u8]> + ?Sized,
+{
+ // Get the guts::Implementation from the first state, if any.
+ let mut peekable_pairs = pairs.into_iter().peekable();
+ let implementation = if let Some((state, _)) = peekable_pairs.peek() {
+ state.implementation
+ } else {
+ // No work items, just short circuit.
+ return;
+ };
+
+ // Adapt the pairs iterator into a Jobs iterator, but skip over the Jobs
+ // where there's not actually any work to do (e.g. because there's not much
+ // input and it's all just going in the State buffer).
+ let jobs = peekable_pairs.flat_map(|(state, input_t)| {
+ let mut input = input_t.as_ref();
+ // For each pair, if the State has some input in its buffer, try to
+ // finish that buffer. If there wasn't enough input to do that --
+ // or if the input was empty to begin with -- skip this pair.
+ state.compress_buffer_if_possible(&mut input);
+ if input.is_empty() {
+ return None;
+ }
+ // Now we know the buffer is empty and there's more input. Make sure we
+ // buffer the final block, because update() doesn't finalize.
+ let mut last_block_start = input.len() - 1;
+ last_block_start -= last_block_start % BLOCKBYTES;
+ let (blocks, last_block) = input.split_at(last_block_start);
+ state.buf[..last_block.len()].copy_from_slice(last_block);
+ state.buflen = last_block.len() as u8;
+ // Finally, if the full blocks slice is non-empty, prepare that job for
+ // compression, and bump the State count.
+ if blocks.is_empty() {
+ None
+ } else {
+ let count = state.count;
+ state.count = state.count.wrapping_add(blocks.len() as Count);
+ Some(Job {
+ input: blocks,
+ words: &mut state.words,
+ count,
+ last_node: state.last_node,
+ })
+ }
+ });
+
+ // Run all the Jobs in the iterator.
+ compress_many(jobs, implementation, Finalize::No, Stride::Serial);
+}
+
+/// A job for the [`hash_many`] function. After calling [`hash_many`] on a
+/// collection of `HashManyJob` objects, you can call [`to_hash`] on each job
+/// to get the result.
+///
+/// [`hash_many`]: fn.hash_many.html
+/// [`to_hash`]: struct.HashManyJob.html#method.to_hash
+#[derive(Clone)]
+pub struct HashManyJob<'a> {
+ words: [Word; 8],
+ count: Count,
+ last_node: LastNode,
+ hash_length: u8,
+ input: &'a [u8],
+ finished: bool,
+ implementation: guts::Implementation,
+}
+
+impl<'a> HashManyJob<'a> {
+ /// Construct a new `HashManyJob` from a set of hashing parameters and an
+ /// input.
+ #[inline]
+ pub fn new(params: &Params, input: &'a [u8]) -> Self {
+ let mut words = params.to_words();
+ let mut count = 0;
+ let mut finished = false;
+ // If we have key bytes, compress them into the state words. If there's
+ // no additional input, this compression needs to finalize and set
+ // finished=true.
+ if params.key_length > 0 {
+ let mut finalization = Finalize::No;
+ if input.is_empty() {
+ finalization = Finalize::Yes;
+ finished = true;
+ }
+ params.implementation.compress1_loop(
+ &params.key_block,
+ &mut words,
+ 0,
+ params.last_node,
+ finalization,
+ Stride::Serial,
+ );
+ count = BLOCKBYTES as Count;
+ }
+ Self {
+ words,
+ count,
+ last_node: params.last_node,
+ hash_length: params.hash_length,
+ input,
+ finished,
+ implementation: params.implementation,
+ }
+ }
+
+ /// Get the hash from a finished job. If you call this before calling
+ /// [`hash_many`], it will panic in debug mode.
+ ///
+ /// [`hash_many`]: fn.hash_many.html
+ #[inline]
+ pub fn to_hash(&self) -> Hash {
+ debug_assert!(self.finished, "job hasn't been run yet");
+ Hash {
+ bytes: state_words_to_bytes(&self.words),
+ len: self.hash_length,
+ }
+ }
+}
+
+impl<'a> fmt::Debug for HashManyJob<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ // NB: Don't print the words. Leaking them would allow length extension.
+ write!(
+ f,
+ "HashManyJob {{ count: {}, hash_length: {}, last_node: {}, input_len: {} }}",
+ self.count,
+ self.hash_length,
+ self.last_node.yes(),
+ self.input.len(),
+ )
+ }
+}
+
+/// Hash any number of complete inputs all at once.
+///
+/// This is slightly more efficient than using `update_many` with `State`
+/// objects, because it doesn't need to do any buffering.
+///
+/// Running `hash_many` on the same `HashManyJob` object more than once has no
+/// effect.
+///
+/// # Example
+///
+/// ```
+/// use blake2b_simd::{blake2b, Params, many::{HashManyJob, hash_many}};
+///
+/// let inputs = [
+/// &b"foo"[..],
+/// &b"bar"[..],
+/// &b"baz"[..],
+/// &b"bing"[..],
+/// ];
+///
+/// let mut params = Params::new();
+/// params.hash_length(16);
+///
+/// let mut jobs = [
+/// HashManyJob::new(&params, inputs[0]),
+/// HashManyJob::new(&params, inputs[1]),
+/// HashManyJob::new(&params, inputs[2]),
+/// HashManyJob::new(&params, inputs[3]),
+/// ];
+///
+/// hash_many(jobs.iter_mut());
+///
+/// for (input, job) in inputs.iter().zip(jobs.iter()) {
+/// let expected = params.hash(input);
+/// assert_eq!(expected, job.to_hash());
+/// }
+/// ```
+pub fn hash_many<'a, 'b, I>(hash_many_jobs: I)
+where
+ 'b: 'a,
+ I: IntoIterator<Item = &'a mut HashManyJob<'b>>,
+{
+ // Get the guts::Implementation from the first job, if any.
+ let mut peekable_jobs = hash_many_jobs.into_iter().peekable();
+ let implementation = if let Some(job) = peekable_jobs.peek() {
+ job.implementation
+ } else {
+ // No work items, just short circuit.
+ return;
+ };
+
+ // In the jobs iterator, skip HashManyJobs that have already been run. This
+ // is less because we actually expect callers to call hash_many twice
+ // (though they're allowed to if they want), and more because
+ // HashManyJob::new might need to finalize if there are key bytes but no
+ // input. Tying the job lifetime to the Params reference is an alternative,
+ // but I've found it too constraining in practice. We could also put key
+ // bytes in every HashManyJob, but that would add unnecessary storage and
+ // zeroing for all callers.
+ let unfinished_jobs = peekable_jobs.into_iter().filter(|j| !j.finished);
+ let jobs = unfinished_jobs.map(|j| {
+ j.finished = true;
+ Job {
+ input: j.input,
+ words: &mut j.words,
+ count: j.count,
+ last_node: j.last_node,
+ }
+ });
+ compress_many(jobs, implementation, Finalize::Yes, Stride::Serial);
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+ use crate::guts;
+ use crate::paint_test_input;
+ use crate::BLOCKBYTES;
+ use arrayvec::ArrayVec;
+
+ #[test]
+ fn test_degree() {
+ assert!(degree() <= MAX_DEGREE);
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("avx2") {
+ assert!(degree() >= 4);
+ }
+ if is_x86_feature_detected!("sse4.1") {
+ assert!(degree() >= 2);
+ }
+ }
+ }
+
+ #[test]
+ fn test_hash_many() {
+ // Use a length of inputs that will exercise all of the power-of-two loops.
+ const LEN: usize = 2 * guts::MAX_DEGREE - 1;
+
+ // Rerun LEN inputs LEN different times, with the empty input starting in a
+ // different spot each time.
+ let mut input = [0; LEN * BLOCKBYTES];
+ paint_test_input(&mut input);
+ for start_offset in 0..LEN {
+ let mut inputs: [&[u8]; LEN] = [&[]; LEN];
+ for i in 0..LEN {
+ let chunks = (i + start_offset) % LEN;
+ inputs[i] = &input[..chunks * BLOCKBYTES];
+ }
+
+ let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new();
+ for i in 0..LEN {
+ let mut p = Params::new();
+ p.node_offset(i as u64);
+ if i % 2 == 1 {
+ p.last_node(true);
+ p.key(b"foo");
+ }
+ params.push(p);
+ }
+
+ let mut jobs: ArrayVec<[HashManyJob; LEN]> = ArrayVec::new();
+ for i in 0..LEN {
+ jobs.push(HashManyJob::new(&params[i], inputs[i]));
+ }
+
+ hash_many(&mut jobs);
+
+ // Check the outputs.
+ for i in 0..LEN {
+ let expected = params[i].hash(inputs[i]);
+ assert_eq!(expected, jobs[i].to_hash());
+ }
+ }
+ }
+
+ #[test]
+ fn test_update_many() {
+ // Use a length of inputs that will exercise all of the power-of-two loops.
+ const LEN: usize = 2 * guts::MAX_DEGREE - 1;
+
+ // Rerun LEN inputs LEN different times, with the empty input starting in a
+ // different spot each time.
+ let mut input = [0; LEN * BLOCKBYTES];
+ paint_test_input(&mut input);
+ for start_offset in 0..LEN {
+ let mut inputs: [&[u8]; LEN] = [&[]; LEN];
+ for i in 0..LEN {
+ let chunks = (i + start_offset) % LEN;
+ inputs[i] = &input[..chunks * BLOCKBYTES];
+ }
+
+ let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new();
+ for i in 0..LEN {
+ let mut p = Params::new();
+ p.node_offset(i as u64);
+ if i % 2 == 1 {
+ p.last_node(true);
+ p.key(b"foo");
+ }
+ params.push(p);
+ }
+
+ let mut states: ArrayVec<[State; LEN]> = ArrayVec::new();
+ for i in 0..LEN {
+ states.push(params[i].to_state());
+ }
+
+ // Run each input twice through, to exercise buffering.
+ update_many(states.iter_mut().zip(inputs.iter()));
+ update_many(states.iter_mut().zip(inputs.iter()));
+
+ // Check the outputs.
+ for i in 0..LEN {
+ let mut reference_state = params[i].to_state();
+ // Again, run the input twice.
+ reference_state.update(inputs[i]);
+ reference_state.update(inputs[i]);
+ assert_eq!(reference_state.finalize(), states[i].finalize());
+ assert_eq!(2 * inputs[i].len() as Count, states[i].count());
+ }
+ }
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/portable.rs b/third_party/rust/blake2b_simd/src/portable.rs
new file mode 100644
index 0000000000..5a667ef2a8
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/portable.rs
@@ -0,0 +1,166 @@
+use arrayref::{array_ref, array_refs};
+
+use super::*;
+use crate::guts::{
+ count_high, count_low, final_block, flag_word, input_debug_asserts, Finalize, LastNode, Stride,
+};
+
+// G is the mixing function, called eight times per round in the compression
+// function. V is the 16-word state vector of the compression function, usually
+// described as a 4x4 matrix. A, B, C, and D are the mixing indices, set by the
+// caller first to the four columns of V, and then to its four diagonals. X and
+// Y are words of input, chosen by the caller according to the message
+// schedule, SIGMA.
+#[inline(always)]
+fn g(v: &mut [Word; 16], a: usize, b: usize, c: usize, d: usize, x: Word, y: Word) {
+ v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
+ v[d] = (v[d] ^ v[a]).rotate_right(32);
+ v[c] = v[c].wrapping_add(v[d]);
+ v[b] = (v[b] ^ v[c]).rotate_right(24);
+ v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
+ v[d] = (v[d] ^ v[a]).rotate_right(16);
+ v[c] = v[c].wrapping_add(v[d]);
+ v[b] = (v[b] ^ v[c]).rotate_right(63);
+}
+
+#[inline(always)]
+fn round(r: usize, m: &[Word; 16], v: &mut [Word; 16]) {
+ // Select the message schedule based on the round.
+ let s = SIGMA[r];
+
+ // Mix the columns.
+ g(v, 0, 4, 8, 12, m[s[0] as usize], m[s[1] as usize]);
+ g(v, 1, 5, 9, 13, m[s[2] as usize], m[s[3] as usize]);
+ g(v, 2, 6, 10, 14, m[s[4] as usize], m[s[5] as usize]);
+ g(v, 3, 7, 11, 15, m[s[6] as usize], m[s[7] as usize]);
+
+ // Mix the rows.
+ g(v, 0, 5, 10, 15, m[s[8] as usize], m[s[9] as usize]);
+ g(v, 1, 6, 11, 12, m[s[10] as usize], m[s[11] as usize]);
+ g(v, 2, 7, 8, 13, m[s[12] as usize], m[s[13] as usize]);
+ g(v, 3, 4, 9, 14, m[s[14] as usize], m[s[15] as usize]);
+}
+
+#[inline(always)]
+fn compress_block(
+ block: &[u8; BLOCKBYTES],
+ words: &mut [Word; 8],
+ count: Count,
+ last_block: Word,
+ last_node: Word,
+) {
+ // Initialize the compression state.
+ let mut v = [
+ words[0],
+ words[1],
+ words[2],
+ words[3],
+ words[4],
+ words[5],
+ words[6],
+ words[7],
+ IV[0],
+ IV[1],
+ IV[2],
+ IV[3],
+ IV[4] ^ count_low(count),
+ IV[5] ^ count_high(count),
+ IV[6] ^ last_block,
+ IV[7] ^ last_node,
+ ];
+
+ // Parse the message bytes as ints in little endian order.
+ const W: usize = size_of::<Word>();
+ let msg_refs = array_refs!(block, W, W, W, W, W, W, W, W, W, W, W, W, W, W, W, W);
+ let m = [
+ Word::from_le_bytes(*msg_refs.0),
+ Word::from_le_bytes(*msg_refs.1),
+ Word::from_le_bytes(*msg_refs.2),
+ Word::from_le_bytes(*msg_refs.3),
+ Word::from_le_bytes(*msg_refs.4),
+ Word::from_le_bytes(*msg_refs.5),
+ Word::from_le_bytes(*msg_refs.6),
+ Word::from_le_bytes(*msg_refs.7),
+ Word::from_le_bytes(*msg_refs.8),
+ Word::from_le_bytes(*msg_refs.9),
+ Word::from_le_bytes(*msg_refs.10),
+ Word::from_le_bytes(*msg_refs.11),
+ Word::from_le_bytes(*msg_refs.12),
+ Word::from_le_bytes(*msg_refs.13),
+ Word::from_le_bytes(*msg_refs.14),
+ Word::from_le_bytes(*msg_refs.15),
+ ];
+
+ round(0, &m, &mut v);
+ round(1, &m, &mut v);
+ round(2, &m, &mut v);
+ round(3, &m, &mut v);
+ round(4, &m, &mut v);
+ round(5, &m, &mut v);
+ round(6, &m, &mut v);
+ round(7, &m, &mut v);
+ round(8, &m, &mut v);
+ round(9, &m, &mut v);
+ round(10, &m, &mut v);
+ round(11, &m, &mut v);
+
+ words[0] ^= v[0] ^ v[8];
+ words[1] ^= v[1] ^ v[9];
+ words[2] ^= v[2] ^ v[10];
+ words[3] ^= v[3] ^ v[11];
+ words[4] ^= v[4] ^ v[12];
+ words[5] ^= v[5] ^ v[13];
+ words[6] ^= v[6] ^ v[14];
+ words[7] ^= v[7] ^ v[15];
+}
+
+pub fn compress1_loop(
+ input: &[u8],
+ words: &mut [Word; 8],
+ mut count: Count,
+ last_node: LastNode,
+ finalize: Finalize,
+ stride: Stride,
+) {
+ input_debug_asserts(input, finalize);
+
+ let mut local_words = *words;
+
+ let mut fin_offset = input.len().saturating_sub(1);
+ fin_offset -= fin_offset % stride.padded_blockbytes();
+ let mut buf = [0; BLOCKBYTES];
+ let (fin_block, fin_len, _) = final_block(input, fin_offset, &mut buf, stride);
+ let fin_last_block = flag_word(finalize.yes());
+ let fin_last_node = flag_word(finalize.yes() && last_node.yes());
+
+ let mut offset = 0;
+ loop {
+ let block;
+ let count_delta;
+ let last_block;
+ let last_node;
+ if offset == fin_offset {
+ block = fin_block;
+ count_delta = fin_len;
+ last_block = fin_last_block;
+ last_node = fin_last_node;
+ } else {
+ block = array_ref!(input, offset, BLOCKBYTES);
+ count_delta = BLOCKBYTES;
+ last_block = flag_word(false);
+ last_node = flag_word(false);
+ };
+
+ count = count.wrapping_add(count_delta as Count);
+ compress_block(block, &mut local_words, count, last_block, last_node);
+
+ // Check for termination before bumping the offset, to avoid overflow.
+ if offset == fin_offset {
+ break;
+ }
+
+ offset += stride.padded_blockbytes();
+ }
+
+ *words = local_words;
+}
diff --git a/third_party/rust/blake2b_simd/src/sse41.rs b/third_party/rust/blake2b_simd/src/sse41.rs
new file mode 100644
index 0000000000..3a55dc8c35
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/sse41.rs
@@ -0,0 +1,454 @@
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+use crate::guts::{
+ assemble_count, count_high, count_low, final_block, flag_word, input_debug_asserts, Finalize,
+ Job, Stride,
+};
+use crate::{Word, BLOCKBYTES, IV, SIGMA};
+use arrayref::{array_refs, mut_array_refs};
+use core::cmp;
+use core::mem;
+
+pub const DEGREE: usize = 2;
+
+#[inline(always)]
+unsafe fn loadu(src: *const [Word; DEGREE]) -> __m128i {
+ // This is an unaligned load, so the pointer cast is allowed.
+ _mm_loadu_si128(src as *const __m128i)
+}
+
+#[inline(always)]
+unsafe fn storeu(src: __m128i, dest: *mut [Word; DEGREE]) {
+ // This is an unaligned store, so the pointer cast is allowed.
+ _mm_storeu_si128(dest as *mut __m128i, src)
+}
+
+#[inline(always)]
+unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
+ _mm_add_epi64(a, b)
+}
+
+#[inline(always)]
+unsafe fn eq(a: __m128i, b: __m128i) -> __m128i {
+ _mm_cmpeq_epi64(a, b)
+}
+
+#[inline(always)]
+unsafe fn and(a: __m128i, b: __m128i) -> __m128i {
+ _mm_and_si128(a, b)
+}
+
+#[inline(always)]
+unsafe fn negate_and(a: __m128i, b: __m128i) -> __m128i {
+ // Note that "and not" implies the reverse of the actual arg order.
+ _mm_andnot_si128(a, b)
+}
+
+#[inline(always)]
+unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
+ _mm_xor_si128(a, b)
+}
+
+#[inline(always)]
+unsafe fn set1(x: u64) -> __m128i {
+ _mm_set1_epi64x(x as i64)
+}
+
+#[inline(always)]
+unsafe fn set2(a: u64, b: u64) -> __m128i {
+ // There's no _mm_setr_epi64x, so note the arg order is backwards.
+ _mm_set_epi64x(b as i64, a as i64)
+}
+
+// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
+macro_rules! _MM_SHUFFLE {
+ ($z:expr, $y:expr, $x:expr, $w:expr) => {
+ ($z << 6) | ($y << 4) | ($x << 2) | $w
+ };
+}
+
+#[inline(always)]
+unsafe fn rot32(x: __m128i) -> __m128i {
+ _mm_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1))
+}
+
+#[inline(always)]
+unsafe fn rot24(x: __m128i) -> __m128i {
+ let rotate24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
+ _mm_shuffle_epi8(x, rotate24)
+}
+
+#[inline(always)]
+unsafe fn rot16(x: __m128i) -> __m128i {
+ let rotate16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
+ _mm_shuffle_epi8(x, rotate16)
+}
+
+#[inline(always)]
+unsafe fn rot63(x: __m128i) -> __m128i {
+ _mm_or_si128(_mm_srli_epi64(x, 63), add(x, x))
+}
+
+#[inline(always)]
+unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
+ v[0] = add(v[0], m[SIGMA[r][0] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][2] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][4] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][6] as usize]);
+ v[0] = add(v[0], v[4]);
+ v[1] = add(v[1], v[5]);
+ v[2] = add(v[2], v[6]);
+ v[3] = add(v[3], v[7]);
+ v[12] = xor(v[12], v[0]);
+ v[13] = xor(v[13], v[1]);
+ v[14] = xor(v[14], v[2]);
+ v[15] = xor(v[15], v[3]);
+ v[12] = rot32(v[12]);
+ v[13] = rot32(v[13]);
+ v[14] = rot32(v[14]);
+ v[15] = rot32(v[15]);
+ v[8] = add(v[8], v[12]);
+ v[9] = add(v[9], v[13]);
+ v[10] = add(v[10], v[14]);
+ v[11] = add(v[11], v[15]);
+ v[4] = xor(v[4], v[8]);
+ v[5] = xor(v[5], v[9]);
+ v[6] = xor(v[6], v[10]);
+ v[7] = xor(v[7], v[11]);
+ v[4] = rot24(v[4]);
+ v[5] = rot24(v[5]);
+ v[6] = rot24(v[6]);
+ v[7] = rot24(v[7]);
+ v[0] = add(v[0], m[SIGMA[r][1] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][3] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][5] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][7] as usize]);
+ v[0] = add(v[0], v[4]);
+ v[1] = add(v[1], v[5]);
+ v[2] = add(v[2], v[6]);
+ v[3] = add(v[3], v[7]);
+ v[12] = xor(v[12], v[0]);
+ v[13] = xor(v[13], v[1]);
+ v[14] = xor(v[14], v[2]);
+ v[15] = xor(v[15], v[3]);
+ v[12] = rot16(v[12]);
+ v[13] = rot16(v[13]);
+ v[14] = rot16(v[14]);
+ v[15] = rot16(v[15]);
+ v[8] = add(v[8], v[12]);
+ v[9] = add(v[9], v[13]);
+ v[10] = add(v[10], v[14]);
+ v[11] = add(v[11], v[15]);
+ v[4] = xor(v[4], v[8]);
+ v[5] = xor(v[5], v[9]);
+ v[6] = xor(v[6], v[10]);
+ v[7] = xor(v[7], v[11]);
+ v[4] = rot63(v[4]);
+ v[5] = rot63(v[5]);
+ v[6] = rot63(v[6]);
+ v[7] = rot63(v[7]);
+
+ v[0] = add(v[0], m[SIGMA[r][8] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][10] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][12] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][14] as usize]);
+ v[0] = add(v[0], v[5]);
+ v[1] = add(v[1], v[6]);
+ v[2] = add(v[2], v[7]);
+ v[3] = add(v[3], v[4]);
+ v[15] = xor(v[15], v[0]);
+ v[12] = xor(v[12], v[1]);
+ v[13] = xor(v[13], v[2]);
+ v[14] = xor(v[14], v[3]);
+ v[15] = rot32(v[15]);
+ v[12] = rot32(v[12]);
+ v[13] = rot32(v[13]);
+ v[14] = rot32(v[14]);
+ v[10] = add(v[10], v[15]);
+ v[11] = add(v[11], v[12]);
+ v[8] = add(v[8], v[13]);
+ v[9] = add(v[9], v[14]);
+ v[5] = xor(v[5], v[10]);
+ v[6] = xor(v[6], v[11]);
+ v[7] = xor(v[7], v[8]);
+ v[4] = xor(v[4], v[9]);
+ v[5] = rot24(v[5]);
+ v[6] = rot24(v[6]);
+ v[7] = rot24(v[7]);
+ v[4] = rot24(v[4]);
+ v[0] = add(v[0], m[SIGMA[r][9] as usize]);
+ v[1] = add(v[1], m[SIGMA[r][11] as usize]);
+ v[2] = add(v[2], m[SIGMA[r][13] as usize]);
+ v[3] = add(v[3], m[SIGMA[r][15] as usize]);
+ v[0] = add(v[0], v[5]);
+ v[1] = add(v[1], v[6]);
+ v[2] = add(v[2], v[7]);
+ v[3] = add(v[3], v[4]);
+ v[15] = xor(v[15], v[0]);
+ v[12] = xor(v[12], v[1]);
+ v[13] = xor(v[13], v[2]);
+ v[14] = xor(v[14], v[3]);
+ v[15] = rot16(v[15]);
+ v[12] = rot16(v[12]);
+ v[13] = rot16(v[13]);
+ v[14] = rot16(v[14]);
+ v[10] = add(v[10], v[15]);
+ v[11] = add(v[11], v[12]);
+ v[8] = add(v[8], v[13]);
+ v[9] = add(v[9], v[14]);
+ v[5] = xor(v[5], v[10]);
+ v[6] = xor(v[6], v[11]);
+ v[7] = xor(v[7], v[8]);
+ v[4] = xor(v[4], v[9]);
+ v[5] = rot63(v[5]);
+ v[6] = rot63(v[6]);
+ v[7] = rot63(v[7]);
+ v[4] = rot63(v[4]);
+}
+
+// We'd rather make this a regular function with #[inline(always)], but for
+// some reason that blows up compile times by about 10 seconds, at least in
+// some cases (BLAKE2b avx2.rs). This macro seems to get the same performance
+// result, without the compile time issue.
+macro_rules! compress2_transposed {
+ (
+ $h_vecs:expr,
+ $msg_vecs:expr,
+ $count_low:expr,
+ $count_high:expr,
+ $lastblock:expr,
+ $lastnode:expr,
+ ) => {
+ let h_vecs: &mut [__m128i; 8] = $h_vecs;
+ let msg_vecs: &[__m128i; 16] = $msg_vecs;
+ let count_low: __m128i = $count_low;
+ let count_high: __m128i = $count_high;
+ let lastblock: __m128i = $lastblock;
+ let lastnode: __m128i = $lastnode;
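+        // Build the 16 working vectors: the first eight are the transposed
+        // state words, and the last eight are the IV constants, with the
+        // counter and finalization flags XOR'd into the final four.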
+ let mut v = [
+ h_vecs[0],
+ h_vecs[1],
+ h_vecs[2],
+ h_vecs[3],
+ h_vecs[4],
+ h_vecs[5],
+ h_vecs[6],
+ h_vecs[7],
+ set1(IV[0]),
+ set1(IV[1]),
+ set1(IV[2]),
+ set1(IV[3]),
+ xor(set1(IV[4]), count_low),
+ xor(set1(IV[5]), count_high),
+ xor(set1(IV[6]), lastblock),
+ xor(set1(IV[7]), lastnode),
+ ];
+
+ round(&mut v, &msg_vecs, 0);
+ round(&mut v, &msg_vecs, 1);
+ round(&mut v, &msg_vecs, 2);
+ round(&mut v, &msg_vecs, 3);
+ round(&mut v, &msg_vecs, 4);
+ round(&mut v, &msg_vecs, 5);
+ round(&mut v, &msg_vecs, 6);
+ round(&mut v, &msg_vecs, 7);
+ round(&mut v, &msg_vecs, 8);
+ round(&mut v, &msg_vecs, 9);
+ round(&mut v, &msg_vecs, 10);
+ round(&mut v, &msg_vecs, 11);
+
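+        // Feed forward: XOR both halves of the working state back into the
+        // chaining values.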
+ h_vecs[0] = xor(xor(h_vecs[0], v[0]), v[8]);
+ h_vecs[1] = xor(xor(h_vecs[1], v[1]), v[9]);
+ h_vecs[2] = xor(xor(h_vecs[2], v[2]), v[10]);
+ h_vecs[3] = xor(xor(h_vecs[3], v[3]), v[11]);
+ h_vecs[4] = xor(xor(h_vecs[4], v[4]), v[12]);
+ h_vecs[5] = xor(xor(h_vecs[5], v[5]), v[13]);
+ h_vecs[6] = xor(xor(h_vecs[6], v[6]), v[14]);
+ h_vecs[7] = xor(xor(h_vecs[7], v[7]), v[15]);
+ };
+}
+
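+// Interleave two 2-word vectors: ([a0, a1], [b0, b1]) becomes
+// ([a0, b0], [a1, b1]). With DEGREE = 2 this is a full transpose.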
+#[inline(always)]
+unsafe fn transpose_vecs(a: __m128i, b: __m128i) -> [__m128i; DEGREE] {
+ let a_words: [Word; DEGREE] = mem::transmute(a);
+ let b_words: [Word; DEGREE] = mem::transmute(b);
+ [set2(a_words[0], b_words[0]), set2(a_words[1], b_words[1])]
+}
+
+#[inline(always)]
+unsafe fn transpose_state_vecs(jobs: &[Job; DEGREE]) -> [__m128i; 8] {
+ // Load all the state words into transposed vectors, where the first vector
+ // has the first word of each state, etc. Transposing once at the beginning
+    // and once at the end is more efficient than repeating it for each block.
+ let words0 = array_refs!(&jobs[0].words, DEGREE, DEGREE, DEGREE, DEGREE);
+ let words1 = array_refs!(&jobs[1].words, DEGREE, DEGREE, DEGREE, DEGREE);
+ let [h0, h1] = transpose_vecs(loadu(words0.0), loadu(words1.0));
+ let [h2, h3] = transpose_vecs(loadu(words0.1), loadu(words1.1));
+ let [h4, h5] = transpose_vecs(loadu(words0.2), loadu(words1.2));
+ let [h6, h7] = transpose_vecs(loadu(words0.3), loadu(words1.3));
+ [h0, h1, h2, h3, h4, h5, h6, h7]
+}
+
+#[inline(always)]
+unsafe fn untranspose_state_vecs(h_vecs: &[__m128i; 8], jobs: &mut [Job; DEGREE]) {
+ // Un-transpose the updated state vectors back into the caller's arrays.
+ let [job0, job1] = jobs;
+ let words0 = mut_array_refs!(&mut job0.words, DEGREE, DEGREE, DEGREE, DEGREE);
+ let words1 = mut_array_refs!(&mut job1.words, DEGREE, DEGREE, DEGREE, DEGREE);
+
+ let out = transpose_vecs(h_vecs[0], h_vecs[1]);
+ storeu(out[0], words0.0);
+ storeu(out[1], words1.0);
+ let out = transpose_vecs(h_vecs[2], h_vecs[3]);
+ storeu(out[0], words0.1);
+ storeu(out[1], words1.1);
+ let out = transpose_vecs(h_vecs[4], h_vecs[5]);
+ storeu(out[0], words0.2);
+ storeu(out[1], words1.2);
+ let out = transpose_vecs(h_vecs[6], h_vecs[7]);
+ storeu(out[0], words0.3);
+ storeu(out[1], words1.3);
+}
+
+#[inline(always)]
+unsafe fn transpose_msg_vecs(blocks: [*const [u8; BLOCKBYTES]; DEGREE]) -> [__m128i; 16] {
+ // These input arrays have no particular alignment, so we use unaligned
+ // loads to read from them.
+ let block0 = blocks[0] as *const [Word; DEGREE];
+ let block1 = blocks[1] as *const [Word; DEGREE];
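+    // Each block is viewed as eight pairs of words; transposing the i-th pair
+    // from each input yields message vectors m[2i] and m[2i+1].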
+ let [m0, m1] = transpose_vecs(loadu(block0.add(0)), loadu(block1.add(0)));
+ let [m2, m3] = transpose_vecs(loadu(block0.add(1)), loadu(block1.add(1)));
+ let [m4, m5] = transpose_vecs(loadu(block0.add(2)), loadu(block1.add(2)));
+ let [m6, m7] = transpose_vecs(loadu(block0.add(3)), loadu(block1.add(3)));
+ let [m8, m9] = transpose_vecs(loadu(block0.add(4)), loadu(block1.add(4)));
+ let [m10, m11] = transpose_vecs(loadu(block0.add(5)), loadu(block1.add(5)));
+ let [m12, m13] = transpose_vecs(loadu(block0.add(6)), loadu(block1.add(6)));
+ let [m14, m15] = transpose_vecs(loadu(block0.add(7)), loadu(block1.add(7)));
+ [
+ m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15,
+ ]
+}
+
+#[inline(always)]
+unsafe fn load_counts(jobs: &[Job; DEGREE]) -> (__m128i, __m128i) {
+ (
+ set2(count_low(jobs[0].count), count_low(jobs[1].count)),
+ set2(count_high(jobs[0].count), count_high(jobs[1].count)),
+ )
+}
+
+#[inline(always)]
+unsafe fn store_counts(jobs: &mut [Job; DEGREE], low: __m128i, high: __m128i) {
+ let low_ints: [Word; DEGREE] = mem::transmute(low);
+ let high_ints: [Word; DEGREE] = mem::transmute(high);
+ for i in 0..DEGREE {
+ jobs[i].count = assemble_count(low_ints[i], high_ints[i]);
+ }
+}
+
+#[inline(always)]
+unsafe fn add_to_counts(lo: &mut __m128i, hi: &mut __m128i, delta: __m128i) {
+ // If the low counts reach zero, that means they wrapped, unless the delta
+ // was also zero.
+ *lo = add(*lo, delta);
+ let lo_reached_zero = eq(*lo, set1(0));
+ let delta_was_zero = eq(delta, set1(0));
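+    // A lane carries into the high count only if its low count wrapped and
+    // the delta in that lane was nonzero.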
+ let hi_inc = and(set1(1), negate_and(delta_was_zero, lo_reached_zero));
+ *hi = add(*hi, hi_inc);
+}
+
+#[inline(always)]
+unsafe fn flags_vec(flags: [bool; DEGREE]) -> __m128i {
+ set2(flag_word(flags[0]), flag_word(flags[1]))
+}
+
+#[target_feature(enable = "sse4.1")]
+pub unsafe fn compress2_loop(jobs: &mut [Job; DEGREE], finalize: Finalize, stride: Stride) {
+ // If we're not finalizing, there can't be a partial block at the end.
+ for job in jobs.iter() {
+ input_debug_asserts(job.input, finalize);
+ }
+
+ let msg_ptrs = [jobs[0].input.as_ptr(), jobs[1].input.as_ptr()];
+ let mut h_vecs = transpose_state_vecs(&jobs);
+ let (mut counts_lo, mut counts_hi) = load_counts(&jobs);
+
+    // Prepare the final blocks (note that they could be empty, if the input
+    // itself is empty). Do all this before entering the main loop.
+ let min_len = jobs.iter().map(|job| job.input.len()).min().unwrap();
+ let mut fin_offset = min_len.saturating_sub(1);
+ fin_offset -= fin_offset % stride.padded_blockbytes();
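+    // That is, fin_offset is the stride-aligned start of the last (possibly
+    // partial) block of the shortest input.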
+    // Performance note: making these buffers mem::uninitialized() seems to
+    // cause problems in the optimizer.
+ let mut buf0: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let mut buf1: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
+ let (block0, len0, finalize0) = final_block(jobs[0].input, fin_offset, &mut buf0, stride);
+ let (block1, len1, finalize1) = final_block(jobs[1].input, fin_offset, &mut buf1, stride);
+ let fin_blocks: [*const [u8; BLOCKBYTES]; DEGREE] = [block0, block1];
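+    // The final blocks advance the counts by their real lengths, which may be
+    // less than a full BLOCKBYTES.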
+ let fin_counts_delta = set2(len0 as Word, len1 as Word);
+ let fin_last_block;
+ let fin_last_node;
+ if finalize.yes() {
+ fin_last_block = flags_vec([finalize0, finalize1]);
+ fin_last_node = flags_vec([
+ finalize0 && jobs[0].last_node.yes(),
+ finalize1 && jobs[1].last_node.yes(),
+ ]);
+ } else {
+ fin_last_block = set1(0);
+ fin_last_node = set1(0);
+ }
+
+ // The main loop.
+ let mut offset = 0;
+ loop {
+ let blocks;
+ let counts_delta;
+ let last_block;
+ let last_node;
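+        // At the final offset, use the prepared (possibly padded) last blocks
+        // and their finalization flags; otherwise read a full block directly
+        // from each input.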
+ if offset == fin_offset {
+ blocks = fin_blocks;
+ counts_delta = fin_counts_delta;
+ last_block = fin_last_block;
+ last_node = fin_last_node;
+ } else {
+ blocks = [
+ msg_ptrs[0].add(offset) as *const [u8; BLOCKBYTES],
+ msg_ptrs[1].add(offset) as *const [u8; BLOCKBYTES],
+ ];
+ counts_delta = set1(BLOCKBYTES as Word);
+ last_block = set1(0);
+ last_node = set1(0);
+ };
+
+ let m_vecs = transpose_msg_vecs(blocks);
+ add_to_counts(&mut counts_lo, &mut counts_hi, counts_delta);
+ compress2_transposed!(
+ &mut h_vecs,
+ &m_vecs,
+ counts_lo,
+ counts_hi,
+ last_block,
+ last_node,
+ );
+
+ // Check for termination before bumping the offset, to avoid overflow.
+ if offset == fin_offset {
+ break;
+ }
+
+ offset += stride.padded_blockbytes();
+ }
+
+ // Write out the results.
+ untranspose_state_vecs(&h_vecs, &mut *jobs);
+ store_counts(&mut *jobs, counts_lo, counts_hi);
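+    // The loop stopped at fin_offset, so each job consumed at most one padded
+    // block past that offset; cap by the job's actual input length.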
+ let max_consumed = offset.saturating_add(stride.padded_blockbytes());
+ for job in jobs.iter_mut() {
+ let consumed = cmp::min(max_consumed, job.input.len());
+ job.input = &job.input[consumed..];
+ }
+}
diff --git a/third_party/rust/blake2b_simd/src/test.rs b/third_party/rust/blake2b_simd/src/test.rs
new file mode 100644
index 0000000000..9ca8e87751
--- /dev/null
+++ b/third_party/rust/blake2b_simd/src/test.rs
@@ -0,0 +1,201 @@
+use super::*;
+
+const EMPTY_HASH: &str = "786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419\
+ d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce";
+const ABC_HASH: &str = "ba80a53f981c4d0d6a2797b69f12f6e94c212f14685ac4b74b12bb6fdbffa2d1\
+ 7d87c5392aab792dc252d5de4533cc9518d38aa8dbf1925ab92386edd4009923";
+const ONE_BLOCK_HASH: &str = "865939e120e6805438478841afb739ae4250cf372653078a065cdcfffca4caf7\
+ 98e6d462b65d658fc165782640eded70963449ae1500fb0f24981d7727e22c41";
+const THOUSAND_HASH: &str = "1ee4e51ecab5210a518f26150e882627ec839967f19d763e1508b12cfefed148\
+ 58f6a1c9d1f969bc224dc9440f5a6955277e755b9c513f9ba4421c5e50c8d787";
+
+#[test]
+fn test_update_state() {
+ let io = &[
+ (&b""[..], EMPTY_HASH),
+ (&b"abc"[..], ABC_HASH),
+ (&[0; BLOCKBYTES], ONE_BLOCK_HASH),
+ (&[0; 1000], THOUSAND_HASH),
+ ];
+ // Test each input all at once.
+ for &(input, output) in io {
+ let hash = blake2b(input);
+ assert_eq!(&hash.to_hex(), output, "hash mismatch");
+ }
+    // Now in two chunks. This is especially important for the ONE_BLOCK case: update() must not
+    // compress eagerly when the buffer fills, since that full block might turn out to be the last.
+ for &(input, output) in io {
+ let mut state = State::new();
+ let split = input.len() / 2;
+ state.update(&input[..split]);
+ assert_eq!(split as Count, state.count());
+ state.update(&input[split..]);
+ assert_eq!(input.len() as Count, state.count());
+ let hash = state.finalize();
+ assert_eq!(&hash.to_hex(), output, "hash mismatch");
+ }
+ // Now one byte at a time.
+ for &(input, output) in io {
+ let mut state = State::new();
+ let mut count = 0;
+ for &b in input {
+ state.update(&[b]);
+ count += 1;
+ assert_eq!(count, state.count());
+ }
+ let hash = state.finalize();
+ assert_eq!(&hash.to_hex(), output, "hash mismatch");
+ }
+}
+
+#[test]
+fn test_multiple_finalizes() {
+ let mut state = State::new();
+ assert_eq!(&state.finalize().to_hex(), EMPTY_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), EMPTY_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), EMPTY_HASH, "hash mismatch");
+ state.update(b"abc");
+ assert_eq!(&state.finalize().to_hex(), ABC_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), ABC_HASH, "hash mismatch");
+ assert_eq!(&state.finalize().to_hex(), ABC_HASH, "hash mismatch");
+}
+
+#[cfg(feature = "std")]
+#[test]
+fn test_write() {
+ use std::io::prelude::*;
+
+ let mut state = State::new();
+ state.write_all(&[0; 1000]).unwrap();
+ let hash = state.finalize();
+ assert_eq!(&hash.to_hex(), THOUSAND_HASH, "hash mismatch");
+}
+
+// You can check this case against the equivalent Python:
+//
+// import hashlib
+// hashlib.blake2b(
+// b'foo',
+// digest_size=18,
+// key=b"bar",
+// salt=b"bazbazbazbazbazb",
+// person=b"bing bing bing b",
+// fanout=2,
+// depth=3,
+// leaf_size=0x04050607,
+// node_offset=0x08090a0b0c0d0e0f,
+// node_depth=16,
+// inner_size=17,
+// last_node=True,
+// ).hexdigest()
+#[test]
+fn test_all_parameters() {
+ let mut params = Params::new();
+ params
+ .hash_length(18)
+ // Make sure a shorter key properly overwrites a longer one.
+ .key(b"not the real key")
+ .key(b"bar")
+ .salt(b"bazbazbazbazbazb")
+ .personal(b"bing bing bing b")
+ .fanout(2)
+ .max_depth(3)
+ .max_leaf_length(0x04050607)
+ .node_offset(0x08090a0b0c0d0e0f)
+ .node_depth(16)
+ .inner_hash_length(17)
+ .last_node(true);
+
+ // Check the State API.
+ assert_eq!(
+ "ec0f59cb65f92e7fcca1280ba859a6925ded",
+ &params.to_state().update(b"foo").finalize().to_hex()
+ );
+
+ // Check the all-at-once API.
+ assert_eq!(
+ "ec0f59cb65f92e7fcca1280ba859a6925ded",
+ &params.hash(b"foo").to_hex()
+ );
+}
+
+#[test]
+fn test_all_parameters_blake2bp() {
+ let mut params = blake2bp::Params::new();
+ params
+ .hash_length(18)
+ // Make sure a shorter key properly overwrites a longer one.
+ .key(b"not the real key")
+ .key(b"bar");
+
+ // Check the State API.
+ assert_eq!(
+ "8c54e888a8a01c63da6585c058fe54ea81df",
+ &params.to_state().update(b"foo").finalize().to_hex()
+ );
+
+ // Check the all-at-once API.
+ assert_eq!(
+ "8c54e888a8a01c63da6585c058fe54ea81df",
+ &params.hash(b"foo").to_hex()
+ );
+}
+
+#[test]
+#[should_panic]
+fn test_short_hash_length_panics() {
+ Params::new().hash_length(0);
+}
+
+#[test]
+#[should_panic]
+fn test_long_hash_length_panics() {
+ Params::new().hash_length(OUTBYTES + 1);
+}
+
+#[test]
+#[should_panic]
+fn test_long_key_panics() {
+ Params::new().key(&[0; KEYBYTES + 1]);
+}
+
+#[test]
+#[should_panic]
+fn test_long_salt_panics() {
+ Params::new().salt(&[0; SALTBYTES + 1]);
+}
+
+#[test]
+#[should_panic]
+fn test_long_personal_panics() {
+ Params::new().personal(&[0; PERSONALBYTES + 1]);
+}
+
+#[test]
+fn test_zero_max_depth_supported() {
+ Params::new().max_depth(0);
+}
+
+#[test]
+#[should_panic]
+fn test_long_inner_hash_length_panics() {
+ Params::new().inner_hash_length(OUTBYTES + 1);
+}
+
+#[test]
+#[should_panic]
+fn test_blake2bp_short_hash_length_panics() {
+ blake2bp::Params::new().hash_length(0);
+}
+
+#[test]
+#[should_panic]
+fn test_blake2bp_long_hash_length_panics() {
+ blake2bp::Params::new().hash_length(OUTBYTES + 1);
+}
+
+#[test]
+#[should_panic]
+fn test_blake2bp_long_key_panics() {
+ blake2bp::Params::new().key(&[0; KEYBYTES + 1]);
+}