Diffstat (limited to 'rust/vendor/sha2/src/sha256')
-rw-r--r-- | rust/vendor/sha2/src/sha256/aarch64.rs         | 159
-rw-r--r-- | rust/vendor/sha2/src/sha256/loongarch64_asm.rs | 227
-rw-r--r-- | rust/vendor/sha2/src/sha256/soft.rs            | 218
-rw-r--r-- | rust/vendor/sha2/src/sha256/x86.rs             | 112
4 files changed, 716 insertions, 0 deletions
diff --git a/rust/vendor/sha2/src/sha256/aarch64.rs b/rust/vendor/sha2/src/sha256/aarch64.rs
new file mode 100644
index 0000000..9d220a3
--- /dev/null
+++ b/rust/vendor/sha2/src/sha256/aarch64.rs
@@ -0,0 +1,159 @@
+//! SHA-256 `aarch64` backend.
+
+// Implementation adapted from mbedtls.
+
+// TODO: stdarch intrinsics: RustCrypto/hashes#257
+
+use core::arch::{aarch64::*, asm};
+
+use crate::consts::K32;
+
+cpufeatures::new!(sha2_hwcap, "sha2");
+
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+    // after stabilization
+    if sha2_hwcap::get() {
+        unsafe { sha256_compress(state, blocks) }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
+
+#[target_feature(enable = "sha2")]
+unsafe fn sha256_compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    // SAFETY: Requires the sha2 feature.
+
+    // Load state into vectors.
+    let mut abcd = vld1q_u32(state[0..4].as_ptr());
+    let mut efgh = vld1q_u32(state[4..8].as_ptr());
+
+    // Iterate through the message blocks.
+    for block in blocks {
+        // Keep original state values.
+        let abcd_orig = abcd;
+        let efgh_orig = efgh;
+
+        // Load the message block into vectors, assuming little endianness.
+        let mut s0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[0..16].as_ptr())));
+        let mut s1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[16..32].as_ptr())));
+        let mut s2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[32..48].as_ptr())));
+        let mut s3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[48..64].as_ptr())));
+
+        // Rounds 0 to 3
+        let mut tmp = vaddq_u32(s0, vld1q_u32(&K32[0]));
+        let mut abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 4 to 7
+        tmp = vaddq_u32(s1, vld1q_u32(&K32[4]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 8 to 11
+        tmp = vaddq_u32(s2, vld1q_u32(&K32[8]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 12 to 15
+        tmp = vaddq_u32(s3, vld1q_u32(&K32[12]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        for t in (16..64).step_by(16) {
+            // Rounds t to t + 3
+            s0 = vsha256su1q_u32(vsha256su0q_u32(s0, s1), s2, s3);
+            tmp = vaddq_u32(s0, vld1q_u32(&K32[t]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 4 to t + 7
+            s1 = vsha256su1q_u32(vsha256su0q_u32(s1, s2), s3, s0);
+            tmp = vaddq_u32(s1, vld1q_u32(&K32[t + 4]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 8 to t + 11
+            s2 = vsha256su1q_u32(vsha256su0q_u32(s2, s3), s0, s1);
+            tmp = vaddq_u32(s2, vld1q_u32(&K32[t + 8]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 12 to t + 15
+            s3 = vsha256su1q_u32(vsha256su0q_u32(s3, s0), s1, s2);
+            tmp = vaddq_u32(s3, vld1q_u32(&K32[t + 12]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+        }
+
+        // Add the block-specific state to the original state.
+        abcd = vaddq_u32(abcd, abcd_orig);
+        efgh = vaddq_u32(efgh, efgh_orig);
+    }
+
+    // Store vectors into state.
+    vst1q_u32(state[0..4].as_mut_ptr(), abcd);
+    vst1q_u32(state[4..8].as_mut_ptr(), efgh);
+}
+
+// TODO remove these polyfills once SHA2 intrinsics land
+
+#[inline(always)]
+unsafe fn vsha256hq_u32(
+    mut hash_efgh: uint32x4_t,
+    hash_abcd: uint32x4_t,
+    wk: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256H {:q}, {:q}, {:v}.4S",
+        inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    hash_efgh
+}
+
+#[inline(always)]
+unsafe fn vsha256h2q_u32(
+    mut hash_efgh: uint32x4_t,
+    hash_abcd: uint32x4_t,
+    wk: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256H2 {:q}, {:q}, {:v}.4S",
+        inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    hash_efgh
+}
+
+#[inline(always)]
+unsafe fn vsha256su0q_u32(mut w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t {
+    asm!(
+        "SHA256SU0 {:v}.4S, {:v}.4S",
+        inout(vreg) w0_3, in(vreg) w4_7,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    w0_3
+}
+
+#[inline(always)]
+unsafe fn vsha256su1q_u32(
+    mut tw0_3: uint32x4_t,
+    w8_11: uint32x4_t,
+    w12_15: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256SU1 {:v}.4S, {:v}.4S, {:v}.4S",
+        inout(vreg) tw0_3, in(vreg) w8_11, in(vreg) w12_15,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    tw0_3
+}
diff --git a/rust/vendor/sha2/src/sha256/loongarch64_asm.rs b/rust/vendor/sha2/src/sha256/loongarch64_asm.rs
new file mode 100644
index 0000000..c80fce8
--- /dev/null
+++ b/rust/vendor/sha2/src/sha256/loongarch64_asm.rs
@@ -0,0 +1,227 @@
+//! LoongArch64 assembly backend
+
+macro_rules! c {
+    ($($l:expr)*) => {
+        concat!($($l ,)*)
+    };
+}
+
+macro_rules! rounda {
+    ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+        c!(
+            "ld.w $a5, $a1, (" $i " * 4);"
+            "revb.2h $a5, $a5;"
+            "rotri.w $a5, $a5, 16;"
+            roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
+        )
+    };
+}
+
+macro_rules! roundb {
+    ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+        c!(
+            "ld.w $a4, $sp, (((" $i " - 15) & 0xF) * 4);"
+            "ld.w $a5, $sp, (((" $i " - 16) & 0xF) * 4);"
+            "ld.w $a6, $sp, (((" $i " - 7) & 0xF) * 4);"
+            "add.w $a5, $a5, $a6;"
+            "rotri.w $a6, $a4, 18;"
+            "srli.w $a7, $a4, 3;"
+            "rotri.w $a4, $a4, 7;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "add.w $a5, $a5, $a4;"
+            "ld.w $a4, $sp, (((" $i " - 2) & 0xF) * 4);"
+            "rotri.w $a6, $a4, 19;"
+            "srli.w $a7, $a4, 10;"
+            "rotri.w $a4, $a4, 17;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "add.w $a5, $a5, $a4;"
+            roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
+        )
+    };
+}
+
+macro_rules! roundtail {
+    ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+        c!(
+            // Part 0
+            "rotri.w $a6, " $e ", 11;"
+            "rotri.w $a7, " $e ", 25;"
+            "rotri.w $a4, " $e ", 6;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "xor $a6, " $g ", " $f ";"
+            "ld.w $a7, $a3, " $i " * 4;"
+            "and $a6, $a6, " $e ";"
+            "xor $a6, $a6, " $g ";"
+            "add.w $a4, $a4, $a6;"
+            "add.w $a4, $a4, $a7;"
+            "add.w " $h ", " $h ", $a5;"
+            "add.w " $h ", " $h ", $a4;"
+            // Part 1
+            "add.w " $d ", " $d ", " $h ";"
+            // Part 2
+            "rotri.w $a6, " $a ", 13;"
+            "rotri.w $a7, " $a ", 22;"
+            "rotri.w $a4, " $a ", 2;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "add.w " $h ", " $h ", $a4;"
+            "or $a4, " $c ", " $b ";"
+            "and $a6, " $c ", " $b ";"
+            "and $a4, $a4, " $a ";"
+            "or $a4, $a4, $a6;"
+            "add.w " $h ", " $h ", $a4;"
+            "st.w $a5, $sp, ((" $i " & 0xF) * 4);"
+        )
+    };
+}
+
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    if blocks.is_empty() {
+        return;
+    }
+
+    unsafe {
+        core::arch::asm!(
+            // Allocate scratch stack space
+            "addi.d $sp, $sp, -64;",
+
+            // Load state
+            "ld.w $t0, $a0, 0",
+            "ld.w $t1, $a0, 4",
+            "ld.w $t2, $a0, 8",
+            "ld.w $t3, $a0, 12",
+            "ld.w $t4, $a0, 16",
+            "ld.w $t5, $a0, 20",
+            "ld.w $t6, $a0, 24",
+            "ld.w $t7, $a0, 28",
+
+            "42:",
+
+            // Do 64 rounds of hashing
+            rounda!( 0, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            rounda!( 1, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            rounda!( 2, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            rounda!( 3, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            rounda!( 4, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            rounda!( 5, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            rounda!( 6, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            rounda!( 7, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            rounda!( 8, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            rounda!( 9, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            rounda!(10, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            rounda!(11, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            rounda!(12, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            rounda!(13, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            rounda!(14, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            rounda!(15, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(16, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(17, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(18, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(19, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(20, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(21, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(22, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(23, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(24, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(25, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(26, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(27, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(28, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(29, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(30, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(31, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(32, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(33, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(34, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(35, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(36, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(37, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(38, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(39, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(40, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(41, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(42, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(43, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(44, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(45, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(46, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(47, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(48, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(49, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(50, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(51, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(52, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(53, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(54, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(55, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(56, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(57, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(58, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(59, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(60, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(61, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(62, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(63, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+
+            // Update state registers
+            "ld.w $a4, $a0, 0",  // a
+            "ld.w $a5, $a0, 4",  // b
+            "ld.w $a6, $a0, 8",  // c
+            "ld.w $a7, $a0, 12", // d
+            "add.w $t0, $t0, $a4",
+            "add.w $t1, $t1, $a5",
+            "add.w $t2, $t2, $a6",
+            "add.w $t3, $t3, $a7",
+            "ld.w $a4, $a0, 16", // e
+            "ld.w $a5, $a0, 20", // f
+            "ld.w $a6, $a0, 24", // g
+            "ld.w $a7, $a0, 28", // h
+            "add.w $t4, $t4, $a4",
+            "add.w $t5, $t5, $a5",
+            "add.w $t6, $t6, $a6",
+            "add.w $t7, $t7, $a7",
+
+            // Save updated state
+            "st.w $t0, $a0, 0",
+            "st.w $t1, $a0, 4",
+            "st.w $t2, $a0, 8",
+            "st.w $t3, $a0, 12",
+            "st.w $t4, $a0, 16",
+            "st.w $t5, $a0, 20",
+            "st.w $t6, $a0, 24",
+            "st.w $t7, $a0, 28",
+
+            // Looping over blocks
+            "addi.d $a1, $a1, 64",
+            "addi.d $a2, $a2, -1",
+            "bnez $a2, 42b",
+
+            // Restore stack register
+            "addi.d $sp, $sp, 64",
+
+            in("$a0") state,
+            inout("$a1") blocks.as_ptr() => _,
+            inout("$a2") blocks.len() => _,
+            in("$a3") crate::consts::K32.as_ptr(),
+
+            // Clobbers
+            out("$a4") _,
+            out("$a5") _,
+            out("$a6") _,
+            out("$a7") _,
+            out("$t0") _,
+            out("$t1") _,
+            out("$t2") _,
+            out("$t3") _,
+            out("$t4") _,
out("$t5") _, + out("$t6") _, + out("$t7") _, + + options(preserves_flags), + ); + } +} diff --git a/rust/vendor/sha2/src/sha256/soft.rs b/rust/vendor/sha2/src/sha256/soft.rs new file mode 100644 index 0000000..34826a7 --- /dev/null +++ b/rust/vendor/sha2/src/sha256/soft.rs @@ -0,0 +1,218 @@ +#![allow(clippy::many_single_char_names)] +use crate::consts::BLOCK_LEN; +use core::convert::TryInto; + +#[inline(always)] +fn shl(v: [u32; 4], o: u32) -> [u32; 4] { + [v[0] >> o, v[1] >> o, v[2] >> o, v[3] >> o] +} + +#[inline(always)] +fn shr(v: [u32; 4], o: u32) -> [u32; 4] { + [v[0] << o, v[1] << o, v[2] << o, v[3] << o] +} + +#[inline(always)] +fn or(a: [u32; 4], b: [u32; 4]) -> [u32; 4] { + [a[0] | b[0], a[1] | b[1], a[2] | b[2], a[3] | b[3]] +} + +#[inline(always)] +fn xor(a: [u32; 4], b: [u32; 4]) -> [u32; 4] { + [a[0] ^ b[0], a[1] ^ b[1], a[2] ^ b[2], a[3] ^ b[3]] +} + +#[inline(always)] +fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] { + [ + a[0].wrapping_add(b[0]), + a[1].wrapping_add(b[1]), + a[2].wrapping_add(b[2]), + a[3].wrapping_add(b[3]), + ] +} + +fn sha256load(v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] { + [v3[3], v2[0], v2[1], v2[2]] +} + +fn sha256swap(v0: [u32; 4]) -> [u32; 4] { + [v0[2], v0[3], v0[0], v0[1]] +} + +fn sha256msg1(v0: [u32; 4], v1: [u32; 4]) -> [u32; 4] { + // sigma 0 on vectors + #[inline] + fn sigma0x4(x: [u32; 4]) -> [u32; 4] { + let t1 = or(shl(x, 7), shr(x, 25)); + let t2 = or(shl(x, 18), shr(x, 14)); + let t3 = shl(x, 3); + xor(xor(t1, t2), t3) + } + + add(v0, sigma0x4(sha256load(v0, v1))) +} + +fn sha256msg2(v4: [u32; 4], v3: [u32; 4]) -> [u32; 4] { + macro_rules! sigma1 { + ($a:expr) => { + $a.rotate_right(17) ^ $a.rotate_right(19) ^ ($a >> 10) + }; + } + + let [x3, x2, x1, x0] = v4; + let [w15, w14, _, _] = v3; + + let w16 = x0.wrapping_add(sigma1!(w14)); + let w17 = x1.wrapping_add(sigma1!(w15)); + let w18 = x2.wrapping_add(sigma1!(w16)); + let w19 = x3.wrapping_add(sigma1!(w17)); + + [w19, w18, w17, w16] +} + +fn sha256_digest_round_x2(cdgh: [u32; 4], abef: [u32; 4], wk: [u32; 4]) -> [u32; 4] { + macro_rules! big_sigma0 { + ($a:expr) => { + ($a.rotate_right(2) ^ $a.rotate_right(13) ^ $a.rotate_right(22)) + }; + } + macro_rules! big_sigma1 { + ($a:expr) => { + ($a.rotate_right(6) ^ $a.rotate_right(11) ^ $a.rotate_right(25)) + }; + } + macro_rules! bool3ary_202 { + ($a:expr, $b:expr, $c:expr) => { + $c ^ ($a & ($b ^ $c)) + }; + } // Choose, MD5F, SHA1C + macro_rules! bool3ary_232 { + ($a:expr, $b:expr, $c:expr) => { + ($a & $b) ^ ($a & $c) ^ ($b & $c) + }; + } // Majority, SHA1M + + let [_, _, wk1, wk0] = wk; + let [a0, b0, e0, f0] = abef; + let [c0, d0, g0, h0] = cdgh; + + // a round + let x0 = big_sigma1!(e0) + .wrapping_add(bool3ary_202!(e0, f0, g0)) + .wrapping_add(wk0) + .wrapping_add(h0); + let y0 = big_sigma0!(a0).wrapping_add(bool3ary_232!(a0, b0, c0)); + let (a1, b1, c1, d1, e1, f1, g1, h1) = ( + x0.wrapping_add(y0), + a0, + b0, + c0, + x0.wrapping_add(d0), + e0, + f0, + g0, + ); + + // a round + let x1 = big_sigma1!(e1) + .wrapping_add(bool3ary_202!(e1, f1, g1)) + .wrapping_add(wk1) + .wrapping_add(h1); + let y1 = big_sigma0!(a1).wrapping_add(bool3ary_232!(a1, b1, c1)); + let (a2, b2, _, _, e2, f2, _, _) = ( + x1.wrapping_add(y1), + a1, + b1, + c1, + x1.wrapping_add(d1), + e1, + f1, + g1, + ); + + [a2, b2, e2, f2] +} + +fn schedule(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] { + let t1 = sha256msg1(v0, v1); + let t2 = sha256load(v2, v3); + let t3 = add(t1, t2); + sha256msg2(t3, v3) +} + +macro_rules! 
rounds4 { + ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ + let t1 = add($rest, crate::consts::K32X4[$i]); + $cdgh = sha256_digest_round_x2($cdgh, $abef, t1); + let t2 = sha256swap(t1); + $abef = sha256_digest_round_x2($abef, $cdgh, t2); + }}; +} + +macro_rules! schedule_rounds4 { + ( + $abef:ident, $cdgh:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i: expr + ) => {{ + $w4 = schedule($w0, $w1, $w2, $w3); + rounds4!($abef, $cdgh, $w4, $i); + }}; +} + +/// Process a block with the SHA-256 algorithm. +fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { + let mut abef = [state[0], state[1], state[4], state[5]]; + let mut cdgh = [state[2], state[3], state[6], state[7]]; + + // Rounds 0..64 + let mut w0 = [block[3], block[2], block[1], block[0]]; + let mut w1 = [block[7], block[6], block[5], block[4]]; + let mut w2 = [block[11], block[10], block[9], block[8]]; + let mut w3 = [block[15], block[14], block[13], block[12]]; + let mut w4; + + rounds4!(abef, cdgh, w0, 0); + rounds4!(abef, cdgh, w1, 1); + rounds4!(abef, cdgh, w2, 2); + rounds4!(abef, cdgh, w3, 3); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15); + + let [a, b, e, f] = abef; + let [c, d, g, h] = cdgh; + + state[0] = state[0].wrapping_add(a); + state[1] = state[1].wrapping_add(b); + state[2] = state[2].wrapping_add(c); + state[3] = state[3].wrapping_add(d); + state[4] = state[4].wrapping_add(e); + state[5] = state[5].wrapping_add(f); + state[6] = state[6].wrapping_add(g); + state[7] = state[7].wrapping_add(h); +} + +pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + let mut block_u32 = [0u32; BLOCK_LEN]; + // since LLVM can't properly use aliasing yet it will make + // unnecessary state stores without this copy + let mut state_cpy = *state; + for block in blocks { + for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { + *o = u32::from_be_bytes(chunk.try_into().unwrap()); + } + sha256_digest_block_u32(&mut state_cpy, &block_u32); + } + *state = state_cpy; +} diff --git a/rust/vendor/sha2/src/sha256/x86.rs b/rust/vendor/sha2/src/sha256/x86.rs new file mode 100644 index 0000000..4601938 --- /dev/null +++ b/rust/vendor/sha2/src/sha256/x86.rs @@ -0,0 +1,112 @@ +//! SHA-256 `x86`/`x86_64` backend + +#![allow(clippy::many_single_char_names)] + +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i { + let t1 = _mm_sha256msg1_epu32(v0, v1); + let t2 = _mm_alignr_epi8(v3, v2, 4); + let t3 = _mm_add_epi32(t1, t2); + _mm_sha256msg2_epu32(t3, v3) +} + +macro_rules! 
rounds4 { + ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ + let k = crate::consts::K32X4[$i]; + let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32); + let t1 = _mm_add_epi32($rest, kv); + $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1); + let t2 = _mm_shuffle_epi32(t1, 0x0E); + $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2); + }}; +} + +macro_rules! schedule_rounds4 { + ( + $abef:ident, $cdgh:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i: expr + ) => {{ + $w4 = schedule($w0, $w1, $w2, $w3); + rounds4!($abef, $cdgh, $w4, $i); + }}; +} + +// we use unaligned loads with `__m128i` pointers +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "sha,sse2,ssse3,sse4.1")] +unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + #[allow(non_snake_case)] + let MASK: __m128i = _mm_set_epi64x( + 0x0C0D_0E0F_0809_0A0Bu64 as i64, + 0x0405_0607_0001_0203u64 as i64, + ); + + let state_ptr = state.as_ptr() as *const __m128i; + let dcba = _mm_loadu_si128(state_ptr.add(0)); + let efgh = _mm_loadu_si128(state_ptr.add(1)); + + let cdab = _mm_shuffle_epi32(dcba, 0xB1); + let efgh = _mm_shuffle_epi32(efgh, 0x1B); + let mut abef = _mm_alignr_epi8(cdab, efgh, 8); + let mut cdgh = _mm_blend_epi16(efgh, cdab, 0xF0); + + for block in blocks { + let abef_save = abef; + let cdgh_save = cdgh; + + let data_ptr = block.as_ptr() as *const __m128i; + let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(0)), MASK); + let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(1)), MASK); + let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(2)), MASK); + let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(3)), MASK); + let mut w4; + + rounds4!(abef, cdgh, w0, 0); + rounds4!(abef, cdgh, w1, 1); + rounds4!(abef, cdgh, w2, 2); + rounds4!(abef, cdgh, w3, 3); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15); + + abef = _mm_add_epi32(abef, abef_save); + cdgh = _mm_add_epi32(cdgh, cdgh_save); + } + + let feba = _mm_shuffle_epi32(abef, 0x1B); + let dchg = _mm_shuffle_epi32(cdgh, 0xB1); + let dcba = _mm_blend_epi16(feba, dchg, 0xF0); + let hgef = _mm_alignr_epi8(dchg, feba, 8); + + let state_ptr_mut = state.as_mut_ptr() as *mut __m128i; + _mm_storeu_si128(state_ptr_mut.add(0), dcba); + _mm_storeu_si128(state_ptr_mut.add(1), hgef); +} + +cpufeatures::new!(shani_cpuid, "sha", "sse2", "ssse3", "sse4.1"); + +pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 + // after stabilization + if shani_cpuid::get() { + unsafe { + digest_blocks(state, blocks); + } + } else { + super::soft::compress(state, blocks); + } +} |
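Note: all four backends in this diff expose the same entry point, `compress(state: &mut [u32; 8], blocks: &[[u8; 64]])`, and the aarch64 and x86 backends fall back to `super::soft::compress` when the required CPU features are absent. As a rough illustration only (not part of the diff above), a minimal sketch of driving one of these backends directly, assuming the caller supplies the standard SHA-256 initial hash values and a pre-padded 64-byte block:

    // Hypothetical driver; in the real crate the sha2 Digest wrapper performs
    // padding and backend dispatch, so user code never calls compress() directly.
    fn example() {
        // Standard SHA-256 initial hash values (FIPS 180-4).
        let mut state: [u32; 8] = [
            0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
            0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
        ];
        // One pre-padded block for the empty message: 0x80 marker, zero length.
        let mut block = [0u8; 64];
        block[0] = 0x80;
        compress(&mut state, &[block]);
        // state now holds the empty-message digest (e3b0c442 98fc1c14 ...)
        // as eight big-endian words.
    }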