Diffstat (limited to 'rust/vendor/sha2/src/sha256')
 -rw-r--r--  rust/vendor/sha2/src/sha256/aarch64.rs          | 159
 -rw-r--r--  rust/vendor/sha2/src/sha256/loongarch64_asm.rs  | 227
 -rw-r--r--  rust/vendor/sha2/src/sha256/soft.rs             | 218
 -rw-r--r--  rust/vendor/sha2/src/sha256/x86.rs              | 112
4 files changed, 716 insertions(+), 0 deletions(-)
diff --git a/rust/vendor/sha2/src/sha256/aarch64.rs b/rust/vendor/sha2/src/sha256/aarch64.rs
new file mode 100644
index 0000000..9d220a3
--- /dev/null
+++ b/rust/vendor/sha2/src/sha256/aarch64.rs
@@ -0,0 +1,159 @@
+//! SHA-256 `aarch64` backend.
+
+// Implementation adapted from mbedtls.
+
+// TODO: stdarch intrinsics: RustCrypto/hashes#257
+
+use core::arch::{aarch64::*, asm};
+
+use crate::consts::K32;
+
+cpufeatures::new!(sha2_hwcap, "sha2");
+
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+ // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+ // after stabilization
+ if sha2_hwcap::get() {
+ unsafe { sha256_compress(state, blocks) }
+ } else {
+ super::soft::compress(state, blocks);
+ }
+}
+
+#[target_feature(enable = "sha2")]
+unsafe fn sha256_compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+ // SAFETY: Requires the sha2 feature.
+
+ // Load state into vectors.
+ let mut abcd = vld1q_u32(state[0..4].as_ptr());
+ let mut efgh = vld1q_u32(state[4..8].as_ptr());
+
+ // Iterate through the message blocks.
+ for block in blocks {
+ // Keep original state values.
+ let abcd_orig = abcd;
+ let efgh_orig = efgh;
+
+ // Load the message block into vectors, assuming little endianness.
+ let mut s0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[0..16].as_ptr())));
+ let mut s1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[16..32].as_ptr())));
+ let mut s2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[32..48].as_ptr())));
+ let mut s3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[48..64].as_ptr())));
+
+ // Rounds 0 to 3
+ let mut tmp = vaddq_u32(s0, vld1q_u32(&K32[0]));
+ let mut abcd_prev = abcd;
+ abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+ efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+ // Rounds 4 to 7
+ tmp = vaddq_u32(s1, vld1q_u32(&K32[4]));
+ abcd_prev = abcd;
+ abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+ efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+ // Rounds 8 to 11
+ tmp = vaddq_u32(s2, vld1q_u32(&K32[8]));
+ abcd_prev = abcd;
+ abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+ efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+ // Rounds 12 to 15
+ tmp = vaddq_u32(s3, vld1q_u32(&K32[12]));
+ abcd_prev = abcd;
+ abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+ efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+ for t in (16..64).step_by(16) {
+ // Rounds t to t + 3
+ s0 = vsha256su1q_u32(vsha256su0q_u32(s0, s1), s2, s3);
+ tmp = vaddq_u32(s0, vld1q_u32(&K32[t]));
+ abcd_prev = abcd;
+ abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+ efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+ // Rounds t + 4 to t + 7
+ s1 = vsha256su1q_u32(vsha256su0q_u32(s1, s2), s3, s0);
+ tmp = vaddq_u32(s1, vld1q_u32(&K32[t + 4]));
+ abcd_prev = abcd;
+ abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+ efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+ // Rounds t + 8 to t + 11
+ s2 = vsha256su1q_u32(vsha256su0q_u32(s2, s3), s0, s1);
+ tmp = vaddq_u32(s2, vld1q_u32(&K32[t + 8]));
+ abcd_prev = abcd;
+ abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+ efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+ // Rounds t + 12 to t + 15
+ s3 = vsha256su1q_u32(vsha256su0q_u32(s3, s0), s1, s2);
+ tmp = vaddq_u32(s3, vld1q_u32(&K32[t + 12]));
+ abcd_prev = abcd;
+ abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+ efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+ }
+
+ // Add the block-specific state to the original state.
+ abcd = vaddq_u32(abcd, abcd_orig);
+ efgh = vaddq_u32(efgh, efgh_orig);
+ }
+
+ // Store vectors into state.
+ vst1q_u32(state[0..4].as_mut_ptr(), abcd);
+ vst1q_u32(state[4..8].as_mut_ptr(), efgh);
+}
+
+// TODO remove these polyfills once SHA2 intrinsics land
+
+#[inline(always)]
+unsafe fn vsha256hq_u32(
+ mut hash_efgh: uint32x4_t,
+ hash_abcd: uint32x4_t,
+ wk: uint32x4_t,
+) -> uint32x4_t {
+ asm!(
+ "SHA256H {:q}, {:q}, {:v}.4S",
+ inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ hash_efgh
+}
+
+#[inline(always)]
+unsafe fn vsha256h2q_u32(
+ mut hash_efgh: uint32x4_t,
+ hash_abcd: uint32x4_t,
+ wk: uint32x4_t,
+) -> uint32x4_t {
+ asm!(
+ "SHA256H2 {:q}, {:q}, {:v}.4S",
+ inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ hash_efgh
+}
+
+#[inline(always)]
+unsafe fn vsha256su0q_u32(mut w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t {
+ asm!(
+ "SHA256SU0 {:v}.4S, {:v}.4S",
+ inout(vreg) w0_3, in(vreg) w4_7,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ w0_3
+}
+
+#[inline(always)]
+unsafe fn vsha256su1q_u32(
+ mut tw0_3: uint32x4_t,
+ w8_11: uint32x4_t,
+ w12_15: uint32x4_t,
+) -> uint32x4_t {
+ asm!(
+ "SHA256SU1 {:v}.4S, {:v}.4S, {:v}.4S",
+ inout(vreg) tw0_3, in(vreg) w8_11, in(vreg) w12_15,
+ options(pure, nomem, nostack, preserves_flags)
+ );
+ tw0_3
+}
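
The backend above dispatches at runtime: cpufeatures::new! generates sha2_hwcap::get(), and compress falls back to the portable soft::compress whenever the SHA2 extension is missing. A minimal consistency check, sketched here on the assumption that it would live as a test submodule of aarch64.rs (it is not part of the vendored file, and the names are ours), drives both paths over the same blocks:

    #[cfg(all(test, target_arch = "aarch64"))]
    mod dispatch_consistency {
        // Hypothetical test module, not part of the vendored crate.
        const IV: [u32; 8] = [
            0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
            0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
        ];

        #[test]
        fn hw_matches_soft() {
            let blocks = [[0u8; 64]; 2];
            let mut hw = IV;
            let mut soft = IV;
            super::compress(&mut hw, &blocks); // runtime-dispatched path above
            super::super::soft::compress(&mut soft, &blocks); // portable reference
            assert_eq!(hw, soft);
        }
    }
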
diff --git a/rust/vendor/sha2/src/sha256/loongarch64_asm.rs b/rust/vendor/sha2/src/sha256/loongarch64_asm.rs
new file mode 100644
index 0000000..c80fce8
--- /dev/null
+++ b/rust/vendor/sha2/src/sha256/loongarch64_asm.rs
@@ -0,0 +1,227 @@
+//! LoongArch64 assembly backend
+
+macro_rules! c {
+ ($($l:expr)*) => {
+ concat!($($l ,)*)
+ };
+}
+
+macro_rules! rounda {
+ ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+ c!(
+ "ld.w $a5, $a1, (" $i " * 4);"
+ "revb.2h $a5, $a5;"
+ "rotri.w $a5, $a5, 16;"
+ roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
+ )
+ };
+}
+
+macro_rules! roundb {
+ ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+ c!(
+ "ld.w $a4, $sp, (((" $i " - 15) & 0xF) * 4);"
+ "ld.w $a5, $sp, (((" $i " - 16) & 0xF) * 4);"
+ "ld.w $a6, $sp, (((" $i " - 7) & 0xF) * 4);"
+ "add.w $a5, $a5, $a6;"
+ "rotri.w $a6, $a4, 18;"
+ "srli.w $a7, $a4, 3;"
+ "rotri.w $a4, $a4, 7;"
+ "xor $a6, $a6, $a7;"
+ "xor $a4, $a4, $a6;"
+ "add.w $a5, $a5, $a4;"
+ "ld.w $a4, $sp, (((" $i " - 2) & 0xF) * 4);"
+ "rotri.w $a6, $a4, 19;"
+ "srli.w $a7, $a4, 10;"
+ "rotri.w $a4, $a4, 17;"
+ "xor $a6, $a6, $a7;"
+ "xor $a4, $a4, $a6;"
+ "add.w $a5, $a5, $a4;"
+ roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
+ )
+ };
+}
+
+macro_rules! roundtail {
+ ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+ c!(
+ // Part 0
+ "rotri.w $a6, " $e ", 11;"
+ "rotri.w $a7, " $e ", 25;"
+ "rotri.w $a4, " $e ", 6;"
+ "xor $a6, $a6, $a7;"
+ "xor $a4, $a4, $a6;"
+ "xor $a6, " $g ", " $f ";"
+ "ld.w $a7, $a3, " $i " * 4;"
+ "and $a6, $a6, " $e ";"
+ "xor $a6, $a6, " $g ";"
+ "add.w $a4, $a4, $a6;"
+ "add.w $a4, $a4, $a7;"
+ "add.w " $h ", " $h ", $a5;"
+ "add.w " $h ", " $h ", $a4;"
+ // Part 1
+ "add.w " $d ", " $d ", " $h ";"
+ // Part 2
+ "rotri.w $a6, " $a ", 13;"
+ "rotri.w $a7, " $a ", 22;"
+ "rotri.w $a4, " $a ", 2;"
+ "xor $a6, $a6, $a7;"
+ "xor $a4, $a4, $a6;"
+ "add.w " $h ", " $h ", $a4;"
+ "or $a4, " $c ", " $b ";"
+ "and $a6, " $c ", " $b ";"
+ "and $a4, $a4, " $a ";"
+ "or $a4, $a4, $a6;"
+ "add.w " $h ", " $h ", $a4;"
+ "st.w $a5, $sp, ((" $i " & 0xF) * 4);"
+ )
+ };
+}
+
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+ if blocks.is_empty() {
+ return;
+ }
+
+ unsafe {
+ core::arch::asm!(
+ // Allocate scratch stack space
+ "addi.d $sp, $sp, -64;",
+
+ // Load state
+ "ld.w $t0, $a0, 0",
+ "ld.w $t1, $a0, 4",
+ "ld.w $t2, $a0, 8",
+ "ld.w $t3, $a0, 12",
+ "ld.w $t4, $a0, 16",
+ "ld.w $t5, $a0, 20",
+ "ld.w $t6, $a0, 24",
+ "ld.w $t7, $a0, 28",
+
+ "42:",
+
+ // Do 64 rounds of hashing
+ rounda!( 0, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+ rounda!( 1, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
+ rounda!( 2, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
+ rounda!( 3, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
+ rounda!( 4, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
+ rounda!( 5, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
+ rounda!( 6, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
+ rounda!( 7, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+ rounda!( 8, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+ rounda!( 9, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
+ rounda!(10, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
+ rounda!(11, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
+ rounda!(12, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
+ rounda!(13, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
+ rounda!(14, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
+ rounda!(15, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+ roundb!(16, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+ roundb!(17, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
+ roundb!(18, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
+ roundb!(19, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
+ roundb!(20, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
+ roundb!(21, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
+ roundb!(22, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
+ roundb!(23, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+ roundb!(24, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+ roundb!(25, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
+ roundb!(26, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
+ roundb!(27, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
+ roundb!(28, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
+ roundb!(29, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
+ roundb!(30, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
+ roundb!(31, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+ roundb!(32, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+ roundb!(33, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
+ roundb!(34, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
+ roundb!(35, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
+ roundb!(36, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
+ roundb!(37, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
+ roundb!(38, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
+ roundb!(39, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+ roundb!(40, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+ roundb!(41, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
+ roundb!(42, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
+ roundb!(43, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
+ roundb!(44, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
+ roundb!(45, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
+ roundb!(46, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
+ roundb!(47, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+ roundb!(48, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+ roundb!(49, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
+ roundb!(50, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
+ roundb!(51, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
+ roundb!(52, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
+ roundb!(53, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
+ roundb!(54, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
+ roundb!(55, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+ roundb!(56, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+ roundb!(57, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
+ roundb!(58, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
+ roundb!(59, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
+ roundb!(60, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
+ roundb!(61, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
+ roundb!(62, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
+ roundb!(63, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+
+ // Update state registers
+ "ld.w $a4, $a0, 0", // a
+ "ld.w $a5, $a0, 4", // b
+ "ld.w $a6, $a0, 8", // c
+ "ld.w $a7, $a0, 12", // d
+ "add.w $t0, $t0, $a4",
+ "add.w $t1, $t1, $a5",
+ "add.w $t2, $t2, $a6",
+ "add.w $t3, $t3, $a7",
+ "ld.w $a4, $a0, 16", // e
+ "ld.w $a5, $a0, 20", // f
+ "ld.w $a6, $a0, 24", // g
+ "ld.w $a7, $a0, 28", // h
+ "add.w $t4, $t4, $a4",
+ "add.w $t5, $t5, $a5",
+ "add.w $t6, $t6, $a6",
+ "add.w $t7, $t7, $a7",
+
+ // Save updated state
+ "st.w $t0, $a0, 0",
+ "st.w $t1, $a0, 4",
+ "st.w $t2, $a0, 8",
+ "st.w $t3, $a0, 12",
+ "st.w $t4, $a0, 16",
+ "st.w $t5, $a0, 20",
+ "st.w $t6, $a0, 24",
+ "st.w $t7, $a0, 28",
+
+ // Looping over blocks
+ "addi.d $a1, $a1, 64",
+ "addi.d $a2, $a2, -1",
+ "bnez $a2, 42b",
+
+ // Restore stack register
+ "addi.d $sp, $sp, 64",
+
+ in("$a0") state,
+ inout("$a1") blocks.as_ptr() => _,
+ inout("$a2") blocks.len() => _,
+ in("$a3") crate::consts::K32.as_ptr(),
+
+ // Clobbers
+ out("$a4") _,
+ out("$a5") _,
+ out("$a6") _,
+ out("$a7") _,
+ out("$t0") _,
+ out("$t1") _,
+ out("$t2") _,
+ out("$t3") _,
+ out("$t4") _,
+ out("$t5") _,
+ out("$t6") _,
+ out("$t7") _,
+
+ options(preserves_flags),
+ );
+ }
+}
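
The roundtail! macro is the LoongArch encoding of one FIPS 180-4 compression round: Part 0 builds T1 from big sigma 1, Ch, the round constant and the schedule word, Part 1 folds T1 into d, and Part 2 builds T2 from big sigma 0 and Maj. As a readability aid only, here is the same round in plain Rust (the assembly keeps the working variables in $t0..$t7 and rotates register roles between rounds instead of shifting an array; the helper name is ours):

    // One SHA-256 round in plain Rust; mirrors what a roundtail! expansion computes.
    fn round(state: &mut [u32; 8], wt: u32, kt: u32) {
        let [a, b, c, d, e, f, g, h] = *state;
        let s1 = e.rotate_right(6) ^ e.rotate_right(11) ^ e.rotate_right(25);
        let ch = g ^ (e & (f ^ g)); // same xor/and/xor form as the asm
        let t1 = h
            .wrapping_add(s1)
            .wrapping_add(ch)
            .wrapping_add(kt)
            .wrapping_add(wt);
        let s0 = a.rotate_right(2) ^ a.rotate_right(13) ^ a.rotate_right(22);
        let maj = (a & (b | c)) | (b & c); // same or/and form as the asm
        let t2 = s0.wrapping_add(maj);
        *state = [t1.wrapping_add(t2), a, b, c, d.wrapping_add(t1), e, f, g];
    }
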
diff --git a/rust/vendor/sha2/src/sha256/soft.rs b/rust/vendor/sha2/src/sha256/soft.rs
new file mode 100644
index 0000000..34826a7
--- /dev/null
+++ b/rust/vendor/sha2/src/sha256/soft.rs
@@ -0,0 +1,218 @@
+#![allow(clippy::many_single_char_names)]
+use crate::consts::BLOCK_LEN;
+use core::convert::TryInto;
+
+#[inline(always)]
+fn shl(v: [u32; 4], o: u32) -> [u32; 4] {
+ [v[0] >> o, v[1] >> o, v[2] >> o, v[3] >> o]
+}
+
+#[inline(always)]
+fn shr(v: [u32; 4], o: u32) -> [u32; 4] {
+ [v[0] << o, v[1] << o, v[2] << o, v[3] << o]
+}
+
+#[inline(always)]
+fn or(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
+ [a[0] | b[0], a[1] | b[1], a[2] | b[2], a[3] | b[3]]
+}
+
+#[inline(always)]
+fn xor(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
+ [a[0] ^ b[0], a[1] ^ b[1], a[2] ^ b[2], a[3] ^ b[3]]
+}
+
+#[inline(always)]
+fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
+ [
+ a[0].wrapping_add(b[0]),
+ a[1].wrapping_add(b[1]),
+ a[2].wrapping_add(b[2]),
+ a[3].wrapping_add(b[3]),
+ ]
+}
+
+fn sha256load(v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
+ [v3[3], v2[0], v2[1], v2[2]]
+}
+
+fn sha256swap(v0: [u32; 4]) -> [u32; 4] {
+ [v0[2], v0[3], v0[0], v0[1]]
+}
+
+fn sha256msg1(v0: [u32; 4], v1: [u32; 4]) -> [u32; 4] {
+ // sigma 0 on vectors
+ #[inline]
+ fn sigma0x4(x: [u32; 4]) -> [u32; 4] {
+ let t1 = or(shl(x, 7), shr(x, 25));
+ let t2 = or(shl(x, 18), shr(x, 14));
+ let t3 = shl(x, 3);
+ xor(xor(t1, t2), t3)
+ }
+
+ add(v0, sigma0x4(sha256load(v0, v1)))
+}
+
+fn sha256msg2(v4: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
+ macro_rules! sigma1 {
+ ($a:expr) => {
+ $a.rotate_right(17) ^ $a.rotate_right(19) ^ ($a >> 10)
+ };
+ }
+
+ let [x3, x2, x1, x0] = v4;
+ let [w15, w14, _, _] = v3;
+
+ let w16 = x0.wrapping_add(sigma1!(w14));
+ let w17 = x1.wrapping_add(sigma1!(w15));
+ let w18 = x2.wrapping_add(sigma1!(w16));
+ let w19 = x3.wrapping_add(sigma1!(w17));
+
+ [w19, w18, w17, w16]
+}
+
+fn sha256_digest_round_x2(cdgh: [u32; 4], abef: [u32; 4], wk: [u32; 4]) -> [u32; 4] {
+ macro_rules! big_sigma0 {
+ ($a:expr) => {
+ ($a.rotate_right(2) ^ $a.rotate_right(13) ^ $a.rotate_right(22))
+ };
+ }
+ macro_rules! big_sigma1 {
+ ($a:expr) => {
+ ($a.rotate_right(6) ^ $a.rotate_right(11) ^ $a.rotate_right(25))
+ };
+ }
+ macro_rules! bool3ary_202 {
+ ($a:expr, $b:expr, $c:expr) => {
+ $c ^ ($a & ($b ^ $c))
+ };
+ } // Choose, MD5F, SHA1C
+ macro_rules! bool3ary_232 {
+ ($a:expr, $b:expr, $c:expr) => {
+ ($a & $b) ^ ($a & $c) ^ ($b & $c)
+ };
+ } // Majority, SHA1M
+
+ let [_, _, wk1, wk0] = wk;
+ let [a0, b0, e0, f0] = abef;
+ let [c0, d0, g0, h0] = cdgh;
+
+ // a round
+ let x0 = big_sigma1!(e0)
+ .wrapping_add(bool3ary_202!(e0, f0, g0))
+ .wrapping_add(wk0)
+ .wrapping_add(h0);
+ let y0 = big_sigma0!(a0).wrapping_add(bool3ary_232!(a0, b0, c0));
+ let (a1, b1, c1, d1, e1, f1, g1, h1) = (
+ x0.wrapping_add(y0),
+ a0,
+ b0,
+ c0,
+ x0.wrapping_add(d0),
+ e0,
+ f0,
+ g0,
+ );
+
+ // a round
+ let x1 = big_sigma1!(e1)
+ .wrapping_add(bool3ary_202!(e1, f1, g1))
+ .wrapping_add(wk1)
+ .wrapping_add(h1);
+ let y1 = big_sigma0!(a1).wrapping_add(bool3ary_232!(a1, b1, c1));
+ let (a2, b2, _, _, e2, f2, _, _) = (
+ x1.wrapping_add(y1),
+ a1,
+ b1,
+ c1,
+ x1.wrapping_add(d1),
+ e1,
+ f1,
+ g1,
+ );
+
+ [a2, b2, e2, f2]
+}
+
+fn schedule(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
+ let t1 = sha256msg1(v0, v1);
+ let t2 = sha256load(v2, v3);
+ let t3 = add(t1, t2);
+ sha256msg2(t3, v3)
+}
+
+macro_rules! rounds4 {
+ ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
+ let t1 = add($rest, crate::consts::K32X4[$i]);
+ $cdgh = sha256_digest_round_x2($cdgh, $abef, t1);
+ let t2 = sha256swap(t1);
+ $abef = sha256_digest_round_x2($abef, $cdgh, t2);
+ }};
+}
+
+macro_rules! schedule_rounds4 {
+ (
+ $abef:ident, $cdgh:ident,
+ $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr,
+ $i: expr
+ ) => {{
+ $w4 = schedule($w0, $w1, $w2, $w3);
+ rounds4!($abef, $cdgh, $w4, $i);
+ }};
+}
+
+/// Process a block with the SHA-256 algorithm.
+fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) {
+ let mut abef = [state[0], state[1], state[4], state[5]];
+ let mut cdgh = [state[2], state[3], state[6], state[7]];
+
+ // Rounds 0..64
+ let mut w0 = [block[3], block[2], block[1], block[0]];
+ let mut w1 = [block[7], block[6], block[5], block[4]];
+ let mut w2 = [block[11], block[10], block[9], block[8]];
+ let mut w3 = [block[15], block[14], block[13], block[12]];
+ let mut w4;
+
+ rounds4!(abef, cdgh, w0, 0);
+ rounds4!(abef, cdgh, w1, 1);
+ rounds4!(abef, cdgh, w2, 2);
+ rounds4!(abef, cdgh, w3, 3);
+ schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4);
+ schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5);
+ schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6);
+ schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7);
+ schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8);
+ schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9);
+ schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10);
+ schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11);
+ schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12);
+ schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13);
+ schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14);
+ schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15);
+
+ let [a, b, e, f] = abef;
+ let [c, d, g, h] = cdgh;
+
+ state[0] = state[0].wrapping_add(a);
+ state[1] = state[1].wrapping_add(b);
+ state[2] = state[2].wrapping_add(c);
+ state[3] = state[3].wrapping_add(d);
+ state[4] = state[4].wrapping_add(e);
+ state[5] = state[5].wrapping_add(f);
+ state[6] = state[6].wrapping_add(g);
+ state[7] = state[7].wrapping_add(h);
+}
+
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+ let mut block_u32 = [0u32; BLOCK_LEN];
+ // since LLVM can't properly use aliasing yet it will make
+ // unnecessary state stores without this copy
+ let mut state_cpy = *state;
+ for block in blocks {
+ for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
+ *o = u32::from_be_bytes(chunk.try_into().unwrap());
+ }
+ sha256_digest_block_u32(&mut state_cpy, &block_u32);
+ }
+ *state = state_cpy;
+}
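
soft::compress consumes already padded 64-byte blocks; message padding and the length field are handled by the buffering layer above these backends. A quick sanity check against the standard FIPS 180-4 "abc" vector is sketched below; the pre-padded block and the expected state words are well-known test values rather than anything defined in this diff, and the function name is ours:

    // Sketch: feed soft::compress one pre-padded block for the message "abc".
    fn abc_vector() {
        let mut state: [u32; 8] = [
            0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
            0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
        ];
        let mut block = [0u8; 64];
        block[..4].copy_from_slice(b"abc\x80"); // message bytes plus the 0x80 pad marker
        block[63] = 24; // bit length of "abc" in the trailing 64-bit length field
        soft::compress(&mut state, &[block]);
        assert_eq!(state, [
            0xba7816bf, 0x8f01cfa4, 0x14140de5, 0xdae2223b,
            0xb00361a3, 0x96177a9c, 0xb410ff61, 0xf20015ad,
        ]);
    }
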
diff --git a/rust/vendor/sha2/src/sha256/x86.rs b/rust/vendor/sha2/src/sha256/x86.rs
new file mode 100644
index 0000000..4601938
--- /dev/null
+++ b/rust/vendor/sha2/src/sha256/x86.rs
@@ -0,0 +1,112 @@
+//! SHA-256 `x86`/`x86_64` backend
+
+#![allow(clippy::many_single_char_names)]
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
+ let t1 = _mm_sha256msg1_epu32(v0, v1);
+ let t2 = _mm_alignr_epi8(v3, v2, 4);
+ let t3 = _mm_add_epi32(t1, t2);
+ _mm_sha256msg2_epu32(t3, v3)
+}
+
+macro_rules! rounds4 {
+ ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
+ let k = crate::consts::K32X4[$i];
+ let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32);
+ let t1 = _mm_add_epi32($rest, kv);
+ $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1);
+ let t2 = _mm_shuffle_epi32(t1, 0x0E);
+ $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2);
+ }};
+}
+
+macro_rules! schedule_rounds4 {
+ (
+ $abef:ident, $cdgh:ident,
+ $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr,
+ $i: expr
+ ) => {{
+ $w4 = schedule($w0, $w1, $w2, $w3);
+ rounds4!($abef, $cdgh, $w4, $i);
+ }};
+}
+
+// we use unaligned loads with `__m128i` pointers
+#[allow(clippy::cast_ptr_alignment)]
+#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
+unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+ #[allow(non_snake_case)]
+ let MASK: __m128i = _mm_set_epi64x(
+ 0x0C0D_0E0F_0809_0A0Bu64 as i64,
+ 0x0405_0607_0001_0203u64 as i64,
+ );
+
+ let state_ptr = state.as_ptr() as *const __m128i;
+ let dcba = _mm_loadu_si128(state_ptr.add(0));
+ let efgh = _mm_loadu_si128(state_ptr.add(1));
+
+ let cdab = _mm_shuffle_epi32(dcba, 0xB1);
+ let efgh = _mm_shuffle_epi32(efgh, 0x1B);
+ let mut abef = _mm_alignr_epi8(cdab, efgh, 8);
+ let mut cdgh = _mm_blend_epi16(efgh, cdab, 0xF0);
+
+ for block in blocks {
+ let abef_save = abef;
+ let cdgh_save = cdgh;
+
+ let data_ptr = block.as_ptr() as *const __m128i;
+ let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(0)), MASK);
+ let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(1)), MASK);
+ let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(2)), MASK);
+ let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(3)), MASK);
+ let mut w4;
+
+ rounds4!(abef, cdgh, w0, 0);
+ rounds4!(abef, cdgh, w1, 1);
+ rounds4!(abef, cdgh, w2, 2);
+ rounds4!(abef, cdgh, w3, 3);
+ schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4);
+ schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5);
+ schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6);
+ schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7);
+ schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8);
+ schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9);
+ schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10);
+ schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11);
+ schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12);
+ schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13);
+ schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14);
+ schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15);
+
+ abef = _mm_add_epi32(abef, abef_save);
+ cdgh = _mm_add_epi32(cdgh, cdgh_save);
+ }
+
+ let feba = _mm_shuffle_epi32(abef, 0x1B);
+ let dchg = _mm_shuffle_epi32(cdgh, 0xB1);
+ let dcba = _mm_blend_epi16(feba, dchg, 0xF0);
+ let hgef = _mm_alignr_epi8(dchg, feba, 8);
+
+ let state_ptr_mut = state.as_mut_ptr() as *mut __m128i;
+ _mm_storeu_si128(state_ptr_mut.add(0), dcba);
+ _mm_storeu_si128(state_ptr_mut.add(1), hgef);
+}
+
+cpufeatures::new!(shani_cpuid, "sha", "sse2", "ssse3", "sse4.1");
+
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+ // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+ // after stabilization
+ if shani_cpuid::get() {
+ unsafe {
+ digest_blocks(state, blocks);
+ }
+ } else {
+ super::soft::compress(state, blocks);
+ }
+}
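
The MASK constant and the _mm_shuffle_epi8 calls in digest_blocks do the same job as the u32::from_be_bytes loop in soft.rs: each 16-byte chunk of the message block is reordered into four big-endian u32 schedule words before the rounds run. A plain-Rust sketch of that step, for illustration only (SIMD lane ordering details aside; the helper name is ours):

    use core::convert::TryInto; // in the prelude since edition 2021; explicit here as in soft.rs

    // What the byte shuffle against MASK accomplishes, per 16-byte chunk.
    fn load_be_words(chunk: &[u8; 16]) -> [u32; 4] {
        let mut words = [0u32; 4];
        for (w, bytes) in words.iter_mut().zip(chunk.chunks_exact(4)) {
            *w = u32::from_be_bytes(bytes.try_into().unwrap());
        }
        words
    }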