Diffstat (limited to 'rust/vendor/sha2/src/sha256')
-rw-r--r-- | rust/vendor/sha2/src/sha256/aarch64.rs         | 159
-rw-r--r-- | rust/vendor/sha2/src/sha256/loongarch64_asm.rs | 227
-rw-r--r-- | rust/vendor/sha2/src/sha256/soft.rs            | 218
-rw-r--r-- | rust/vendor/sha2/src/sha256/x86.rs             | 112
4 files changed, 716 insertions, 0 deletions
diff --git a/rust/vendor/sha2/src/sha256/aarch64.rs b/rust/vendor/sha2/src/sha256/aarch64.rs
new file mode 100644
index 0000000..9d220a3
--- /dev/null
+++ b/rust/vendor/sha2/src/sha256/aarch64.rs
@@ -0,0 +1,159 @@
+//! SHA-256 `aarch64` backend.
+
+// Implementation adapted from mbedtls.
+
+// TODO: stdarch intrinsics: RustCrypto/hashes#257
+
+use core::arch::{aarch64::*, asm};
+
+use crate::consts::K32;
+
+cpufeatures::new!(sha2_hwcap, "sha2");
+
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+    // after stabilization
+    if sha2_hwcap::get() {
+        unsafe { sha256_compress(state, blocks) }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
+
+#[target_feature(enable = "sha2")]
+unsafe fn sha256_compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    // SAFETY: Requires the sha2 feature.
+
+    // Load state into vectors.
+    let mut abcd = vld1q_u32(state[0..4].as_ptr());
+    let mut efgh = vld1q_u32(state[4..8].as_ptr());
+
+    // Iterate through the message blocks.
+    for block in blocks {
+        // Keep original state values.
+        let abcd_orig = abcd;
+        let efgh_orig = efgh;
+
+        // Load the message block into vectors, assuming little endianness.
+        let mut s0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[0..16].as_ptr())));
+        let mut s1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[16..32].as_ptr())));
+        let mut s2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[32..48].as_ptr())));
+        let mut s3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[48..64].as_ptr())));
+
+        // Rounds 0 to 3
+        let mut tmp = vaddq_u32(s0, vld1q_u32(&K32[0]));
+        let mut abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 4 to 7
+        tmp = vaddq_u32(s1, vld1q_u32(&K32[4]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 8 to 11
+        tmp = vaddq_u32(s2, vld1q_u32(&K32[8]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 12 to 15
+        tmp = vaddq_u32(s3, vld1q_u32(&K32[12]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        for t in (16..64).step_by(16) {
+            // Rounds t to t + 3
+            s0 = vsha256su1q_u32(vsha256su0q_u32(s0, s1), s2, s3);
+            tmp = vaddq_u32(s0, vld1q_u32(&K32[t]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 4 to t + 7
+            s1 = vsha256su1q_u32(vsha256su0q_u32(s1, s2), s3, s0);
+            tmp = vaddq_u32(s1, vld1q_u32(&K32[t + 4]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 8 to t + 11
+            s2 = vsha256su1q_u32(vsha256su0q_u32(s2, s3), s0, s1);
+            tmp = vaddq_u32(s2, vld1q_u32(&K32[t + 8]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 12 to t + 15
+            s3 = vsha256su1q_u32(vsha256su0q_u32(s3, s0), s1, s2);
+            tmp = vaddq_u32(s3, vld1q_u32(&K32[t + 12]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+        }
+
+        // Add the block-specific state to the original state.
+        abcd = vaddq_u32(abcd, abcd_orig);
+        efgh = vaddq_u32(efgh, efgh_orig);
+    }
+
+    // Store vectors into state.
+    vst1q_u32(state[0..4].as_mut_ptr(), abcd);
+    vst1q_u32(state[4..8].as_mut_ptr(), efgh);
+}
+
+// TODO remove these polyfills once SHA2 intrinsics land
+
+#[inline(always)]
+unsafe fn vsha256hq_u32(
+    mut hash_efgh: uint32x4_t,
+    hash_abcd: uint32x4_t,
+    wk: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256H {:q}, {:q}, {:v}.4S",
+        inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    hash_efgh
+}
+
+#[inline(always)]
+unsafe fn vsha256h2q_u32(
+    mut hash_efgh: uint32x4_t,
+    hash_abcd: uint32x4_t,
+    wk: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256H2 {:q}, {:q}, {:v}.4S",
+        inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    hash_efgh
+}
+
+#[inline(always)]
+unsafe fn vsha256su0q_u32(mut w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t {
+    asm!(
+        "SHA256SU0 {:v}.4S, {:v}.4S",
+        inout(vreg) w0_3, in(vreg) w4_7,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    w0_3
+}
+
+#[inline(always)]
+unsafe fn vsha256su1q_u32(
+    mut tw0_3: uint32x4_t,
+    w8_11: uint32x4_t,
+    w12_15: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256SU1 {:v}.4S, {:v}.4S, {:v}.4S",
+        inout(vreg) tw0_3, in(vreg) w8_11, in(vreg) w12_15,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    tw0_3
+}
diff --git a/rust/vendor/sha2/src/sha256/loongarch64_asm.rs b/rust/vendor/sha2/src/sha256/loongarch64_asm.rs
new file mode 100644
index 0000000..c80fce8
--- /dev/null
+++ b/rust/vendor/sha2/src/sha256/loongarch64_asm.rs
@@ -0,0 +1,227 @@
+//! LoongArch64 assembly backend
+
+macro_rules! c {
+    ($($l:expr)*) => {
+        concat!($($l ,)*)
+    };
+}
+
+macro_rules! rounda {
+    ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+        c!(
+            "ld.w $a5, $a1, (" $i " * 4);"
+            "revb.2h $a5, $a5;"
+            "rotri.w $a5, $a5, 16;"
+            roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
+        )
+    };
+}
+
+macro_rules! roundb {
+    ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+        c!(
+            "ld.w $a4, $sp, (((" $i " - 15) & 0xF) * 4);"
+            "ld.w $a5, $sp, (((" $i " - 16) & 0xF) * 4);"
+            "ld.w $a6, $sp, (((" $i " - 7) & 0xF) * 4);"
+            "add.w $a5, $a5, $a6;"
+            "rotri.w $a6, $a4, 18;"
+            "srli.w $a7, $a4, 3;"
+            "rotri.w $a4, $a4, 7;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "add.w $a5, $a5, $a4;"
+            "ld.w $a4, $sp, (((" $i " - 2) & 0xF) * 4);"
+            "rotri.w $a6, $a4, 19;"
+            "srli.w $a7, $a4, 10;"
+            "rotri.w $a4, $a4, 17;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "add.w $a5, $a5, $a4;"
+            roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
+        )
+    };
+}
+
+macro_rules! roundtail {
+    ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+        c!(
+            // Part 0
+            "rotri.w $a6, " $e ", 11;"
+            "rotri.w $a7, " $e ", 25;"
+            "rotri.w $a4, " $e ", 6;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "xor $a6, " $g ", " $f ";"
+            "ld.w $a7, $a3, " $i " * 4;"
+            "and $a6, $a6, " $e ";"
+            "xor $a6, $a6, " $g ";"
+            "add.w $a4, $a4, $a6;"
+            "add.w $a4, $a4, $a7;"
+            "add.w " $h ", " $h ", $a5;"
+            "add.w " $h ", " $h ", $a4;"
+            // Part 1
+            "add.w " $d ", " $d ", " $h ";"
+            // Part 2
+            "rotri.w $a6, " $a ", 13;"
+            "rotri.w $a7, " $a ", 22;"
+            "rotri.w $a4, " $a ", 2;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "add.w " $h ", " $h ", $a4;"
+            "or $a4, " $c ", " $b ";"
+            "and $a6, " $c ", " $b ";"
+            "and $a4, $a4, " $a ";"
+            "or $a4, $a4, $a6;"
+            "add.w " $h ", " $h ", $a4;"
+            "st.w $a5, $sp, ((" $i " & 0xF) * 4);"
+        )
+    };
+}
+
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    if blocks.is_empty() {
+        return;
+    }
+
+    unsafe {
+        core::arch::asm!(
+            // Allocate scratch stack space
+            "addi.d $sp, $sp, -64;",
+
+            // Load state
+            "ld.w $t0, $a0, 0",
+            "ld.w $t1, $a0, 4",
+            "ld.w $t2, $a0, 8",
+            "ld.w $t3, $a0, 12",
+            "ld.w $t4, $a0, 16",
+            "ld.w $t5, $a0, 20",
+            "ld.w $t6, $a0, 24",
+            "ld.w $t7, $a0, 28",
+
+            "42:",
+
+            // Do 64 rounds of hashing
+            rounda!( 0, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            rounda!( 1, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            rounda!( 2, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            rounda!( 3, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            rounda!( 4, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            rounda!( 5, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            rounda!( 6, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            rounda!( 7, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            rounda!( 8, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            rounda!( 9, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            rounda!(10, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            rounda!(11, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            rounda!(12, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            rounda!(13, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            rounda!(14, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            rounda!(15, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(16, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(17, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(18, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(19, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(20, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(21, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(22, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(23, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(24, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(25, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(26, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(27, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(28, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(29, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(30, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(31, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(32, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(33, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(34, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(35, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(36, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(37, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(38, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(39, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(40, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(41, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(42, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(43, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(44, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(45, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(46, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(47, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(48, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(49, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(50, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(51, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(52, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(53, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(54, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(55, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(56, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(57, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(58, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(59, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(60, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(61, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(62, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(63, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+
+            // Update state registers
+            "ld.w $a4, $a0, 0",  // a
+            "ld.w $a5, $a0, 4",  // b
+            "ld.w $a6, $a0, 8",  // c
+            "ld.w $a7, $a0, 12", // d
+            "add.w $t0, $t0, $a4",
+            "add.w $t1, $t1, $a5",
+            "add.w $t2, $t2, $a6",
+            "add.w $t3, $t3, $a7",
+            "ld.w $a4, $a0, 16", // e
+            "ld.w $a5, $a0, 20", // f
+            "ld.w $a6, $a0, 24", // g
+            "ld.w $a7, $a0, 28", // h
+            "add.w $t4, $t4, $a4",
+            "add.w $t5, $t5, $a5",
+            "add.w $t6, $t6, $a6",
+            "add.w $t7, $t7, $a7",
+
+            // Save updated state
+            "st.w $t0, $a0, 0",
+            "st.w $t1, $a0, 4",
+            "st.w $t2, $a0, 8",
+            "st.w $t3, $a0, 12",
+            "st.w $t4, $a0, 16",
+            "st.w $t5, $a0, 20",
+            "st.w $t6, $a0, 24",
+            "st.w $t7, $a0, 28",
+
+            // Looping over blocks
+            "addi.d $a1, $a1, 64",
+            "addi.d $a2, $a2, -1",
+            "bnez $a2, 42b",
+
+            // Restore stack register
+            "addi.d $sp, $sp, 64",
+
+            in("$a0") state,
+            inout("$a1") blocks.as_ptr() => _,
+            inout("$a2") blocks.len() => _,
+            in("$a3") crate::consts::K32.as_ptr(),
+
+            // Clobbers
+            out("$a4") _,
+            out("$a5") _,
+            out("$a6") _,
+            out("$a7") _,
+            out("$t0") _,
+            out("$t1") _,
+            out("$t2") _,
+            out("$t3") _,
+            out("$t4") _,
out("$t5") _, + out("$t6") _, + out("$t7") _, + + options(preserves_flags), + ); + } +} diff --git a/rust/vendor/sha2/src/sha256/soft.rs b/rust/vendor/sha2/src/sha256/soft.rs new file mode 100644 index 0000000..34826a7 --- /dev/null +++ b/rust/vendor/sha2/src/sha256/soft.rs @@ -0,0 +1,218 @@ +#![allow(clippy::many_single_char_names)] +use crate::consts::BLOCK_LEN; +use core::convert::TryInto; + +#[inline(always)] +fn shl(v: [u32; 4], o: u32) -> [u32; 4] { + [v[0] >> o, v[1] >> o, v[2] >> o, v[3] >> o] +} + +#[inline(always)] +fn shr(v: [u32; 4], o: u32) -> [u32; 4] { + [v[0] << o, v[1] << o, v[2] << o, v[3] << o] +} + +#[inline(always)] +fn or(a: [u32; 4], b: [u32; 4]) -> [u32; 4] { + [a[0] | b[0], a[1] | b[1], a[2] | b[2], a[3] | b[3]] +} + +#[inline(always)] +fn xor(a: [u32; 4], b: [u32; 4]) -> [u32; 4] { + [a[0] ^ b[0], a[1] ^ b[1], a[2] ^ b[2], a[3] ^ b[3]] +} + +#[inline(always)] +fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] { + [ + a[0].wrapping_add(b[0]), + a[1].wrapping_add(b[1]), + a[2].wrapping_add(b[2]), + a[3].wrapping_add(b[3]), + ] +} + +fn sha256load(v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] { + [v3[3], v2[0], v2[1], v2[2]] +} + +fn sha256swap(v0: [u32; 4]) -> [u32; 4] { + [v0[2], v0[3], v0[0], v0[1]] +} + +fn sha256msg1(v0: [u32; 4], v1: [u32; 4]) -> [u32; 4] { + // sigma 0 on vectors + #[inline] + fn sigma0x4(x: [u32; 4]) -> [u32; 4] { + let t1 = or(shl(x, 7), shr(x, 25)); + let t2 = or(shl(x, 18), shr(x, 14)); + let t3 = shl(x, 3); + xor(xor(t1, t2), t3) + } + + add(v0, sigma0x4(sha256load(v0, v1))) +} + +fn sha256msg2(v4: [u32; 4], v3: [u32; 4]) -> [u32; 4] { + macro_rules! sigma1 { + ($a:expr) => { + $a.rotate_right(17) ^ $a.rotate_right(19) ^ ($a >> 10) + }; + } + + let [x3, x2, x1, x0] = v4; + let [w15, w14, _, _] = v3; + + let w16 = x0.wrapping_add(sigma1!(w14)); + let w17 = x1.wrapping_add(sigma1!(w15)); + let w18 = x2.wrapping_add(sigma1!(w16)); + let w19 = x3.wrapping_add(sigma1!(w17)); + + [w19, w18, w17, w16] +} + +fn sha256_digest_round_x2(cdgh: [u32; 4], abef: [u32; 4], wk: [u32; 4]) -> [u32; 4] { + macro_rules! big_sigma0 { + ($a:expr) => { + ($a.rotate_right(2) ^ $a.rotate_right(13) ^ $a.rotate_right(22)) + }; + } + macro_rules! big_sigma1 { + ($a:expr) => { + ($a.rotate_right(6) ^ $a.rotate_right(11) ^ $a.rotate_right(25)) + }; + } + macro_rules! bool3ary_202 { + ($a:expr, $b:expr, $c:expr) => { + $c ^ ($a & ($b ^ $c)) + }; + } // Choose, MD5F, SHA1C + macro_rules! bool3ary_232 { + ($a:expr, $b:expr, $c:expr) => { + ($a & $b) ^ ($a & $c) ^ ($b & $c) + }; + } // Majority, SHA1M + + let [_, _, wk1, wk0] = wk; + let [a0, b0, e0, f0] = abef; + let [c0, d0, g0, h0] = cdgh; + + // a round + let x0 = big_sigma1!(e0) + .wrapping_add(bool3ary_202!(e0, f0, g0)) + .wrapping_add(wk0) + .wrapping_add(h0); + let y0 = big_sigma0!(a0).wrapping_add(bool3ary_232!(a0, b0, c0)); + let (a1, b1, c1, d1, e1, f1, g1, h1) = ( + x0.wrapping_add(y0), + a0, + b0, + c0, + x0.wrapping_add(d0), + e0, + f0, + g0, + ); + + // a round + let x1 = big_sigma1!(e1) + .wrapping_add(bool3ary_202!(e1, f1, g1)) + .wrapping_add(wk1) + .wrapping_add(h1); + let y1 = big_sigma0!(a1).wrapping_add(bool3ary_232!(a1, b1, c1)); + let (a2, b2, _, _, e2, f2, _, _) = ( + x1.wrapping_add(y1), + a1, + b1, + c1, + x1.wrapping_add(d1), + e1, + f1, + g1, + ); + + [a2, b2, e2, f2] +} + +fn schedule(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] { + let t1 = sha256msg1(v0, v1); + let t2 = sha256load(v2, v3); + let t3 = add(t1, t2); + sha256msg2(t3, v3) +} + +macro_rules! 
rounds4 { + ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ + let t1 = add($rest, crate::consts::K32X4[$i]); + $cdgh = sha256_digest_round_x2($cdgh, $abef, t1); + let t2 = sha256swap(t1); + $abef = sha256_digest_round_x2($abef, $cdgh, t2); + }}; +} + +macro_rules! schedule_rounds4 { + ( + $abef:ident, $cdgh:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i: expr + ) => {{ + $w4 = schedule($w0, $w1, $w2, $w3); + rounds4!($abef, $cdgh, $w4, $i); + }}; +} + +/// Process a block with the SHA-256 algorithm. +fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { + let mut abef = [state[0], state[1], state[4], state[5]]; + let mut cdgh = [state[2], state[3], state[6], state[7]]; + + // Rounds 0..64 + let mut w0 = [block[3], block[2], block[1], block[0]]; + let mut w1 = [block[7], block[6], block[5], block[4]]; + let mut w2 = [block[11], block[10], block[9], block[8]]; + let mut w3 = [block[15], block[14], block[13], block[12]]; + let mut w4; + + rounds4!(abef, cdgh, w0, 0); + rounds4!(abef, cdgh, w1, 1); + rounds4!(abef, cdgh, w2, 2); + rounds4!(abef, cdgh, w3, 3); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15); + + let [a, b, e, f] = abef; + let [c, d, g, h] = cdgh; + + state[0] = state[0].wrapping_add(a); + state[1] = state[1].wrapping_add(b); + state[2] = state[2].wrapping_add(c); + state[3] = state[3].wrapping_add(d); + state[4] = state[4].wrapping_add(e); + state[5] = state[5].wrapping_add(f); + state[6] = state[6].wrapping_add(g); + state[7] = state[7].wrapping_add(h); +} + +pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + let mut block_u32 = [0u32; BLOCK_LEN]; + // since LLVM can't properly use aliasing yet it will make + // unnecessary state stores without this copy + let mut state_cpy = *state; + for block in blocks { + for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { + *o = u32::from_be_bytes(chunk.try_into().unwrap()); + } + sha256_digest_block_u32(&mut state_cpy, &block_u32); + } + *state = state_cpy; +} diff --git a/rust/vendor/sha2/src/sha256/x86.rs b/rust/vendor/sha2/src/sha256/x86.rs new file mode 100644 index 0000000..4601938 --- /dev/null +++ b/rust/vendor/sha2/src/sha256/x86.rs @@ -0,0 +1,112 @@ +//! SHA-256 `x86`/`x86_64` backend + +#![allow(clippy::many_single_char_names)] + +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i { + let t1 = _mm_sha256msg1_epu32(v0, v1); + let t2 = _mm_alignr_epi8(v3, v2, 4); + let t3 = _mm_add_epi32(t1, t2); + _mm_sha256msg2_epu32(t3, v3) +} + +macro_rules! 
rounds4 { + ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ + let k = crate::consts::K32X4[$i]; + let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32); + let t1 = _mm_add_epi32($rest, kv); + $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1); + let t2 = _mm_shuffle_epi32(t1, 0x0E); + $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2); + }}; +} + +macro_rules! schedule_rounds4 { + ( + $abef:ident, $cdgh:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i: expr + ) => {{ + $w4 = schedule($w0, $w1, $w2, $w3); + rounds4!($abef, $cdgh, $w4, $i); + }}; +} + +// we use unaligned loads with `__m128i` pointers +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "sha,sse2,ssse3,sse4.1")] +unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + #[allow(non_snake_case)] + let MASK: __m128i = _mm_set_epi64x( + 0x0C0D_0E0F_0809_0A0Bu64 as i64, + 0x0405_0607_0001_0203u64 as i64, + ); + + let state_ptr = state.as_ptr() as *const __m128i; + let dcba = _mm_loadu_si128(state_ptr.add(0)); + let efgh = _mm_loadu_si128(state_ptr.add(1)); + + let cdab = _mm_shuffle_epi32(dcba, 0xB1); + let efgh = _mm_shuffle_epi32(efgh, 0x1B); + let mut abef = _mm_alignr_epi8(cdab, efgh, 8); + let mut cdgh = _mm_blend_epi16(efgh, cdab, 0xF0); + + for block in blocks { + let abef_save = abef; + let cdgh_save = cdgh; + + let data_ptr = block.as_ptr() as *const __m128i; + let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(0)), MASK); + let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(1)), MASK); + let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(2)), MASK); + let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(3)), MASK); + let mut w4; + + rounds4!(abef, cdgh, w0, 0); + rounds4!(abef, cdgh, w1, 1); + rounds4!(abef, cdgh, w2, 2); + rounds4!(abef, cdgh, w3, 3); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15); + + abef = _mm_add_epi32(abef, abef_save); + cdgh = _mm_add_epi32(cdgh, cdgh_save); + } + + let feba = _mm_shuffle_epi32(abef, 0x1B); + let dchg = _mm_shuffle_epi32(cdgh, 0xB1); + let dcba = _mm_blend_epi16(feba, dchg, 0xF0); + let hgef = _mm_alignr_epi8(dchg, feba, 8); + + let state_ptr_mut = state.as_mut_ptr() as *mut __m128i; + _mm_storeu_si128(state_ptr_mut.add(0), dcba); + _mm_storeu_si128(state_ptr_mut.add(1), hgef); +} + +cpufeatures::new!(shani_cpuid, "sha", "sse2", "ssse3", "sse4.1"); + +pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 + // after stabilization + if shani_cpuid::get() { + unsafe { + digest_blocks(state, blocks); + } + } else { + super::soft::compress(state, blocks); + } +} |
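Note: all four backends in this diff expose the same entry point, `compress(state: &mut [u32; 8], blocks: &[[u8; 64]])`, and the aarch64 and x86 backends fall back to `super::soft::compress` when the required CPU features are absent. As a rough illustration only (not part of the diff above), a minimal sketch of driving one of these backends directly, assuming the caller supplies the standard SHA-256 initial hash values and a pre-padded 64-byte block:

    // Hypothetical driver; in the real crate the sha2 Digest wrapper performs
    // padding and backend dispatch, so user code never calls compress() directly.
    fn example() {
        // Standard SHA-256 initial hash values (FIPS 180-4).
        let mut state: [u32; 8] = [
            0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
            0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
        ];
        // One pre-padded block for the empty message: 0x80 marker, zero length.
        let mut block = [0u8; 64];
        block[0] = 0x80;
        compress(&mut state, &[block]);
        // state now holds the empty-message digest (e3b0c442 98fc1c14 ...)
        // as eight big-endian words.
    }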