Diffstat (limited to 'rust/vendor/sha1/src/compress')
-rw-r--r--   rust/vendor/sha1/src/compress/aarch64.rs          |  18
-rw-r--r--   rust/vendor/sha1/src/compress/loongarch64_asm.rs  | 255
-rw-r--r--   rust/vendor/sha1/src/compress/soft.rs             | 260
-rw-r--r--   rust/vendor/sha1/src/compress/x86.rs              | 112
4 files changed, 645 insertions, 0 deletions
diff --git a/rust/vendor/sha1/src/compress/aarch64.rs b/rust/vendor/sha1/src/compress/aarch64.rs
new file mode 100644
index 0000000..5952d1f
--- /dev/null
+++ b/rust/vendor/sha1/src/compress/aarch64.rs
@@ -0,0 +1,18 @@
+//! SHA-1 `aarch64` backend.
+
+// Per rustc target feature docs for `aarch64-unknown-linux-gnu` and
+// `aarch64-apple-darwin` platforms, the `sha2` target feature enables
+// SHA-1 as well:
+//
+// > Enable SHA1 and SHA256 support.
+cpufeatures::new!(sha1_hwcap, "sha2");
+
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+    // after stabilization
+    if sha1_hwcap::get() {
+        sha1_asm::compress(state, blocks);
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
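The dispatch above is the usual cpufeatures pattern: `sha1_hwcap::get()` probes the CPU once and caches the result. For reference, here is a minimal sketch (not part of the vendored code) of the same dispatch written against the `InitToken` API that the generated `sha1_hwcap` module also exposes; the `Sha1Compressor` wrapper name is hypothetical:

    // Hypothetical wrapper: hold the InitToken so repeated calls skip the
    // "has the probe run yet?" check that sha1_hwcap::get() performs.
    pub struct Sha1Compressor {
        token: sha1_hwcap::InitToken,
    }

    impl Sha1Compressor {
        pub fn new() -> Self {
            // Probes the CPU once; the result is cached in an atomic.
            Self { token: sha1_hwcap::init() }
        }

        pub fn compress(&self, state: &mut [u32; 5], blocks: &[[u8; 64]]) {
            if self.token.get() {
                sha1_asm::compress(state, blocks); // hardware SHA1 instructions
            } else {
                super::soft::compress(state, blocks); // portable fallback
            }
        }
    }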
diff --git a/rust/vendor/sha1/src/compress/loongarch64_asm.rs b/rust/vendor/sha1/src/compress/loongarch64_asm.rs
new file mode 100644
index 0000000..facef1b
--- /dev/null
+++ b/rust/vendor/sha1/src/compress/loongarch64_asm.rs
@@ -0,0 +1,255 @@
+//! LoongArch64 assembly backend
+
+use core::arch::asm;
+
+const K: [u32; 4] = [0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6];
+
+macro_rules! c {
+    ($($l:expr)*) => {
+        concat!($($l ,)*)
+    };
+}
+
+macro_rules! round0a {
+    ($a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $i:literal) => {
+        c!(
+            "ld.w $t5, $a1, (" $i " * 4);"
+            "revb.2h $t5, $t5;"
+            "rotri.w $t5, $t5, 16;"
+            "add.w " $e ", " $e ", $t5;"
+            "st.w $t5, $sp, (" $i " * 4);"
+            "xor $t5, " $c "," $d ";"
+            "and $t5, $t5, " $b ";"
+            "xor $t5, $t5, " $d ";"
+            roundtail!($a, $b, $e, $i, "$a4")
+        )
+    };
+}
+
+macro_rules! scheldule {
+    ($i:literal, $e:literal) => {
+        c!(
+            "ld.w $t5, $sp, (((" $i " - 3) & 0xF) * 4);"
+            "ld.w $t6, $sp, (((" $i " - 8) & 0xF) * 4);"
+            "ld.w $t7, $sp, (((" $i " - 14) & 0xF) * 4);"
+            "ld.w $t8, $sp, (((" $i " - 16) & 0xF) * 4);"
+            "xor $t5, $t5, $t6;"
+            "xor $t5, $t5, $t7;"
+            "xor $t5, $t5, $t8;"
+            "rotri.w $t5, $t5, 31;"
+            "add.w " $e "," $e ", $t5;"
+            "st.w $t5, $sp, ((" $i " & 0xF) * 4);"
+        )
+    };
+}
+
+macro_rules! round0b {
+    ($a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $i:literal) => {
+        c!(
+            scheldule!($i, $e)
+            "xor $t5," $c "," $d ";"
+            "and $t5, $t5," $b ";"
+            "xor $t5, $t5," $d ";"
+            roundtail!($a, $b, $e, $i, "$a4")
+        )
+    };
+}
+
+macro_rules! round1 {
+    ($a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $i:literal) => {
+        c!(
+            scheldule!($i, $e)
+            "xor $t5," $b "," $c ";"
+            "xor $t5, $t5," $d ";"
+            roundtail!($a, $b, $e, $i, "$a5")
+        )
+    };
+}
+
+macro_rules! round2 {
+    ($a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $i:literal) => {
+        c!(
+            scheldule!($i, $e)
+            "or $t5," $c "," $d ";"
+            "and $t5, $t5, " $b ";"
+            "and $t7," $c "," $d ";"
+            "or $t5, $t5, $t7;"
+            roundtail!($a, $b, $e, $i, "$a6")
+        )
+    };
+}
+
+macro_rules! round3 {
+    ($a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $i:literal) => {
+        c!(
+            scheldule!($i, $e)
+            "xor $t5," $b "," $c ";"
+            "xor $t5, $t5," $d ";"
+            roundtail!($a, $b, $e, $i, "$a7")
+        )
+    };
+}
+
+macro_rules! roundtail {
+    ($a:literal, $b:literal, $e:literal, $i:literal, $k:literal) => {
+        c!(
+            "rotri.w " $b "," $b ", 2;"
+            "add.w " $e "," $e ", $t5;"
+            "add.w " $e "," $e "," $k ";"
+            "rotri.w $t5," $a ", 27;"
+            "add.w " $e "," $e ", $t5;"
+        )
+    };
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    if blocks.is_empty() {
+        return;
+    }
+
+    unsafe {
+        asm!(
+            // Allocate scratch stack space
+            "addi.d $sp, $sp, -64;",
+
+            // Load state
+            "ld.w $t0, $a0, 0",
+            "ld.w $t1, $a0, 4",
+            "ld.w $t2, $a0, 8",
+            "ld.w $t3, $a0, 12",
+            "ld.w $t4, $a0, 16",
+
+            "42:",
+
+            round0a!("$t0", "$t1", "$t2", "$t3", "$t4", 0),
+            round0a!("$t4", "$t0", "$t1", "$t2", "$t3", 1),
+            round0a!("$t3", "$t4", "$t0", "$t1", "$t2", 2),
+            round0a!("$t2", "$t3", "$t4", "$t0", "$t1", 3),
+            round0a!("$t1", "$t2", "$t3", "$t4", "$t0", 4),
+            round0a!("$t0", "$t1", "$t2", "$t3", "$t4", 5),
+            round0a!("$t4", "$t0", "$t1", "$t2", "$t3", 6),
+            round0a!("$t3", "$t4", "$t0", "$t1", "$t2", 7),
+            round0a!("$t2", "$t3", "$t4", "$t0", "$t1", 8),
+            round0a!("$t1", "$t2", "$t3", "$t4", "$t0", 9),
+            round0a!("$t0", "$t1", "$t2", "$t3", "$t4", 10),
+            round0a!("$t4", "$t0", "$t1", "$t2", "$t3", 11),
+            round0a!("$t3", "$t4", "$t0", "$t1", "$t2", 12),
+            round0a!("$t2", "$t3", "$t4", "$t0", "$t1", 13),
+            round0a!("$t1", "$t2", "$t3", "$t4", "$t0", 14),
+            round0a!("$t0", "$t1", "$t2", "$t3", "$t4", 15),
+            round0b!("$t4", "$t0", "$t1", "$t2", "$t3", 16),
+            round0b!("$t3", "$t4", "$t0", "$t1", "$t2", 17),
+            round0b!("$t2", "$t3", "$t4", "$t0", "$t1", 18),
+            round0b!("$t1", "$t2", "$t3", "$t4", "$t0", 19),
+            round1!("$t0", "$t1", "$t2", "$t3", "$t4", 20),
+            round1!("$t4", "$t0", "$t1", "$t2", "$t3", 21),
+            round1!("$t3", "$t4", "$t0", "$t1", "$t2", 22),
+            round1!("$t2", "$t3", "$t4", "$t0", "$t1", 23),
+            round1!("$t1", "$t2", "$t3", "$t4", "$t0", 24),
+            round1!("$t0", "$t1", "$t2", "$t3", "$t4", 25),
+            round1!("$t4", "$t0", "$t1", "$t2", "$t3", 26),
+            round1!("$t3", "$t4", "$t0", "$t1", "$t2", 27),
+            round1!("$t2", "$t3", "$t4", "$t0", "$t1", 28),
+            round1!("$t1", "$t2", "$t3", "$t4", "$t0", 29),
+            round1!("$t0", "$t1", "$t2", "$t3", "$t4", 30),
+            round1!("$t4", "$t0", "$t1", "$t2", "$t3", 31),
+            round1!("$t3", "$t4", "$t0", "$t1", "$t2", 32),
+            round1!("$t2", "$t3", "$t4", "$t0", "$t1", 33),
+            round1!("$t1", "$t2", "$t3", "$t4", "$t0", 34),
+            round1!("$t0", "$t1", "$t2", "$t3", "$t4", 35),
+            round1!("$t4", "$t0", "$t1", "$t2", "$t3", 36),
+            round1!("$t3", "$t4", "$t0", "$t1", "$t2", 37),
+            round1!("$t2", "$t3", "$t4", "$t0", "$t1", 38),
+            round1!("$t1", "$t2", "$t3", "$t4", "$t0", 39),
+            round2!("$t0", "$t1", "$t2", "$t3", "$t4", 40),
+            round2!("$t4", "$t0", "$t1", "$t2", "$t3", 41),
+            round2!("$t3", "$t4", "$t0", "$t1", "$t2", 42),
+            round2!("$t2", "$t3", "$t4", "$t0", "$t1", 43),
+            round2!("$t1", "$t2", "$t3", "$t4", "$t0", 44),
+            round2!("$t0", "$t1", "$t2", "$t3", "$t4", 45),
+            round2!("$t4", "$t0", "$t1", "$t2", "$t3", 46),
+            round2!("$t3", "$t4", "$t0", "$t1", "$t2", 47),
+            round2!("$t2", "$t3", "$t4", "$t0", "$t1", 48),
+            round2!("$t1", "$t2", "$t3", "$t4", "$t0", 49),
+            round2!("$t0", "$t1", "$t2", "$t3", "$t4", 50),
+            round2!("$t4", "$t0", "$t1", "$t2", "$t3", 51),
+            round2!("$t3", "$t4", "$t0", "$t1", "$t2", 52),
+            round2!("$t2", "$t3", "$t4", "$t0", "$t1", 53),
+            round2!("$t1", "$t2", "$t3", "$t4", "$t0", 54),
+            round2!("$t0", "$t1", "$t2", "$t3", "$t4", 55),
+            round2!("$t4", "$t0", "$t1", "$t2", "$t3", 56),
+            round2!("$t3", "$t4", "$t0", "$t1", "$t2", 57),
+            round2!("$t2", "$t3", "$t4", "$t0", "$t1", 58),
+            round2!("$t1", "$t2", "$t3", "$t4", "$t0", 59),
+            round3!("$t0", "$t1", "$t2", "$t3", "$t4", 60),
+            round3!("$t4", "$t0", "$t1", "$t2", "$t3", 61),
+            round3!("$t3", "$t4", "$t0", "$t1", "$t2", 62),
+            round3!("$t2", "$t3", "$t4", "$t0", "$t1", 63),
+            round3!("$t1", "$t2", "$t3", "$t4", "$t0", 64),
+            round3!("$t0", "$t1", "$t2", "$t3", "$t4", 65),
+            round3!("$t4", "$t0", "$t1", "$t2", "$t3", 66),
+            round3!("$t3", "$t4", "$t0", "$t1", "$t2", 67),
+            round3!("$t2", "$t3", "$t4", "$t0", "$t1", 68),
+            round3!("$t1", "$t2", "$t3", "$t4", "$t0", 69),
+            round3!("$t0", "$t1", "$t2", "$t3", "$t4", 70),
+            round3!("$t4", "$t0", "$t1", "$t2", "$t3", 71),
+            round3!("$t3", "$t4", "$t0", "$t1", "$t2", 72),
+            round3!("$t2", "$t3", "$t4", "$t0", "$t1", 73),
+            round3!("$t1", "$t2", "$t3", "$t4", "$t0", 74),
+            round3!("$t0", "$t1", "$t2", "$t3", "$t4", 75),
+            round3!("$t4", "$t0", "$t1", "$t2", "$t3", 76),
+            round3!("$t3", "$t4", "$t0", "$t1", "$t2", 77),
+            round3!("$t2", "$t3", "$t4", "$t0", "$t1", 78),
+            round3!("$t1", "$t2", "$t3", "$t4", "$t0", 79),
+
+            // Update state registers
+            "ld.w $t5, $a0, 0",  // a
+            "ld.w $t6, $a0, 4",  // b
+            "ld.w $t7, $a0, 8",  // c
+            "ld.w $t8, $a0, 12", // d
+            "add.w $t0, $t0, $t5",
+            "ld.w $t5, $a0, 16", // e
+            "add.w $t1, $t1, $t6",
+            "add.w $t2, $t2, $t7",
+            "add.w $t3, $t3, $t8",
+            "add.w $t4, $t4, $t5",
+
+            // Save updated state
+            "st.w $t0, $a0, 0",
+            "st.w $t1, $a0, 4",
+            "st.w $t2, $a0, 8",
+            "st.w $t3, $a0, 12",
+            "st.w $t4, $a0, 16",
+
+            // Looping over blocks
+            "addi.d $a1, $a1, 64",
+            "addi.d $a2, $a2, -1",
+            "bnez $a2, 42b",
+
+            // Restore stack register
+            "addi.d $sp, $sp, 64",
+
+            in("$a0") state,
+            inout("$a1") blocks.as_ptr() => _,
+            inout("$a2") blocks.len() => _,
+
+            in("$a4") K[0],
+            in("$a5") K[1],
+            in("$a6") K[2],
+            in("$a7") K[3],
+
+            // Clobbers
+            out("$t0") _,
+            out("$t1") _,
+            out("$t2") _,
+            out("$t3") _,
+            out("$t4") _,
+            out("$t5") _,
+            out("$t6") _,
+            out("$t7") _,
+            out("$t8") _,
+
+            options(preserves_flags),
+        );
+    }
+}
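For readers cross-checking the assembly: the `scheldule!` macro above implements the standard SHA-1 message-schedule recurrence over a 16-word ring buffer kept in the stack scratch space, and `rotri.w ..., 31` (rotate right by 31) is a left-rotate by 1. A portable Rust sketch of the same step, for illustration only:

    // w[i] = rotl1(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]), indices taken mod 16,
    // mirroring the four `ld.w`/`xor` pairs and the final `st.w` in the macro.
    fn schedule_step(w: &mut [u32; 16], i: usize) -> u32 {
        let t = (w[(i + 13) & 0xF] ^ w[(i + 8) & 0xF] ^ w[(i + 2) & 0xF] ^ w[i & 0xF])
            .rotate_left(1);
        w[i & 0xF] = t;
        t
    }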
diff --git a/rust/vendor/sha1/src/compress/soft.rs b/rust/vendor/sha1/src/compress/soft.rs
new file mode 100644
index 0000000..0b9fb27
--- /dev/null
+++ b/rust/vendor/sha1/src/compress/soft.rs
@@ -0,0 +1,260 @@
+#![allow(clippy::many_single_char_names)]
+use super::BLOCK_SIZE;
+use core::convert::TryInto;
+
+const K: [u32; 4] = [0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6];
+
+#[inline(always)]
+fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
+    [
+        a[0].wrapping_add(b[0]),
+        a[1].wrapping_add(b[1]),
+        a[2].wrapping_add(b[2]),
+        a[3].wrapping_add(b[3]),
+    ]
+}
+
+#[inline(always)]
+fn xor(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
+    [a[0] ^ b[0], a[1] ^ b[1], a[2] ^ b[2], a[3] ^ b[3]]
+}
+
+#[inline]
+pub fn sha1_first_add(e: u32, w0: [u32; 4]) -> [u32; 4] {
+    let [a, b, c, d] = w0;
+    [e.wrapping_add(a), b, c, d]
+}
+
+fn sha1msg1(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
+    let [_, _, w2, w3] = a;
+    let [w4, w5, _, _] = b;
+    [a[0] ^ w2, a[1] ^ w3, a[2] ^ w4, a[3] ^ w5]
+}
+
+fn sha1msg2(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
+    let [x0, x1, x2, x3] = a;
+    let [_, w13, w14, w15] = b;
+
+    let w16 = (x0 ^ w13).rotate_left(1);
+    let w17 = (x1 ^ w14).rotate_left(1);
+    let w18 = (x2 ^ w15).rotate_left(1);
+    let w19 = (x3 ^ w16).rotate_left(1);
+
+    [w16, w17, w18, w19]
+}
+
+#[inline]
+fn sha1_first_half(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
+    sha1_first_add(abcd[0].rotate_left(30), msg)
+}
+
+fn sha1_digest_round_x4(abcd: [u32; 4], work: [u32; 4], i: i8) -> [u32; 4] {
+    match i {
+        0 => sha1rnds4c(abcd, add(work, [K[0]; 4])),
+        1 => sha1rnds4p(abcd, add(work, [K[1]; 4])),
+        2 => sha1rnds4m(abcd, add(work, [K[2]; 4])),
+        3 => sha1rnds4p(abcd, add(work, [K[3]; 4])),
+        _ => unreachable!("unknown icosaround index"),
+    }
+}
+
+fn sha1rnds4c(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
+    let [mut a, mut b, mut c, mut d] = abcd;
+    let [t, u, v, w] = msg;
+    let mut e = 0u32;
+
+    macro_rules! bool3ary_202 {
+        ($a:expr, $b:expr, $c:expr) => {
+            $c ^ ($a & ($b ^ $c))
+        };
+    } // Choose, MD5F, SHA1C
+
+    e = e
+        .wrapping_add(a.rotate_left(5))
+        .wrapping_add(bool3ary_202!(b, c, d))
+        .wrapping_add(t);
+    b = b.rotate_left(30);
+
+    d = d
+        .wrapping_add(e.rotate_left(5))
+        .wrapping_add(bool3ary_202!(a, b, c))
+        .wrapping_add(u);
+    a = a.rotate_left(30);
+
+    c = c
+        .wrapping_add(d.rotate_left(5))
+        .wrapping_add(bool3ary_202!(e, a, b))
+        .wrapping_add(v);
+    e = e.rotate_left(30);
+
+    b = b
+        .wrapping_add(c.rotate_left(5))
+        .wrapping_add(bool3ary_202!(d, e, a))
+        .wrapping_add(w);
+    d = d.rotate_left(30);
+
+    [b, c, d, e]
+}
+
+fn sha1rnds4p(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
+    let [mut a, mut b, mut c, mut d] = abcd;
+    let [t, u, v, w] = msg;
+    let mut e = 0u32;
+
+    macro_rules! bool3ary_150 {
+        ($a:expr, $b:expr, $c:expr) => {
+            $a ^ $b ^ $c
+        };
+    } // Parity, XOR, MD5H, SHA1P
+
+    e = e
+        .wrapping_add(a.rotate_left(5))
+        .wrapping_add(bool3ary_150!(b, c, d))
+        .wrapping_add(t);
+    b = b.rotate_left(30);
+
+    d = d
+        .wrapping_add(e.rotate_left(5))
+        .wrapping_add(bool3ary_150!(a, b, c))
+        .wrapping_add(u);
+    a = a.rotate_left(30);
+
+    c = c
+        .wrapping_add(d.rotate_left(5))
+        .wrapping_add(bool3ary_150!(e, a, b))
+        .wrapping_add(v);
+    e = e.rotate_left(30);
+
+    b = b
+        .wrapping_add(c.rotate_left(5))
+        .wrapping_add(bool3ary_150!(d, e, a))
+        .wrapping_add(w);
+    d = d.rotate_left(30);
+
+    [b, c, d, e]
+}
+
+fn sha1rnds4m(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
+    let [mut a, mut b, mut c, mut d] = abcd;
+    let [t, u, v, w] = msg;
+    let mut e = 0u32;
+
+    macro_rules! bool3ary_232 {
+        ($a:expr, $b:expr, $c:expr) => {
+            ($a & $b) ^ ($a & $c) ^ ($b & $c)
+        };
+    } // Majority, SHA1M
+
+    e = e
+        .wrapping_add(a.rotate_left(5))
+        .wrapping_add(bool3ary_232!(b, c, d))
+        .wrapping_add(t);
+    b = b.rotate_left(30);
+
+    d = d
+        .wrapping_add(e.rotate_left(5))
+        .wrapping_add(bool3ary_232!(a, b, c))
+        .wrapping_add(u);
+    a = a.rotate_left(30);
+
+    c = c
+        .wrapping_add(d.rotate_left(5))
+        .wrapping_add(bool3ary_232!(e, a, b))
+        .wrapping_add(v);
+    e = e.rotate_left(30);
+
+    b = b
+        .wrapping_add(c.rotate_left(5))
+        .wrapping_add(bool3ary_232!(d, e, a))
+        .wrapping_add(w);
+    d = d.rotate_left(30);
+
+    [b, c, d, e]
+}
+
+macro_rules! rounds4 {
+    ($h0:ident, $h1:ident, $wk:expr, $i:expr) => {
+        sha1_digest_round_x4($h0, sha1_first_half($h1, $wk), $i)
+    };
+}
+
+macro_rules! schedule {
+    ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => {
+        sha1msg2(xor(sha1msg1($v0, $v1), $v2), $v3)
+    };
+}
+
+macro_rules! schedule_rounds4 {
+    (
+        $h0:ident, $h1:ident,
+        $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr,
+        $i:expr
+    ) => {
+        $w4 = schedule!($w0, $w1, $w2, $w3);
+        $h1 = rounds4!($h0, $h1, $w4, $i);
+    };
+}
+
+#[inline(always)]
+fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) {
+    let mut w0 = [block[0], block[1], block[2], block[3]];
+    let mut w1 = [block[4], block[5], block[6], block[7]];
+    let mut w2 = [block[8], block[9], block[10], block[11]];
+    let mut w3 = [block[12], block[13], block[14], block[15]];
+    #[allow(clippy::needless_late_init)]
+    let mut w4;
+
+    let mut h0 = [state[0], state[1], state[2], state[3]];
+    let mut h1 = sha1_first_add(state[4], w0);
+
+    // Rounds 0..20
+    h1 = sha1_digest_round_x4(h0, h1, 0);
+    h0 = rounds4!(h1, h0, w1, 0);
+    h1 = rounds4!(h0, h1, w2, 0);
+    h0 = rounds4!(h1, h0, w3, 0);
+    schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 0);
+
+    // Rounds 20..40
+    schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 1);
+    schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 1);
+    schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 1);
+    schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 1);
+    schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 1);
+
+    // Rounds 40..60
+    schedule_rounds4!(h0, h1, w1, w2, w3, w4, w0, 2);
+    schedule_rounds4!(h1, h0, w2, w3, w4, w0, w1, 2);
+    schedule_rounds4!(h0, h1, w3, w4, w0, w1, w2, 2);
+    schedule_rounds4!(h1, h0, w4, w0, w1, w2, w3, 2);
+    schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 2);
+
+    // Rounds 60..80
+    schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 3);
+    schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 3);
+    schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 3);
+    schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 3);
+    schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 3);
+
+    let e = h1[0].rotate_left(30);
+    let [a, b, c, d] = h0;
+
+    state[0] = state[0].wrapping_add(a);
+    state[1] = state[1].wrapping_add(b);
+    state[2] = state[2].wrapping_add(c);
+    state[3] = state[3].wrapping_add(d);
+    state[4] = state[4].wrapping_add(e);
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; BLOCK_SIZE]]) {
+    let mut block_u32 = [0u32; BLOCK_SIZE / 4];
+    // since LLVM can't properly use aliasing yet it will make
+    // unnecessary state stores without this copy
+    let mut state_cpy = *state;
+    for block in blocks.iter() {
+        for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
+            *o = u32::from_be_bytes(chunk.try_into().unwrap());
+        }
+        sha1_digest_block_u32(&mut state_cpy, &block_u32);
+    }
+    *state = state_cpy;
+}
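As a quick sanity check of the `compress` entry point above, here is a test-style sketch (not part of the diff; it assumes `BLOCK_SIZE == 64`) that hashes the empty message by hand-padding a single block and comparing against the well-known digest da39a3ee5e6b4b0d3255bfef95601890afd80709:

    #[test]
    fn empty_message_digest() {
        // SHA-1 initialization vector
        let mut state: [u32; 5] = [0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0];
        // Padding for the empty message: a 0x80 byte, zeros, and a zero
        // 64-bit big-endian bit-length field.
        let mut block = [0u8; 64];
        block[0] = 0x80;
        compress(&mut state, &[block]);
        assert_eq!(
            state,
            [0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709]
        );
    }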
diff --git a/rust/vendor/sha1/src/compress/x86.rs b/rust/vendor/sha1/src/compress/x86.rs
new file mode 100644
index 0000000..4dcd56b
--- /dev/null
+++ b/rust/vendor/sha1/src/compress/x86.rs
@@ -0,0 +1,112 @@
+//! SHA-1 `x86`/`x86_64` backend
+
+#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+macro_rules! rounds4 {
+    ($h0:ident, $h1:ident, $wk:expr, $i:expr) => {
+        _mm_sha1rnds4_epu32($h0, _mm_sha1nexte_epu32($h1, $wk), $i)
+    };
+}
+
+macro_rules! schedule {
+    ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => {
+        _mm_sha1msg2_epu32(_mm_xor_si128(_mm_sha1msg1_epu32($v0, $v1), $v2), $v3)
+    };
+}
+
+macro_rules! schedule_rounds4 {
+    (
+        $h0:ident, $h1:ident,
+        $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr,
+        $i:expr
+    ) => {
+        $w4 = schedule!($w0, $w1, $w2, $w3);
+        $h1 = rounds4!($h0, $h1, $w4, $i);
+    };
+}
+
+#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
+unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    #[allow(non_snake_case)]
+    let MASK: __m128i = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0A0B_0C0D_0E0F);
+
+    let mut state_abcd = _mm_set_epi32(
+        state[0] as i32,
+        state[1] as i32,
+        state[2] as i32,
+        state[3] as i32,
+    );
+    let mut state_e = _mm_set_epi32(state[4] as i32, 0, 0, 0);
+
+    for block in blocks {
+        // SAFETY: we use only unaligned loads with this pointer
+        #[allow(clippy::cast_ptr_alignment)]
+        let block_ptr = block.as_ptr() as *const __m128i;
+
+        let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(0)), MASK);
+        let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(1)), MASK);
+        let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(2)), MASK);
+        let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(3)), MASK);
+        #[allow(clippy::needless_late_init)]
+        let mut w4;
+
+        let mut h0 = state_abcd;
+        let mut h1 = _mm_add_epi32(state_e, w0);
+
+        // Rounds 0..20
+        h1 = _mm_sha1rnds4_epu32(h0, h1, 0);
+        h0 = rounds4!(h1, h0, w1, 0);
+        h1 = rounds4!(h0, h1, w2, 0);
+        h0 = rounds4!(h1, h0, w3, 0);
+        schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 0);
+
+        // Rounds 20..40
+        schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 1);
+        schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 1);
+        schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 1);
+        schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 1);
+        schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 1);
+
+        // Rounds 40..60
+        schedule_rounds4!(h0, h1, w1, w2, w3, w4, w0, 2);
+        schedule_rounds4!(h1, h0, w2, w3, w4, w0, w1, 2);
+        schedule_rounds4!(h0, h1, w3, w4, w0, w1, w2, 2);
+        schedule_rounds4!(h1, h0, w4, w0, w1, w2, w3, 2);
+        schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 2);
+
+        // Rounds 60..80
+        schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 3);
+        schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 3);
+        schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 3);
+        schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 3);
+        schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 3);
+
+        state_abcd = _mm_add_epi32(state_abcd, h0);
+        state_e = _mm_sha1nexte_epu32(h1, state_e);
+    }
+
+    state[0] = _mm_extract_epi32(state_abcd, 3) as u32;
+    state[1] = _mm_extract_epi32(state_abcd, 2) as u32;
+    state[2] = _mm_extract_epi32(state_abcd, 1) as u32;
+    state[3] = _mm_extract_epi32(state_abcd, 0) as u32;
+    state[4] = _mm_extract_epi32(state_e, 3) as u32;
+}
+
+cpufeatures::new!(shani_cpuid, "sha", "sse2", "ssse3", "sse4.1");
+
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+    // after stabilization
+    if shani_cpuid::get() {
+        unsafe {
+            digest_blocks(state, blocks);
+        }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
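The `cpufeatures`-based dispatch above also works in no_std contexts. Where std is available, the same runtime check could instead be written with the standard detection macro; this is an illustrative alternative under that assumption (the function name is hypothetical), not what the crate ships:

    pub fn compress_with_std_detection(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
        if std::arch::is_x86_feature_detected!("sha")
            && std::arch::is_x86_feature_detected!("sse2")
            && std::arch::is_x86_feature_detected!("ssse3")
            && std::arch::is_x86_feature_detected!("sse4.1")
        {
            // SAFETY: the target features required by `digest_blocks` were
            // just verified at runtime.
            unsafe { digest_blocks(state, blocks) }
        } else {
            super::soft::compress(state, blocks);
        }
    }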