From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 02:47:55 +0200 Subject: Adding upstream version 124.0.1. Signed-off-by: Daniel Baumann --- third_party/rust/sha2/src/sha512/soft.rs | 215 +++++++++++++++++++ third_party/rust/sha2/src/sha512/x86.rs | 357 +++++++++++++++++++++++++++++++ 2 files changed, 572 insertions(+) create mode 100644 third_party/rust/sha2/src/sha512/soft.rs create mode 100644 third_party/rust/sha2/src/sha512/x86.rs (limited to 'third_party/rust/sha2/src/sha512') diff --git a/third_party/rust/sha2/src/sha512/soft.rs b/third_party/rust/sha2/src/sha512/soft.rs new file mode 100644 index 0000000000..ab6d568313 --- /dev/null +++ b/third_party/rust/sha2/src/sha512/soft.rs @@ -0,0 +1,215 @@ +#![allow(clippy::many_single_char_names)] +use crate::consts::{BLOCK_LEN, K64X2}; +use core::convert::TryInto; + +fn add(a: [u64; 2], b: [u64; 2]) -> [u64; 2] { + [a[0].wrapping_add(b[0]), a[1].wrapping_add(b[1])] +} + +/// Not an intrinsic, but works like an unaligned load. +fn sha512load(v0: [u64; 2], v1: [u64; 2]) -> [u64; 2] { + [v1[1], v0[0]] +} + +/// Performs 2 rounds of the SHA-512 message schedule update. +pub fn sha512_schedule_x2(v0: [u64; 2], v1: [u64; 2], v4to5: [u64; 2], v7: [u64; 2]) -> [u64; 2] { + // sigma 0 + fn sigma0(x: u64) -> u64 { + ((x << 63) | (x >> 1)) ^ ((x << 56) | (x >> 8)) ^ (x >> 7) + } + + // sigma 1 + fn sigma1(x: u64) -> u64 { + ((x << 45) | (x >> 19)) ^ ((x << 3) | (x >> 61)) ^ (x >> 6) + } + + let [w1, w0] = v0; + let [_, w2] = v1; + let [w10, w9] = v4to5; + let [w15, w14] = v7; + + let w16 = sigma1(w14) + .wrapping_add(w9) + .wrapping_add(sigma0(w1)) + .wrapping_add(w0); + let w17 = sigma1(w15) + .wrapping_add(w10) + .wrapping_add(sigma0(w2)) + .wrapping_add(w1); + + [w17, w16] +} + +/// Performs one round of the SHA-512 message block digest. 
+pub fn sha512_digest_round( + ae: [u64; 2], + bf: [u64; 2], + cg: [u64; 2], + dh: [u64; 2], + wk0: u64, +) -> [u64; 2] { + macro_rules! big_sigma0 { + ($a:expr) => { + ($a.rotate_right(28) ^ $a.rotate_right(34) ^ $a.rotate_right(39)) + }; + } + macro_rules! big_sigma1 { + ($a:expr) => { + ($a.rotate_right(14) ^ $a.rotate_right(18) ^ $a.rotate_right(41)) + }; + } + macro_rules! bool3ary_202 { + ($a:expr, $b:expr, $c:expr) => { + $c ^ ($a & ($b ^ $c)) + }; + } // Choose, MD5F, SHA1C + macro_rules! bool3ary_232 { + ($a:expr, $b:expr, $c:expr) => { + ($a & $b) ^ ($a & $c) ^ ($b & $c) + }; + } // Majority, SHA1M + + let [a0, e0] = ae; + let [b0, f0] = bf; + let [c0, g0] = cg; + let [d0, h0] = dh; + + // a round + let x0 = big_sigma1!(e0) + .wrapping_add(bool3ary_202!(e0, f0, g0)) + .wrapping_add(wk0) + .wrapping_add(h0); + let y0 = big_sigma0!(a0).wrapping_add(bool3ary_232!(a0, b0, c0)); + let (a1, _, _, _, e1, _, _, _) = ( + x0.wrapping_add(y0), + a0, + b0, + c0, + x0.wrapping_add(d0), + e0, + f0, + g0, + ); + + [a1, e1] +} + +/// Process a block with the SHA-512 algorithm. +pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) { + let k = &K64X2; + + macro_rules! schedule { + ($v0:expr, $v1:expr, $v4:expr, $v5:expr, $v7:expr) => { + sha512_schedule_x2($v0, $v1, sha512load($v4, $v5), $v7) + }; + } + + macro_rules! 
rounds4 { + ($ae:ident, $bf:ident, $cg:ident, $dh:ident, $wk0:expr, $wk1:expr) => {{ + let [u, t] = $wk0; + let [w, v] = $wk1; + + $dh = sha512_digest_round($ae, $bf, $cg, $dh, t); + $cg = sha512_digest_round($dh, $ae, $bf, $cg, u); + $bf = sha512_digest_round($cg, $dh, $ae, $bf, v); + $ae = sha512_digest_round($bf, $cg, $dh, $ae, w); + }}; + } + + let mut ae = [state[0], state[4]]; + let mut bf = [state[1], state[5]]; + let mut cg = [state[2], state[6]]; + let mut dh = [state[3], state[7]]; + + // Rounds 0..20 + let (mut w1, mut w0) = ([block[3], block[2]], [block[1], block[0]]); + rounds4!(ae, bf, cg, dh, add(k[0], w0), add(k[1], w1)); + let (mut w3, mut w2) = ([block[7], block[6]], [block[5], block[4]]); + rounds4!(ae, bf, cg, dh, add(k[2], w2), add(k[3], w3)); + let (mut w5, mut w4) = ([block[11], block[10]], [block[9], block[8]]); + rounds4!(ae, bf, cg, dh, add(k[4], w4), add(k[5], w5)); + let (mut w7, mut w6) = ([block[15], block[14]], [block[13], block[12]]); + rounds4!(ae, bf, cg, dh, add(k[6], w6), add(k[7], w7)); + let mut w8 = schedule!(w0, w1, w4, w5, w7); + let mut w9 = schedule!(w1, w2, w5, w6, w8); + rounds4!(ae, bf, cg, dh, add(k[8], w8), add(k[9], w9)); + + // Rounds 20..40 + w0 = schedule!(w2, w3, w6, w7, w9); + w1 = schedule!(w3, w4, w7, w8, w0); + rounds4!(ae, bf, cg, dh, add(k[10], w0), add(k[11], w1)); + w2 = schedule!(w4, w5, w8, w9, w1); + w3 = schedule!(w5, w6, w9, w0, w2); + rounds4!(ae, bf, cg, dh, add(k[12], w2), add(k[13], w3)); + w4 = schedule!(w6, w7, w0, w1, w3); + w5 = schedule!(w7, w8, w1, w2, w4); + rounds4!(ae, bf, cg, dh, add(k[14], w4), add(k[15], w5)); + w6 = schedule!(w8, w9, w2, w3, w5); + w7 = schedule!(w9, w0, w3, w4, w6); + rounds4!(ae, bf, cg, dh, add(k[16], w6), add(k[17], w7)); + w8 = schedule!(w0, w1, w4, w5, w7); + w9 = schedule!(w1, w2, w5, w6, w8); + rounds4!(ae, bf, cg, dh, add(k[18], w8), add(k[19], w9)); + + // Rounds 40..60 + w0 = schedule!(w2, w3, w6, w7, w9); + w1 = schedule!(w3, w4, w7, w8, w0); + 
rounds4!(ae, bf, cg, dh, add(k[20], w0), add(k[21], w1)); + w2 = schedule!(w4, w5, w8, w9, w1); + w3 = schedule!(w5, w6, w9, w0, w2); + rounds4!(ae, bf, cg, dh, add(k[22], w2), add(k[23], w3)); + w4 = schedule!(w6, w7, w0, w1, w3); + w5 = schedule!(w7, w8, w1, w2, w4); + rounds4!(ae, bf, cg, dh, add(k[24], w4), add(k[25], w5)); + w6 = schedule!(w8, w9, w2, w3, w5); + w7 = schedule!(w9, w0, w3, w4, w6); + rounds4!(ae, bf, cg, dh, add(k[26], w6), add(k[27], w7)); + w8 = schedule!(w0, w1, w4, w5, w7); + w9 = schedule!(w1, w2, w5, w6, w8); + rounds4!(ae, bf, cg, dh, add(k[28], w8), add(k[29], w9)); + + // Rounds 60..80 + w0 = schedule!(w2, w3, w6, w7, w9); + w1 = schedule!(w3, w4, w7, w8, w0); + rounds4!(ae, bf, cg, dh, add(k[30], w0), add(k[31], w1)); + w2 = schedule!(w4, w5, w8, w9, w1); + w3 = schedule!(w5, w6, w9, w0, w2); + rounds4!(ae, bf, cg, dh, add(k[32], w2), add(k[33], w3)); + w4 = schedule!(w6, w7, w0, w1, w3); + w5 = schedule!(w7, w8, w1, w2, w4); + rounds4!(ae, bf, cg, dh, add(k[34], w4), add(k[35], w5)); + w6 = schedule!(w8, w9, w2, w3, w5); + w7 = schedule!(w9, w0, w3, w4, w6); + rounds4!(ae, bf, cg, dh, add(k[36], w6), add(k[37], w7)); + w8 = schedule!(w0, w1, w4, w5, w7); + w9 = schedule!(w1, w2, w5, w6, w8); + rounds4!(ae, bf, cg, dh, add(k[38], w8), add(k[39], w9)); + + let [a, e] = ae; + let [b, f] = bf; + let [c, g] = cg; + let [d, h] = dh; + + state[0] = state[0].wrapping_add(a); + state[1] = state[1].wrapping_add(b); + state[2] = state[2].wrapping_add(c); + state[3] = state[3].wrapping_add(d); + state[4] = state[4].wrapping_add(e); + state[5] = state[5].wrapping_add(f); + state[6] = state[6].wrapping_add(g); + state[7] = state[7].wrapping_add(h); +} + +pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) { + let mut block_u32 = [0u64; BLOCK_LEN]; + // since LLVM can't properly use aliasing yet it will make + // unnecessary state stores without this copy + let mut state_cpy = *state; + for block in blocks { + for (o, chunk) in 
block_u32.iter_mut().zip(block.chunks_exact(8)) { + *o = u64::from_be_bytes(chunk.try_into().unwrap()); + } + sha512_digest_block_u64(&mut state_cpy, &block_u32); + } + *state = state_cpy; +} diff --git a/third_party/rust/sha2/src/sha512/x86.rs b/third_party/rust/sha2/src/sha512/x86.rs new file mode 100644 index 0000000000..bb79040889 --- /dev/null +++ b/third_party/rust/sha2/src/sha512/x86.rs @@ -0,0 +1,357 @@ +//! SHA-512 `x86`/`x86_64` backend + +#![allow(clippy::many_single_char_names)] + +use core::mem::size_of; + +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::consts::K64; + +cpufeatures::new!(avx2_cpuid, "avx2"); + +pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) { + // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 + // after stabilization + if avx2_cpuid::get() { + unsafe { + sha512_compress_x86_64_avx2(state, blocks); + } + } else { + super::soft::compress(state, blocks); + } +} + +#[target_feature(enable = "avx2")] +unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) { + let mut start_block = 0; + + if blocks.len() & 0b1 != 0 { + sha512_compress_x86_64_avx(state, &blocks[0]); + start_block += 1; + } + + let mut ms: MsgSchedule = [_mm_setzero_si128(); 8]; + let mut t2: RoundStates = [_mm_setzero_si128(); 40]; + let mut x = [_mm256_setzero_si256(); 8]; + + for i in (start_block..blocks.len()).step_by(2) { + load_data_avx2(&mut x, &mut ms, &mut t2, blocks.as_ptr().add(i) as *const _); + + // First block + let mut current_state = *state; + rounds_0_63_avx2(&mut current_state, &mut x, &mut ms, &mut t2); + rounds_64_79(&mut current_state, &ms); + accumulate_state(state, &current_state); + + // Second block + current_state = *state; + process_second_block(&mut current_state, &t2); + accumulate_state(state, &current_state); + } +} + +#[inline(always)] +unsafe fn sha512_compress_x86_64_avx(state: &mut [u64; 8], block: &[u8; 128]) { + let 
mut ms = [_mm_setzero_si128(); 8]; + let mut x = [_mm_setzero_si128(); 8]; + + // Reduced to single iteration + let mut current_state = *state; + load_data_avx(&mut x, &mut ms, block.as_ptr() as *const _); + rounds_0_63_avx(&mut current_state, &mut x, &mut ms); + rounds_64_79(&mut current_state, &ms); + accumulate_state(state, &current_state); +} + +#[inline(always)] +unsafe fn load_data_avx(x: &mut [__m128i; 8], ms: &mut MsgSchedule, data: *const __m128i) { + #[allow(non_snake_case)] + let MASK = _mm_setr_epi32(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b); + + macro_rules! unrolled_iterations { + ($($i:literal),*) => {$( + x[$i] = _mm_loadu_si128(data.add($i) as *const _); + x[$i] = _mm_shuffle_epi8(x[$i], MASK); + + let y = _mm_add_epi64( + x[$i], + _mm_loadu_si128(&K64[2 * $i] as *const u64 as *const _), + ); + + ms[$i] = y; + )*}; + } + + unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7); +} + +#[inline(always)] +unsafe fn load_data_avx2( + x: &mut [__m256i; 8], + ms: &mut MsgSchedule, + t2: &mut RoundStates, + data: *const __m128i, +) { + #[allow(non_snake_case)] + let MASK = _mm256_set_epi64x( + 0x0809_0A0B_0C0D_0E0F_i64, + 0x0001_0203_0405_0607_i64, + 0x0809_0A0B_0C0D_0E0F_i64, + 0x0001_0203_0405_0607_i64, + ); + + macro_rules! 
unrolled_iterations { + ($($i:literal),*) => {$( + x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add(8 + $i) as *const _), 1); + x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i) as *const _), 0); + + x[$i] = _mm256_shuffle_epi8(x[$i], MASK); + + let t = _mm_loadu_si128(K64.as_ptr().add($i * 2) as *const u64 as *const _); + let y = _mm256_add_epi64(x[$i], _mm256_set_m128i(t, t)); + + ms[$i] = _mm256_extracti128_si256(y, 0); + t2[$i] = _mm256_extracti128_si256(y, 1); + )*}; + } + + unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7); +} + +#[inline(always)] +unsafe fn rounds_0_63_avx(current_state: &mut State, x: &mut [__m128i; 8], ms: &mut MsgSchedule) { + let mut k64_idx: usize = SHA512_BLOCK_WORDS_NUM; + + for _ in 0..4 { + for j in 0..8 { + let k64 = _mm_loadu_si128(&K64[k64_idx] as *const u64 as *const _); + let y = sha512_update_x_avx(x, k64); + + { + let ms = cast_ms(ms); + sha_round(current_state, ms[2 * j]); + sha_round(current_state, ms[2 * j + 1]); + } + + ms[j] = y; + k64_idx += 2; + } + } +} + +#[inline(always)] +unsafe fn rounds_0_63_avx2( + current_state: &mut State, + x: &mut [__m256i; 8], + ms: &mut MsgSchedule, + t2: &mut RoundStates, +) { + let mut k64x4_idx: usize = SHA512_BLOCK_WORDS_NUM; + + for i in 1..5 { + for j in 0..8 { + let t = _mm_loadu_si128(K64.as_ptr().add(k64x4_idx) as *const u64 as *const _); + let y = sha512_update_x_avx2(x, _mm256_set_m128i(t, t)); + + { + let ms = cast_ms(ms); + sha_round(current_state, ms[2 * j]); + sha_round(current_state, ms[2 * j + 1]); + } + + ms[j] = _mm256_extracti128_si256(y, 0); + t2[8 * i + j] = _mm256_extracti128_si256(y, 1); + + k64x4_idx += 2; + } + } +} + +#[inline(always)] +fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) { + let ms = cast_ms(ms); + for i in 64..80 { + sha_round(current_state, ms[i & 0xf]); + } +} + +#[inline(always)] +fn process_second_block(current_state: &mut State, t2: &RoundStates) { + for t2 in cast_rs(t2).iter() { + 
sha_round(current_state, *t2); + } +} + +#[inline(always)] +fn sha_round(s: &mut State, x: u64) { + macro_rules! big_sigma0 { + ($a:expr) => { + $a.rotate_right(28) ^ $a.rotate_right(34) ^ $a.rotate_right(39) + }; + } + macro_rules! big_sigma1 { + ($a:expr) => { + $a.rotate_right(14) ^ $a.rotate_right(18) ^ $a.rotate_right(41) + }; + } + macro_rules! bool3ary_202 { + ($a:expr, $b:expr, $c:expr) => { + $c ^ ($a & ($b ^ $c)) + }; + } // Choose, MD5F, SHA1C + macro_rules! bool3ary_232 { + ($a:expr, $b:expr, $c:expr) => { + ($a & $b) ^ ($a & $c) ^ ($b & $c) + }; + } // Majority, SHA1M + + macro_rules! rotate_state { + ($s:ident) => {{ + let tmp = $s[7]; + $s[7] = $s[6]; + $s[6] = $s[5]; + $s[5] = $s[4]; + $s[4] = $s[3]; + $s[3] = $s[2]; + $s[2] = $s[1]; + $s[1] = $s[0]; + $s[0] = tmp; + }}; + } + + let t = x + .wrapping_add(s[7]) + .wrapping_add(big_sigma1!(s[4])) + .wrapping_add(bool3ary_202!(s[4], s[5], s[6])); + + s[7] = t + .wrapping_add(big_sigma0!(s[0])) + .wrapping_add(bool3ary_232!(s[0], s[1], s[2])); + s[3] = s[3].wrapping_add(t); + + rotate_state!(s); +} + +#[inline(always)] +fn accumulate_state(dst: &mut State, src: &State) { + for i in 0..SHA512_HASH_WORDS_NUM { + dst[i] = dst[i].wrapping_add(src[i]); + } +} + +macro_rules! 
fn_sha512_update_x { + ($name:ident, $ty:ident, { + ADD64 = $ADD64:ident, + ALIGNR8 = $ALIGNR8:ident, + SRL64 = $SRL64:ident, + SLL64 = $SLL64:ident, + XOR = $XOR:ident, + }) => { + unsafe fn $name(x: &mut [$ty; 8], k64: $ty) -> $ty { + // q[2:1] + let mut t0 = $ALIGNR8(x[1], x[0], 8); + // q[10:9] + let mut t3 = $ALIGNR8(x[5], x[4], 8); + // q[2:1] >> s0[0] + let mut t2 = $SRL64(t0, 1); + // q[1:0] + q[10:9] + x[0] = $ADD64(x[0], t3); + // q[2:1] >> s0[2] + t3 = $SRL64(t0, 7); + // q[2:1] << (64 - s0[1]) + let mut t1 = $SLL64(t0, 64 - 8); + // (q[2:1] >> s0[2]) ^ + // (q[2:1] >> s0[0]) + t0 = $XOR(t3, t2); + // q[2:1] >> s0[1] + t2 = $SRL64(t2, 8 - 1); + // (q[2:1] >> s0[2]) ^ + // (q[2:1] >> s0[0]) ^ + // q[2:1] << (64 - s0[1]) + t0 = $XOR(t0, t1); + // q[2:1] << (64 - s0[0]) + t1 = $SLL64(t1, 8 - 1); + // sigma1(q[2:1]) + t0 = $XOR(t0, t2); + t0 = $XOR(t0, t1); + // q[15:14] >> s1[2] + t3 = $SRL64(x[7], 6); + // q[15:14] >> (64 - s1[1]) + t2 = $SLL64(x[7], 64 - 61); + // q[1:0] + sigma0(q[2:1]) + x[0] = $ADD64(x[0], t0); + // q[15:14] >> s1[0] + t1 = $SRL64(x[7], 19); + // q[15:14] >> s1[2] ^ + // q[15:14] >> (64 - s1[1]) + t3 = $XOR(t3, t2); + // q[15:14] >> (64 - s1[0]) + t2 = $SLL64(t2, 61 - 19); + // q[15:14] >> s1[2] ^ + // q[15:14] >> (64 - s1[1] ^ + // q[15:14] >> s1[0] + t3 = $XOR(t3, t1); + // q[15:14] >> s1[1] + t1 = $SRL64(t1, 61 - 19); + // sigma1(q[15:14]) + t3 = $XOR(t3, t2); + t3 = $XOR(t3, t1); + + // q[1:0] + q[10:9] + sigma1(q[15:14]) + sigma0(q[2:1]) + x[0] = $ADD64(x[0], t3); + + // rotate + let temp = x[0]; + x[0] = x[1]; + x[1] = x[2]; + x[2] = x[3]; + x[3] = x[4]; + x[4] = x[5]; + x[5] = x[6]; + x[6] = x[7]; + x[7] = temp; + + $ADD64(x[7], k64) + } + }; +} + +fn_sha512_update_x!(sha512_update_x_avx, __m128i, { + ADD64 = _mm_add_epi64, + ALIGNR8 = _mm_alignr_epi8, + SRL64 = _mm_srli_epi64, + SLL64 = _mm_slli_epi64, + XOR = _mm_xor_si128, +}); + +fn_sha512_update_x!(sha512_update_x_avx2, __m256i, { + ADD64 = _mm256_add_epi64, + ALIGNR8 = 
_mm256_alignr_epi8, + SRL64 = _mm256_srli_epi64, + SLL64 = _mm256_slli_epi64, + XOR = _mm256_xor_si256, +}); + +#[inline(always)] +fn cast_ms(ms: &MsgSchedule) -> &[u64; SHA512_BLOCK_WORDS_NUM] { + unsafe { &*(ms as *const MsgSchedule as *const _) } +} + +#[inline(always)] +fn cast_rs(rs: &RoundStates) -> &[u64; SHA512_ROUNDS_NUM] { + unsafe { &*(rs as *const RoundStates as *const _) } +} + +type State = [u64; SHA512_HASH_WORDS_NUM]; +type MsgSchedule = [__m128i; SHA512_BLOCK_WORDS_NUM / 2]; +type RoundStates = [__m128i; SHA512_ROUNDS_NUM / 2]; + +const SHA512_BLOCK_BYTE_LEN: usize = 128; +const SHA512_ROUNDS_NUM: usize = 80; +const SHA512_HASH_BYTE_LEN: usize = 64; +const SHA512_HASH_WORDS_NUM: usize = SHA512_HASH_BYTE_LEN / size_of::<u64>(); +const SHA512_BLOCK_WORDS_NUM: usize = SHA512_BLOCK_BYTE_LEN / size_of::<u64>(); -- cgit v1.2.3