author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 17:39:49 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 17:39:49 +0000
commit | a0aa2307322cd47bbf416810ac0292925e03be87 (patch)
tree | 37076262a026c4b48c8a0e84f44ff9187556ca35 /rust/vendor/aes/src
parent | Initial commit. (diff)
download | suricata-a0aa2307322cd47bbf416810ac0292925e03be87.tar.xz, suricata-a0aa2307322cd47bbf416810ac0292925e03be87.zip
Adding upstream version 1:7.0.3. (upstream/1%7.0.3)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'rust/vendor/aes/src')
25 files changed, 5945 insertions, 0 deletions
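The diff below vendors the RustCrypto `aes` crate into `rust/vendor/aes/src`. For orientation before the per-file contents, here is a minimal sketch of how a consumer uses the crate's block-cipher API; it mirrors the usage example in the vendored `lib.rs` further down. The `main` wrapper and the all-zero key are purely illustrative.

```rust
use aes::cipher::{generic_array::GenericArray, BlockDecrypt, BlockEncrypt, NewBlockCipher};
use aes::{Aes128, Block};

fn main() {
    // All-zero key, for illustration only.
    let key = GenericArray::from_slice(&[0u8; 16]);
    let cipher = Aes128::new(key);

    let mut block = Block::default();
    let original = block.clone();

    cipher.encrypt_block(&mut block); // encrypt the 16-byte block in place
    cipher.decrypt_block(&mut block); // decrypt it back
    assert_eq!(block, original);
}
```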
diff --git a/rust/vendor/aes/src/armv8.rs b/rust/vendor/aes/src/armv8.rs new file mode 100644 index 0000000..187ac1b --- /dev/null +++ b/rust/vendor/aes/src/armv8.rs @@ -0,0 +1,376 @@ +//! AES block cipher implementation using the ARMv8 Cryptography Extensions. +//! +//! Based on this C intrinsics implementation: +//! <https://github.com/noloader/AES-Intrinsics/blob/master/aes-arm.c> +//! +//! Original C written and placed in public domain by Jeffrey Walton. +//! Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and +//! Barry O'Rourke for the mbedTLS project. + +#![allow(clippy::needless_range_loop)] + +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat; + +mod decrypt; +mod encrypt; +mod expand; + +use self::{ + decrypt::{decrypt, decrypt8}, + encrypt::{encrypt, encrypt8}, + expand::{expand_key, inv_expanded_keys}, +}; +use crate::{Block, ParBlocks}; +use cipher::{ + consts::{U16, U24, U32, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; +use core::arch::aarch64::*; + +macro_rules! define_aes_impl { + ( + $name:ident, + $name_enc:ident, + $name_dec:ident, + $key_size:ty, + $rounds:tt, + $doc:expr + ) => { + #[doc=$doc] + #[doc = "block cipher"] + #[derive(Clone)] + pub struct $name { + encrypt: $name_enc, + decrypt: $name_dec, + } + + impl NewBlockCipher for $name { + type KeySize = $key_size; + + #[inline] + fn new(key: &GenericArray<u8, $key_size>) -> Self { + let encrypt = $name_enc::new(key); + let decrypt = $name_dec::from(&encrypt); + Self { encrypt, decrypt } + } + } + + impl BlockCipher for $name { + type BlockSize = U16; + type ParBlocks = U8; + } + + impl BlockEncrypt for $name { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + self.encrypt.encrypt_block(block) + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + self.encrypt.encrypt_par_blocks(blocks) + } + } + + impl BlockDecrypt for $name { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + self.decrypt.decrypt_block(block) + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + self.decrypt.decrypt_par_blocks(blocks) + } + } + + #[doc=$doc] + #[doc = "block cipher (encrypt-only)"] + #[derive(Clone)] + pub struct $name_enc { + round_keys: [uint8x16_t; $rounds], + } + + impl NewBlockCipher for $name_enc { + type KeySize = $key_size; + + fn new(key: &GenericArray<u8, $key_size>) -> Self { + Self { + round_keys: expand_key(key.as_ref()), + } + } + } + + impl BlockCipher for $name_enc { + type BlockSize = U16; + type ParBlocks = U8; + } + + impl BlockEncrypt for $name_enc { + fn encrypt_block(&self, block: &mut Block) { + unsafe { encrypt(&self.round_keys, block) } + } + + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + unsafe { encrypt8(&self.round_keys, blocks) } + } + } + + #[doc=$doc] + #[doc = "block cipher (decrypt-only)"] + #[derive(Clone)] + pub struct $name_dec { + round_keys: [uint8x16_t; $rounds], + } + + impl NewBlockCipher for $name_dec { + type KeySize = $key_size; + + fn new(key: &GenericArray<u8, $key_size>) -> Self { + $name_enc::new(key).into() + } + } + + impl From<$name_enc> for $name_dec { + fn from(enc: $name_enc) -> $name_dec { + Self::from(&enc) + } + } + + impl From<&$name_enc> for $name_dec { + fn from(enc: &$name_enc) -> $name_dec { + let mut round_keys = enc.round_keys; + inv_expanded_keys(&mut round_keys); + Self { round_keys } + } + } + + impl BlockCipher for $name_dec { + type BlockSize = U16; + type ParBlocks = U8; + } + + impl BlockDecrypt for $name_dec { + 
fn decrypt_block(&self, block: &mut Block) { + unsafe { decrypt(&self.round_keys, block) } + } + + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + unsafe { decrypt8(&self.round_keys, blocks) } + } + } + + opaque_debug::implement!($name); + opaque_debug::implement!($name_enc); + opaque_debug::implement!($name_dec); + }; +} + +define_aes_impl!(Aes128, Aes128Enc, Aes128Dec, U16, 11, "AES-128"); +define_aes_impl!(Aes192, Aes192Enc, Aes192Dec, U24, 13, "AES-192"); +define_aes_impl!(Aes256, Aes256Enc, Aes256Dec, U32, 15, "AES-256"); + +#[cfg(test)] +mod tests { + use super::{decrypt, decrypt8, encrypt, encrypt8, expand_key, inv_expanded_keys, ParBlocks}; + use core::{arch::aarch64::*, convert::TryInto}; + use hex_literal::hex; + + /// FIPS 197, Appendix A.1: AES-128 Cipher Key + /// user input, unaligned buffer + const AES128_KEY: [u8; 16] = hex!("2b7e151628aed2a6abf7158809cf4f3c"); + + /// FIPS 197 Appendix A.1: Expansion of a 128-bit Cipher Key + /// library controlled, aligned buffer + const AES128_EXP_KEYS: [[u8; 16]; 11] = [ + AES128_KEY, + hex!("a0fafe1788542cb123a339392a6c7605"), + hex!("f2c295f27a96b9435935807a7359f67f"), + hex!("3d80477d4716fe3e1e237e446d7a883b"), + hex!("ef44a541a8525b7fb671253bdb0bad00"), + hex!("d4d1c6f87c839d87caf2b8bc11f915bc"), + hex!("6d88a37a110b3efddbf98641ca0093fd"), + hex!("4e54f70e5f5fc9f384a64fb24ea6dc4f"), + hex!("ead27321b58dbad2312bf5607f8d292f"), + hex!("ac7766f319fadc2128d12941575c006e"), + hex!("d014f9a8c9ee2589e13f0cc8b6630ca6"), + ]; + + /// Inverse expanded keys for [`AES128_EXPANDED_KEYS`] + const AES128_EXP_INVKEYS: [[u8; 16]; 11] = [ + hex!("d014f9a8c9ee2589e13f0cc8b6630ca6"), + hex!("0c7b5a631319eafeb0398890664cfbb4"), + hex!("df7d925a1f62b09da320626ed6757324"), + hex!("12c07647c01f22c7bc42d2f37555114a"), + hex!("6efcd876d2df54807c5df034c917c3b9"), + hex!("6ea30afcbc238cf6ae82a4b4b54a338d"), + hex!("90884413d280860a12a128421bc89739"), + hex!("7c1f13f74208c219c021ae480969bf7b"), + hex!("cc7505eb3e17d1ee82296c51c9481133"), + hex!("2b3708a7f262d405bc3ebdbf4b617d62"), + AES128_KEY, + ]; + + /// FIPS 197, Appendix A.2: AES-192 Cipher Key + /// user input, unaligned buffer + const AES192_KEY: [u8; 24] = hex!("8e73b0f7da0e6452c810f32b809079e562f8ead2522c6b7b"); + + /// FIPS 197 Appendix A.2: Expansion of a 192-bit Cipher Key + /// library controlled, aligned buffer + const AES192_EXP_KEYS: [[u8; 16]; 13] = [ + hex!("8e73b0f7da0e6452c810f32b809079e5"), + hex!("62f8ead2522c6b7bfe0c91f72402f5a5"), + hex!("ec12068e6c827f6b0e7a95b95c56fec2"), + hex!("4db7b4bd69b5411885a74796e92538fd"), + hex!("e75fad44bb095386485af05721efb14f"), + hex!("a448f6d94d6dce24aa326360113b30e6"), + hex!("a25e7ed583b1cf9a27f939436a94f767"), + hex!("c0a69407d19da4e1ec1786eb6fa64971"), + hex!("485f703222cb8755e26d135233f0b7b3"), + hex!("40beeb282f18a2596747d26b458c553e"), + hex!("a7e1466c9411f1df821f750aad07d753"), + hex!("ca4005388fcc5006282d166abc3ce7b5"), + hex!("e98ba06f448c773c8ecc720401002202"), + ]; + + /// FIPS 197, Appendix A.3: AES-256 Cipher Key + /// user input, unaligned buffer + const AES256_KEY: [u8; 32] = + hex!("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"); + + /// FIPS 197 Appendix A.3: Expansion of a 256-bit Cipher Key + /// library controlled, aligned buffer + const AES256_EXP_KEYS: [[u8; 16]; 15] = [ + hex!("603deb1015ca71be2b73aef0857d7781"), + hex!("1f352c073b6108d72d9810a30914dff4"), + hex!("9ba354118e6925afa51a8b5f2067fcde"), + hex!("a8b09c1a93d194cdbe49846eb75d5b9a"), + hex!("d59aecb85bf3c917fee94248de8ebe96"), + 
hex!("b5a9328a2678a647983122292f6c79b3"), + hex!("812c81addadf48ba24360af2fab8b464"), + hex!("98c5bfc9bebd198e268c3ba709e04214"), + hex!("68007bacb2df331696e939e46c518d80"), + hex!("c814e20476a9fb8a5025c02d59c58239"), + hex!("de1369676ccc5a71fa2563959674ee15"), + hex!("5886ca5d2e2f31d77e0af1fa27cf73c3"), + hex!("749c47ab18501ddae2757e4f7401905a"), + hex!("cafaaae3e4d59b349adf6acebd10190d"), + hex!("fe4890d1e6188d0b046df344706c631e"), + ]; + + /// FIPS 197, Appendix B input + /// user input, unaligned buffer + const INPUT: [u8; 16] = hex!("3243f6a8885a308d313198a2e0370734"); + + /// FIPS 197, Appendix B output + const EXPECTED: [u8; 16] = hex!("3925841d02dc09fbdc118597196a0b32"); + + fn load_expanded_keys<const N: usize>(input: [[u8; 16]; N]) -> [uint8x16_t; N] { + let mut output = [unsafe { vdupq_n_u8(0) }; N]; + + for (src, dst) in input.iter().zip(output.iter_mut()) { + *dst = unsafe { vld1q_u8(src.as_ptr()) } + } + + output + } + + fn store_expanded_keys<const N: usize>(input: [uint8x16_t; N]) -> [[u8; 16]; N] { + let mut output = [[0u8; 16]; N]; + + for (src, dst) in input.iter().zip(output.iter_mut()) { + unsafe { vst1q_u8(dst.as_mut_ptr(), *src) } + } + + output + } + + #[test] + fn aes128_key_expansion() { + let ek = expand_key(&AES128_KEY); + assert_eq!(store_expanded_keys(ek), AES128_EXP_KEYS); + } + + #[test] + fn aes128_key_expansion_inv() { + let mut ek = load_expanded_keys(AES128_EXP_KEYS); + inv_expanded_keys(&mut ek); + assert_eq!(store_expanded_keys(ek), AES128_EXP_INVKEYS); + } + + #[test] + fn aes192_key_expansion() { + let ek = expand_key(&AES192_KEY); + assert_eq!(store_expanded_keys(ek), AES192_EXP_KEYS); + } + + #[test] + fn aes256_key_expansion() { + let ek = expand_key(&AES256_KEY); + assert_eq!(store_expanded_keys(ek), AES256_EXP_KEYS); + } + + #[test] + fn aes128_encrypt() { + // Intentionally misaligned block + let mut block = [0u8; 19]; + block[3..].copy_from_slice(&INPUT); + + unsafe { + encrypt( + &load_expanded_keys(AES128_EXP_KEYS), + (&mut block[3..]).try_into().unwrap(), + ) + }; + + assert_eq!(&block[3..], &EXPECTED); + } + + #[test] + fn aes128_encrypt8() { + let mut blocks = ParBlocks::default(); + + for block in &mut blocks { + block.copy_from_slice(&INPUT); + } + + unsafe { encrypt8(&load_expanded_keys(AES128_EXP_KEYS), &mut blocks) }; + + for block in &blocks { + assert_eq!(block.as_slice(), &EXPECTED); + } + } + + #[test] + fn aes128_decrypt() { + // Intentionally misaligned block + let mut block = [0u8; 19]; + block[3..].copy_from_slice(&EXPECTED); + + unsafe { + decrypt( + &load_expanded_keys(AES128_EXP_INVKEYS), + (&mut block[3..]).try_into().unwrap(), + ) + }; + + assert_eq!(&block[3..], &INPUT); + } + + #[test] + fn aes128_decrypt8() { + let mut blocks = ParBlocks::default(); + + for block in &mut blocks { + block.copy_from_slice(&EXPECTED); + } + + unsafe { decrypt8(&load_expanded_keys(AES128_EXP_INVKEYS), &mut blocks) }; + + for block in &blocks { + assert_eq!(block.as_slice(), &INPUT); + } + } +} diff --git a/rust/vendor/aes/src/armv8/decrypt.rs b/rust/vendor/aes/src/armv8/decrypt.rs new file mode 100644 index 0000000..cb84193 --- /dev/null +++ b/rust/vendor/aes/src/armv8/decrypt.rs @@ -0,0 +1,72 @@ +//! AES decryption support. + +use crate::{Block, ParBlocks}; +use core::arch::aarch64::*; + +/// Perform AES decryption using the given expanded keys. 
+#[target_feature(enable = "aes")] +#[target_feature(enable = "neon")] +pub(super) unsafe fn decrypt<const N: usize>(expanded_keys: &[uint8x16_t; N], block: &mut Block) { + let rounds = N - 1; + assert!(rounds == 10 || rounds == 12 || rounds == 14); + + let mut state = vld1q_u8(block.as_ptr()); + + for k in expanded_keys.iter().take(rounds - 1) { + // AES single round decryption + state = vaesdq_u8(state, *k); + + // AES inverse mix columns + state = vaesimcq_u8(state); + } + + // AES single round decryption + state = vaesdq_u8(state, expanded_keys[rounds - 1]); + + // Final add (bitwise XOR) + state = veorq_u8(state, expanded_keys[rounds]); + + vst1q_u8(block.as_mut_ptr(), state); +} + +/// Perform parallel AES decryption 8-blocks-at-a-time using the given expanded keys. +#[target_feature(enable = "aes")] +#[target_feature(enable = "neon")] +pub(super) unsafe fn decrypt8<const N: usize>( + expanded_keys: &[uint8x16_t; N], + blocks: &mut ParBlocks, +) { + let rounds = N - 1; + assert!(rounds == 10 || rounds == 12 || rounds == 14); + + let mut state = [ + vld1q_u8(blocks[0].as_ptr()), + vld1q_u8(blocks[1].as_ptr()), + vld1q_u8(blocks[2].as_ptr()), + vld1q_u8(blocks[3].as_ptr()), + vld1q_u8(blocks[4].as_ptr()), + vld1q_u8(blocks[5].as_ptr()), + vld1q_u8(blocks[6].as_ptr()), + vld1q_u8(blocks[7].as_ptr()), + ]; + + for k in expanded_keys.iter().take(rounds - 1) { + for i in 0..8 { + // AES single round decryption + state[i] = vaesdq_u8(state[i], *k); + + // AES inverse mix columns + state[i] = vaesimcq_u8(state[i]); + } + } + + for i in 0..8 { + // AES single round decryption + state[i] = vaesdq_u8(state[i], expanded_keys[rounds - 1]); + + // Final add (bitwise XOR) + state[i] = veorq_u8(state[i], expanded_keys[rounds]); + + vst1q_u8(blocks[i].as_mut_ptr(), state[i]); + } +} diff --git a/rust/vendor/aes/src/armv8/encrypt.rs b/rust/vendor/aes/src/armv8/encrypt.rs new file mode 100644 index 0000000..8464173 --- /dev/null +++ b/rust/vendor/aes/src/armv8/encrypt.rs @@ -0,0 +1,72 @@ +//! AES encryption support + +use crate::{Block, ParBlocks}; +use core::arch::aarch64::*; + +/// Perform AES encryption using the given expanded keys. +#[target_feature(enable = "aes")] +#[target_feature(enable = "neon")] +pub(super) unsafe fn encrypt<const N: usize>(expanded_keys: &[uint8x16_t; N], block: &mut Block) { + let rounds = N - 1; + assert!(rounds == 10 || rounds == 12 || rounds == 14); + + let mut state = vld1q_u8(block.as_ptr()); + + for k in expanded_keys.iter().take(rounds - 1) { + // AES single round encryption + state = vaeseq_u8(state, *k); + + // AES mix columns + state = vaesmcq_u8(state); + } + + // AES single round encryption + state = vaeseq_u8(state, expanded_keys[rounds - 1]); + + // Final add (bitwise XOR) + state = veorq_u8(state, expanded_keys[rounds]); + + vst1q_u8(block.as_mut_ptr(), state); +} + +/// Perform parallel AES encryption 8-blocks-at-a-time using the given expanded keys. 
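// Why eight blocks at once: each block's AESE/AESMC chain is a serial
// dependency, but the eight chains are independent, so interleaving them lets
// the CPU pipeline the AES instructions (the instruction-level parallelism
// noted in the crate-level docs) instead of stalling on a single block.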
+#[target_feature(enable = "aes")] +#[target_feature(enable = "neon")] +pub(super) unsafe fn encrypt8<const N: usize>( + expanded_keys: &[uint8x16_t; N], + blocks: &mut ParBlocks, +) { + let rounds = N - 1; + assert!(rounds == 10 || rounds == 12 || rounds == 14); + + let mut state = [ + vld1q_u8(blocks[0].as_ptr()), + vld1q_u8(blocks[1].as_ptr()), + vld1q_u8(blocks[2].as_ptr()), + vld1q_u8(blocks[3].as_ptr()), + vld1q_u8(blocks[4].as_ptr()), + vld1q_u8(blocks[5].as_ptr()), + vld1q_u8(blocks[6].as_ptr()), + vld1q_u8(blocks[7].as_ptr()), + ]; + + for k in expanded_keys.iter().take(rounds - 1) { + for i in 0..8 { + // AES single round encryption + state[i] = vaeseq_u8(state[i], *k); + + // AES mix columns + state[i] = vaesmcq_u8(state[i]); + } + } + + for i in 0..8 { + // AES single round encryption + state[i] = vaeseq_u8(state[i], expanded_keys[rounds - 1]); + + // Final add (bitwise XOR) + state[i] = veorq_u8(state[i], expanded_keys[rounds]); + + vst1q_u8(blocks[i].as_mut_ptr(), state[i]); + } +} diff --git a/rust/vendor/aes/src/armv8/expand.rs b/rust/vendor/aes/src/armv8/expand.rs new file mode 100644 index 0000000..2e26e39 --- /dev/null +++ b/rust/vendor/aes/src/armv8/expand.rs @@ -0,0 +1,77 @@ +//! AES key expansion support. + +use core::{arch::aarch64::*, convert::TryInto, mem, slice}; + +/// There are 4 AES words in a block. +const BLOCK_WORDS: usize = 4; + +/// The AES (nee Rijndael) notion of a word is always 32-bits, or 4-bytes. +const WORD_SIZE: usize = 4; + +/// AES round constants. +const ROUND_CONSTS: [u32; 10] = [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36]; + +/// AES key expansion +// TODO(tarcieri): big endian support? +#[inline] +pub(super) fn expand_key<const L: usize, const N: usize>(key: &[u8; L]) -> [uint8x16_t; N] { + assert!((L == 16 && N == 11) || (L == 24 && N == 13) || (L == 32 && N == 15)); + + let mut expanded_keys: [uint8x16_t; N] = unsafe { mem::zeroed() }; + + // TODO(tarcieri): construct expanded keys using `vreinterpretq_u8_u32` + let ek_words = unsafe { + slice::from_raw_parts_mut(expanded_keys.as_mut_ptr() as *mut u32, N * BLOCK_WORDS) + }; + + for (i, chunk) in key.chunks_exact(WORD_SIZE).enumerate() { + ek_words[i] = u32::from_ne_bytes(chunk.try_into().unwrap()); + } + + // From "The Rijndael Block Cipher" Section 4.1: + // > The number of columns of the Cipher Key is denoted by `Nk` and is + // > equal to the key length divided by 32 [bits]. + let nk = L / WORD_SIZE; + + for i in nk..(N * BLOCK_WORDS) { + let mut word = ek_words[i - 1]; + + if i % nk == 0 { + word = sub_word(word).rotate_right(8) ^ ROUND_CONSTS[i / nk - 1]; + } else if nk > 6 && i % nk == 4 { + word = sub_word(word) + } + + ek_words[i] = ek_words[i - nk] ^ word; + } + + expanded_keys +} + +/// Compute inverse expanded keys (for decryption). +/// +/// This is the reverse of the encryption keys, with the Inverse Mix Columns +/// operation applied to all but the first and last expanded key. +#[inline] +pub(super) fn inv_expanded_keys<const N: usize>(expanded_keys: &mut [uint8x16_t; N]) { + assert!(N == 11 || N == 13 || N == 15); + + for ek in expanded_keys.iter_mut().take(N - 1).skip(1) { + unsafe { *ek = vaesimcq_u8(*ek) } + } + + expanded_keys.reverse(); +} + +/// Sub bytes for a single AES word: used for key expansion. 
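// How the zero-key trick below works: `vaeseq_u8` computes
// SubBytes(ShiftRows(state ^ round_key)). With an all-zero round key the XOR
// is a no-op, and because the input word is broadcast to all four columns,
// every row holds four identical bytes, so ShiftRows is also a no-op. Lane 0
// of the result is therefore exactly SubWord(input).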
+#[inline(always)] +fn sub_word(input: u32) -> u32 { + unsafe { + let input = vreinterpretq_u8_u32(vdupq_n_u32(input)); + + // AES single round encryption (with a "round" key of all zeros) + let sub_input = vaeseq_u8(input, vdupq_n_u8(0)); + + vgetq_lane_u32(vreinterpretq_u32_u8(sub_input), 0) + } +} diff --git a/rust/vendor/aes/src/armv8/hazmat.rs b/rust/vendor/aes/src/armv8/hazmat.rs new file mode 100644 index 0000000..022db3a --- /dev/null +++ b/rust/vendor/aes/src/armv8/hazmat.rs @@ -0,0 +1,104 @@ +//! Low-level "hazmat" AES functions: ARMv8 Cryptography Extensions support. +//! +//! Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256` +//! implementations in this crate, but instead provides raw AES-NI accelerated +//! access to the AES round function gated under the `hazmat` crate feature. + +use crate::{Block, ParBlocks}; +use core::arch::aarch64::*; + +/// AES cipher (encrypt) round function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn cipher_round(block: &mut Block, round_key: &Block) { + let b = vld1q_u8(block.as_ptr()); + let k = vld1q_u8(round_key.as_ptr()); + + // AES single round encryption (all-zero round key, deferred until the end) + let mut state = vaeseq_u8(b, vdupq_n_u8(0)); + + // AES mix columns (the `vaeseq_u8` instruction otherwise omits this step) + state = vaesmcq_u8(state); + + // AES add round key (bitwise XOR) + state = veorq_u8(state, k); + + vst1q_u8(block.as_mut_ptr(), state); +} + +/// AES cipher (encrypt) round function: parallel version. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for i in 0..8 { + let mut state = vld1q_u8(blocks[i].as_ptr()); + + // AES single round encryption + state = vaeseq_u8(state, vdupq_n_u8(0)); + + // AES mix columns + state = vaesmcq_u8(state); + + // AES add round key (bitwise XOR) + state = veorq_u8(state, vld1q_u8(round_keys[i].as_ptr())); + + vst1q_u8(blocks[i].as_mut_ptr(), state); + } +} + +/// AES equivalent inverse cipher (decrypt) round function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { + let b = vld1q_u8(block.as_ptr()); + let k = vld1q_u8(round_key.as_ptr()); + + // AES single round decryption (all-zero round key, deferred until the end) + let mut state = vaesdq_u8(b, vdupq_n_u8(0)); + + // AES inverse mix columns (the `vaesdq_u8` instruction otherwise omits this step) + state = vaesimcq_u8(state); + + // AES add round key (bitwise XOR) + state = veorq_u8(state, k); + + vst1q_u8(block.as_mut_ptr(), state); +} + +/// AES equivalent inverse cipher (decrypt) round function: parallel version. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for i in 0..8 { + let mut state = vld1q_u8(blocks[i].as_ptr()); + + // AES single round decryption (all-zero round key, deferred until the end) + state = vaesdq_u8(state, vdupq_n_u8(0)); + + // AES inverse mix columns (the `vaesdq_u8` instruction otherwise omits this step) + state = vaesimcq_u8(state); + + // AES add round key (bitwise XOR) + state = veorq_u8(state, vld1q_u8(round_keys[i].as_ptr())); + + vst1q_u8(blocks[i].as_mut_ptr(), state); + } +} + +/// AES mix columns function. 
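// MixColumns treats each state column as a vector over GF(2^8) and multiplies
// it by the fixed circulant matrix with rows (02 03 01 01); the inverse step
// uses (0e 0b 0d 09). `vaesmcq_u8` / `vaesimcq_u8` apply these transforms to
// all four columns of the 128-bit block at once.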
+#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn mix_columns(block: &mut Block) { + let b = vld1q_u8(block.as_ptr()); + let out = vaesmcq_u8(b); + vst1q_u8(block.as_mut_ptr(), out); +} + +/// AES inverse mix columns function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn inv_mix_columns(block: &mut Block) { + let b = vld1q_u8(block.as_ptr()); + let out = vaesimcq_u8(b); + vst1q_u8(block.as_mut_ptr(), out); +} diff --git a/rust/vendor/aes/src/autodetect.rs b/rust/vendor/aes/src/autodetect.rs new file mode 100644 index 0000000..dbbdeab --- /dev/null +++ b/rust/vendor/aes/src/autodetect.rs @@ -0,0 +1,259 @@ +//! Autodetection support for hardware accelerated AES backends with fallback +//! to the fixsliced "soft" implementation. + +use crate::{soft, Block, ParBlocks}; +use cipher::{ + consts::{U16, U24, U32, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; +use core::mem::ManuallyDrop; + +#[cfg(all(target_arch = "aarch64", feature = "armv8"))] +use crate::armv8 as intrinsics; + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +use crate::ni as intrinsics; + +cpufeatures::new!(aes_intrinsics, "aes"); + +macro_rules! define_aes_impl { + ( + $name:tt, + $module:tt, + $key_size:ty, + $doc:expr + ) => { + #[doc=$doc] + pub struct $name { + inner: $module::Inner, + token: aes_intrinsics::InitToken, + } + + mod $module { + use super::{intrinsics, soft}; + use core::mem::ManuallyDrop; + + pub(super) union Inner { + pub(super) intrinsics: ManuallyDrop<intrinsics::$name>, + pub(super) soft: ManuallyDrop<soft::$name>, + } + } + + impl NewBlockCipher for $name { + type KeySize = $key_size; + + #[inline] + fn new(key: &GenericArray<u8, $key_size>) -> Self { + let (token, aesni_present) = aes_intrinsics::init_get(); + + let inner = if aesni_present { + $module::Inner { + intrinsics: ManuallyDrop::new(intrinsics::$name::new(key)), + } + } else { + $module::Inner { + soft: ManuallyDrop::new(soft::$name::new(key)), + } + }; + + Self { inner, token } + } + } + + impl Clone for $name { + fn clone(&self) -> Self { + let inner = if self.token.get() { + $module::Inner { + intrinsics: unsafe { self.inner.intrinsics.clone() }, + } + } else { + $module::Inner { + soft: unsafe { self.inner.soft.clone() }, + } + }; + + Self { + inner, + token: self.token, + } + } + } + + impl BlockCipher for $name { + type BlockSize = U16; + type ParBlocks = U8; + } + + impl BlockEncrypt for $name { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + if self.token.get() { + unsafe { self.inner.intrinsics.encrypt_block(block) } + } else { + unsafe { self.inner.soft.encrypt_block(block) } + } + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + if self.token.get() { + unsafe { self.inner.intrinsics.encrypt_par_blocks(blocks) } + } else { + unsafe { self.inner.soft.encrypt_par_blocks(blocks) } + } + } + } + + impl BlockDecrypt for $name { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + if self.token.get() { + unsafe { self.inner.intrinsics.decrypt_block(block) } + } else { + unsafe { self.inner.soft.decrypt_block(block) } + } + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + if self.token.get() { + unsafe { self.inner.intrinsics.decrypt_par_blocks(blocks) } + } else { + unsafe { self.inner.soft.decrypt_par_blocks(blocks) } + } + } + } + + opaque_debug::implement!($name); + }; +} + +define_aes_impl!(Aes128, aes128, 
U16, "AES-128 block cipher instance"); +define_aes_impl!(Aes192, aes192, U24, "AES-192 block cipher instance"); +define_aes_impl!(Aes256, aes256, U32, "AES-256 block cipher instance"); + +#[cfg(all(feature = "ctr", target_arch = "aarch64"))] +pub(crate) mod ctr { + use super::{Aes128, Aes192, Aes256}; + + /// AES-128 in CTR mode + pub type Aes128Ctr = ::ctr::Ctr64BE<Aes128>; + + /// AES-192 in CTR mode + pub type Aes192Ctr = ::ctr::Ctr64BE<Aes192>; + + /// AES-256 in CTR mode + pub type Aes256Ctr = ::ctr::Ctr64BE<Aes256>; +} + +#[cfg(all(feature = "ctr", any(target_arch = "x86_64", target_arch = "x86")))] +pub(crate) mod ctr { + use super::{Aes128, Aes192, Aes256}; + use crate::{ni, soft}; + use cipher::{ + errors::{LoopError, OverflowError}, + generic_array::GenericArray, + BlockCipher, FromBlockCipher, SeekNum, StreamCipher, StreamCipherSeek, + }; + use core::mem::ManuallyDrop; + + cpufeatures::new!(aes_ssse3_cpuid, "aes", "ssse3"); + + macro_rules! define_aes_ctr_impl { + ( + $name:tt, + $cipher:ident, + $module:tt, + $doc:expr + ) => { + #[doc=$doc] + #[cfg_attr(docsrs, doc(cfg(feature = "ctr")))] + pub struct $name { + inner: $module::Inner, + token: aes_ssse3_cpuid::InitToken, + } + + mod $module { + use crate::{ni, soft}; + use core::mem::ManuallyDrop; + + pub(super) union Inner { + pub(super) ni: ManuallyDrop<ni::$name>, + pub(super) soft: ManuallyDrop<soft::$name>, + } + } + + impl FromBlockCipher for $name { + type BlockCipher = $cipher; + type NonceSize = <$cipher as BlockCipher>::BlockSize; + + fn from_block_cipher( + cipher: $cipher, + nonce: &GenericArray<u8, Self::NonceSize>, + ) -> Self { + let (token, aesni_present) = aes_ssse3_cpuid::init_get(); + + let inner = if aesni_present { + let ni = ni::$name::from_block_cipher( + unsafe { (*cipher.inner.intrinsics).clone() }, + nonce, + ); + + $module::Inner { + ni: ManuallyDrop::new(ni), + } + } else { + let soft = soft::$name::from_block_cipher( + unsafe { (*cipher.inner.soft).clone() }, + nonce, + ); + + $module::Inner { + soft: ManuallyDrop::new(soft), + } + }; + + Self { inner, token } + } + } + + impl StreamCipher for $name { + #[inline] + fn try_apply_keystream(&mut self, data: &mut [u8]) -> Result<(), LoopError> { + if self.token.get() { + unsafe { (*self.inner.ni).try_apply_keystream(data) } + } else { + unsafe { (*self.inner.soft).try_apply_keystream(data) } + } + } + } + + impl StreamCipherSeek for $name { + #[inline] + fn try_current_pos<T: SeekNum>(&self) -> Result<T, OverflowError> { + if self.token.get() { + unsafe { (*self.inner.ni).try_current_pos() } + } else { + unsafe { (*self.inner.soft).try_current_pos() } + } + } + + #[inline] + fn try_seek<T: SeekNum>(&mut self, pos: T) -> Result<(), LoopError> { + if self.token.get() { + unsafe { (*self.inner.ni).try_seek(pos) } + } else { + unsafe { (*self.inner.soft).try_seek(pos) } + } + } + } + + opaque_debug::implement!($name); + }; + } + + define_aes_ctr_impl!(Aes128Ctr, Aes128, aes128ctr, "AES-128 in CTR mode"); + define_aes_ctr_impl!(Aes192Ctr, Aes192, aes192ctr, "AES-192 in CTR mode"); + define_aes_ctr_impl!(Aes256Ctr, Aes256, aes256ctr, "AES-256 in CTR mode"); +} diff --git a/rust/vendor/aes/src/hazmat.rs b/rust/vendor/aes/src/hazmat.rs new file mode 100644 index 0000000..ff628e5 --- /dev/null +++ b/rust/vendor/aes/src/hazmat.rs @@ -0,0 +1,166 @@ +//! ⚠️ Low-level "hazmat" AES functions. +//! +//! # ☢️️ WARNING: HAZARDOUS API ☢️ +//! +//! This module contains an extremely low-level cryptographic primitive +//! 
which is likewise extremely difficult to use correctly. +//! +//! There are very few valid uses cases for this API. It's intended to be used +//! for implementing well-reviewed higher-level constructions. +//! +//! We do NOT recommending using it to implement any algorithm which has not +//! received extensive peer review by cryptographers. + +use crate::{soft::fixslice::hazmat as soft, Block, ParBlocks}; + +#[cfg(all( + target_arch = "aarch64", + feature = "armv8", + not(feature = "force-soft") +))] +use crate::armv8::hazmat as intrinsics; + +#[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + not(feature = "force-soft") +))] +use crate::ni::hazmat as intrinsics; + +#[cfg(all( + any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "aarch64", feature = "armv8") + ), + not(feature = "force-soft") +))] +cpufeatures::new!(aes_intrinsics, "aes"); + +/// Execute the provided body if CPU intrinsics are available. +// TODO(tarcieri): more `cfg-if`-like macro with an else branch? +macro_rules! if_intrinsics_available { + ($body:expr) => {{ + #[cfg(all( + any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "aarch64", feature = "armv8") + ), + not(feature = "force-soft") + ))] + if aes_intrinsics::get() { + unsafe { $body } + return; + } + }}; +} + +/// ⚠️ AES cipher (encrypt) round function. +/// +/// This API performs the following steps as described in FIPS 197 Appendix C: +/// +/// - `s_box`: state after `SubBytes()` +/// - `s_row`: state after `ShiftRows()` +/// - `m_col`: state after `MixColumns()` +/// - `k_sch`: key schedule value for `round[r]` +/// +/// This series of operations is equivalent to the Intel AES-NI `AESENC` instruction. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn cipher_round(block: &mut Block, round_key: &Block) { + if_intrinsics_available! { + intrinsics::cipher_round(block, round_key) + } + + soft::cipher_round(block, round_key); +} + +/// ⚠️ AES cipher (encrypt) round function: parallel version. +/// +/// Equivalent to [`cipher_round`], but acts on 8 blocks-at-a-time, applying +/// the same number of round keys. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + if_intrinsics_available! { + intrinsics::cipher_round_par(blocks, round_keys) + } + + soft::cipher_round_par(blocks, round_keys); +} + +/// ⚠️ AES equivalent inverse cipher (decrypt) round function. +/// +/// This API performs the following steps as described in FIPS 197 Appendix C: +/// +/// - `is_box`: state after `InvSubBytes()` +/// - `is_row`: state after `InvShiftRows()` +/// - `im_col`: state after `InvMixColumns()` +/// - `ik_sch`: key schedule value for `round[r]` +/// +/// This series of operations is equivalent to the Intel AES-NI `AESDEC` instruction. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { + if_intrinsics_available! { + intrinsics::equiv_inv_cipher_round(block, round_key) + } + + soft::equiv_inv_cipher_round(block, round_key); +} + +/// ⚠️ AES equivalent inverse cipher (decrypt) round function: parallel version. 
+/// +/// Equivalent to [`equiv_inv_cipher_round`], but acts on 8 blocks-at-a-time, +/// applying the same number of round keys. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + if_intrinsics_available! { + intrinsics::equiv_inv_cipher_round_par(blocks, round_keys) + } + + soft::equiv_inv_cipher_round_par(blocks, round_keys); +} + +/// ⚠️ AES mix columns function. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn mix_columns(block: &mut Block) { + if_intrinsics_available! { + intrinsics::mix_columns(block) + } + + soft::mix_columns(block); +} + +/// ⚠️ AES inverse mix columns function. +/// +/// This function is equivalent to the Intel AES-NI `AESIMC` instruction. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn inv_mix_columns(block: &mut Block) { + if_intrinsics_available! { + intrinsics::inv_mix_columns(block) + } + + soft::inv_mix_columns(block); +} diff --git a/rust/vendor/aes/src/lib.rs b/rust/vendor/aes/src/lib.rs new file mode 100644 index 0000000..f33183c --- /dev/null +++ b/rust/vendor/aes/src/lib.rs @@ -0,0 +1,138 @@ +//! Pure Rust implementation of the Advanced Encryption Standard +//! (a.k.a. Rijndael) +//! +//! # Supported backends +//! This crate provides multiple backends including a portable pure Rust +//! backend as well as ones based on CPU intrinsics. +//! +//! By default, it performs runtime detection of CPU intrinsics and uses them +//! if they are available. +//! +//! ## "soft" portable backend +//! As a baseline implementation, this crate provides a constant-time pure Rust +//! implementation based on [fixslicing], a more advanced form of bitslicing +//! implemented entirely in terms of bitwise arithmetic with no use of any +//! lookup tables or data-dependent branches. +//! +//! Enabling the `compact` Cargo feature will reduce the code size of this +//! backend at the cost of decreased performance (using a modified form of +//! the fixslicing technique called "semi-fixslicing"). +//! +//! ## ARMv8 intrinsics (nightly-only) +//! On `aarch64` targets including `aarch64-apple-darwin` (Apple M1) and Linux +//! targets such as `aarch64-unknown-linux-gnu` and `aarch64-unknown-linux-musl`, +//! support for using AES intrinsics provided by the ARMv8 Cryptography Extensions +//! is available when using the nightly compiler, and can be enabled using the +//! `armv8` crate feature. +//! +//! On Linux and macOS, when the `armv8` feature is enabled support for AES +//! intrinsics is autodetected at runtime. On other platforms the `aes` +//! target feature must be enabled via RUSTFLAGS. +//! +//! ## `x86`/`x86_64` intrinsics (AES-NI) +//! By default this crate uses runtime detection on `i686`/`x86_64` targets +//! in order to determine if AES-NI is available, and if it is not, it will +//! fallback to using a constant-time software implementation. +//! +//! Passing `RUSTFLAGS=-Ctarget-feature=+aes,+ssse3` explicitly at compile-time +//! will override runtime detection and ensure that AES-NI is always used. +//! Programs built in this manner will crash with an illegal instruction on +//! CPUs which do not have AES-NI enabled. +//! +//! 
Note: runtime detection is not possible on SGX targets. Please use the +//! afforementioned `RUSTFLAGS` to leverage AES-NI on these targets. +//! +//! # Usage example +//! ``` +//! use aes::{Aes128, Block, ParBlocks}; +//! use aes::cipher::{ +//! BlockCipher, BlockEncrypt, BlockDecrypt, NewBlockCipher, +//! generic_array::GenericArray, +//! }; +//! +//! let key = GenericArray::from_slice(&[0u8; 16]); +//! let mut block = Block::default(); +//! let mut block8 = ParBlocks::default(); +//! +//! // Initialize cipher +//! let cipher = Aes128::new(&key); +//! +//! let block_copy = block.clone(); +//! +//! // Encrypt block in-place +//! cipher.encrypt_block(&mut block); +//! +//! // And decrypt it back +//! cipher.decrypt_block(&mut block); +//! assert_eq!(block, block_copy); +//! +//! // We can encrypt 8 blocks simultaneously using +//! // instruction-level parallelism +//! let block8_copy = block8.clone(); +//! cipher.encrypt_par_blocks(&mut block8); +//! cipher.decrypt_par_blocks(&mut block8); +//! assert_eq!(block8, block8_copy); +//! ``` +//! +//! For implementations of block cipher modes of operation see +//! [`block-modes`] crate. +//! +//! [fixslicing]: https://eprint.iacr.org/2020/1123.pdf +//! [AES-NI]: https://en.wikipedia.org/wiki/AES_instruction_set +//! [`block-modes`]: https://docs.rs/block-modes + +#![no_std] +#![cfg_attr( + all(feature = "armv8", target_arch = "aarch64"), + feature(stdsimd, aarch64_target_feature) +)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc( + html_logo_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo.svg", + html_favicon_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo.svg" +)] +#![warn(missing_docs, rust_2018_idioms)] + +#[cfg(feature = "hazmat")] +pub mod hazmat; + +mod soft; + +use cfg_if::cfg_if; + +cfg_if! { + if #[cfg(all(target_arch = "aarch64", feature = "armv8", not(feature = "force-soft")))] { + mod armv8; + mod autodetect; + pub use autodetect::{Aes128, Aes192, Aes256}; + + #[cfg(feature = "ctr")] + pub use autodetect::ctr::{Aes128Ctr, Aes192Ctr, Aes256Ctr}; + } else if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + not(feature = "force-soft") + ))] { + mod autodetect; + mod ni; + pub use autodetect::{Aes128, Aes192, Aes256}; + + #[cfg(feature = "ctr")] + pub use autodetect::ctr::{Aes128Ctr, Aes192Ctr, Aes256Ctr}; + } else { + pub use soft::{Aes128, Aes192, Aes256}; + + #[cfg(feature = "ctr")] + pub use soft::{Aes128Ctr, Aes192Ctr, Aes256Ctr}; + } +} + +pub use cipher::{self, BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher}; + +/// 128-bit AES block +pub type Block = cipher::generic_array::GenericArray<u8, cipher::consts::U16>; + +/// 8 x 128-bit AES blocks to be processed in parallel +pub type ParBlocks = cipher::generic_array::GenericArray<Block, cipher::consts::U8>; + +/// Size of an AES block (128-bits; 16-bytes) +pub const BLOCK_SIZE: usize = 16; diff --git a/rust/vendor/aes/src/ni.rs b/rust/vendor/aes/src/ni.rs new file mode 100644 index 0000000..56e8e1f --- /dev/null +++ b/rust/vendor/aes/src/ni.rs @@ -0,0 +1,45 @@ +//! AES block ciphers implementation using AES-NI instruction set. +//! +//! Ciphers functionality is accessed using `BlockCipher` trait from the +//! [`cipher`](https://docs.rs/cipher) crate. +//! +//! # CTR mode +//! In addition to core block cipher functionality this crate provides optimized +//! CTR mode implementation. This functionality requires additional `ssse3` +//! target feature and feature-gated behind `ctr` feature flag, which is enabled +//! 
by default. +//! +//! # Vulnerability +//! Lazy FP state restory vulnerability can allow local process to leak content +//! of the FPU register, in which round keys are stored. This vulnerability +//! can be mitigated at the operating system level by installing relevant +//! patches. (i.e. keep your OS updated!) More info: +//! - [Intel advisory](https://www.intel.com/content/www/us/en/security-center/advisory/intel-sa-00145.html) +//! - [Wikipedia](https://en.wikipedia.org/wiki/Lazy_FP_state_restore) +//! +//! # Related documents +//! - [Intel AES-NI whitepaper](https://software.intel.com/sites/default/files/article/165683/aes-wp-2012-09-22-v01.pdf) +//! - [Use of the AES Instruction Set](https://www.cosic.esat.kuleuven.be/ecrypt/AESday/slides/Use_of_the_AES_Instruction_Set.pdf) + +#[macro_use] +mod utils; + +mod aes128; +mod aes192; +mod aes256; + +#[cfg(feature = "ctr")] +mod ctr; + +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat; + +#[cfg(target_arch = "x86")] +use core::arch::x86 as arch; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64 as arch; + +pub use self::{aes128::Aes128, aes192::Aes192, aes256::Aes256}; + +#[cfg(feature = "ctr")] +pub use self::ctr::{Aes128Ctr, Aes192Ctr, Aes256Ctr}; diff --git a/rust/vendor/aes/src/ni/aes128.rs b/rust/vendor/aes/src/ni/aes128.rs new file mode 100644 index 0000000..f079fdd --- /dev/null +++ b/rust/vendor/aes/src/ni/aes128.rs @@ -0,0 +1,163 @@ +use super::{ + arch::*, + utils::{aesdec8, aesdeclast8, aesenc8, aesenclast8, load8, store8, xor8, U128x8}, +}; +use crate::{Block, ParBlocks}; +use cipher::{ + consts::{U16, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; + +mod expand; +#[cfg(test)] +mod test_expand; + +/// AES-128 round keys +type RoundKeys = [__m128i; 11]; + +/// AES-128 block cipher +#[derive(Clone)] +pub struct Aes128 { + encrypt_keys: RoundKeys, + decrypt_keys: RoundKeys, +} + +impl Aes128 { + #[inline(always)] + pub(crate) fn encrypt8(&self, mut blocks: U128x8) -> U128x8 { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni128_encrypt8(keys: &RoundKeys, blocks: &mut U128x8) { + xor8(blocks, keys[0]); + aesenc8(blocks, keys[1]); + aesenc8(blocks, keys[2]); + aesenc8(blocks, keys[3]); + aesenc8(blocks, keys[4]); + aesenc8(blocks, keys[5]); + aesenc8(blocks, keys[6]); + aesenc8(blocks, keys[7]); + aesenc8(blocks, keys[8]); + aesenc8(blocks, keys[9]); + aesenclast8(blocks, keys[10]); + } + unsafe { aesni128_encrypt8(&self.encrypt_keys, &mut blocks) }; + blocks + } + + #[inline(always)] + pub(crate) fn encrypt(&self, block: __m128i) -> __m128i { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni128_encrypt1(keys: &RoundKeys, mut block: __m128i) -> __m128i { + block = _mm_xor_si128(block, keys[0]); + block = _mm_aesenc_si128(block, keys[1]); + block = _mm_aesenc_si128(block, keys[2]); + block = _mm_aesenc_si128(block, keys[3]); + block = _mm_aesenc_si128(block, keys[4]); + block = _mm_aesenc_si128(block, keys[5]); + block = _mm_aesenc_si128(block, keys[6]); + block = _mm_aesenc_si128(block, keys[7]); + block = _mm_aesenc_si128(block, keys[8]); + block = _mm_aesenc_si128(block, keys[9]); + _mm_aesenclast_si128(block, keys[10]) + } + unsafe { aesni128_encrypt1(&self.encrypt_keys, block) } + } +} + +impl NewBlockCipher for Aes128 { + type KeySize = U16; + + #[inline] + fn new(key: &GenericArray<u8, U16>) -> Self { + let key = unsafe { &*(key as *const _ as *const [u8; 16]) }; + + let (encrypt_keys, decrypt_keys) = expand::expand(key); + + Self { + 
encrypt_keys, + decrypt_keys, + } + } +} + +impl BlockCipher for Aes128 { + type BlockSize = U16; + type ParBlocks = U8; +} + +impl BlockEncrypt for Aes128 { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + unsafe { + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let b = self.encrypt(b); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + let b = self.encrypt8(load8(blocks)); + store8(blocks, b); + } +} + +impl BlockDecrypt for Aes128 { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes128_decrypt1(block: &mut Block, keys: &RoundKeys) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + + b = _mm_xor_si128(b, keys[10]); + b = _mm_aesdec_si128(b, keys[9]); + b = _mm_aesdec_si128(b, keys[8]); + b = _mm_aesdec_si128(b, keys[7]); + b = _mm_aesdec_si128(b, keys[6]); + b = _mm_aesdec_si128(b, keys[5]); + b = _mm_aesdec_si128(b, keys[4]); + b = _mm_aesdec_si128(b, keys[3]); + b = _mm_aesdec_si128(b, keys[2]); + b = _mm_aesdec_si128(b, keys[1]); + b = _mm_aesdeclast_si128(b, keys[0]); + + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + + unsafe { aes128_decrypt1(block, &self.decrypt_keys) } + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes128_decrypt8(blocks: &mut ParBlocks, keys: &RoundKeys) { + let mut b = load8(blocks); + xor8(&mut b, keys[10]); + aesdec8(&mut b, keys[9]); + aesdec8(&mut b, keys[8]); + aesdec8(&mut b, keys[7]); + aesdec8(&mut b, keys[6]); + aesdec8(&mut b, keys[5]); + aesdec8(&mut b, keys[4]); + aesdec8(&mut b, keys[3]); + aesdec8(&mut b, keys[2]); + aesdec8(&mut b, keys[1]); + aesdeclast8(&mut b, keys[0]); + store8(blocks, b); + } + + unsafe { aes128_decrypt8(blocks, &self.decrypt_keys) } + } +} + +opaque_debug::implement!(Aes128); diff --git a/rust/vendor/aes/src/ni/aes128/expand.rs b/rust/vendor/aes/src/ni/aes128/expand.rs new file mode 100644 index 0000000..f7b65b6 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes128/expand.rs @@ -0,0 +1,53 @@ +use super::RoundKeys; +use crate::ni::arch::*; + +use core::mem; + +macro_rules! 
expand_round { + ($enc_keys:expr, $dec_keys:expr, $pos:expr, $round:expr) => { + let mut t1 = $enc_keys[$pos - 1]; + let mut t2; + let mut t3; + + t2 = _mm_aeskeygenassist_si128(t1, $round); + t2 = _mm_shuffle_epi32(t2, 0xff); + t3 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t3); + t3 = _mm_slli_si128(t3, 0x4); + t1 = _mm_xor_si128(t1, t3); + t3 = _mm_slli_si128(t3, 0x4); + t1 = _mm_xor_si128(t1, t3); + t1 = _mm_xor_si128(t1, t2); + + $enc_keys[$pos] = t1; + let t1 = if $pos != 10 { _mm_aesimc_si128(t1) } else { t1 }; + $dec_keys[$pos] = t1; + }; +} + +#[inline(always)] +pub(super) fn expand(key: &[u8; 16]) -> (RoundKeys, RoundKeys) { + unsafe { + let mut enc_keys: RoundKeys = mem::zeroed(); + let mut dec_keys: RoundKeys = mem::zeroed(); + + // Safety: `loadu` supports unaligned loads + #[allow(clippy::cast_ptr_alignment)] + let k = _mm_loadu_si128(key.as_ptr() as *const __m128i); + enc_keys[0] = k; + dec_keys[0] = k; + + expand_round!(enc_keys, dec_keys, 1, 0x01); + expand_round!(enc_keys, dec_keys, 2, 0x02); + expand_round!(enc_keys, dec_keys, 3, 0x04); + expand_round!(enc_keys, dec_keys, 4, 0x08); + expand_round!(enc_keys, dec_keys, 5, 0x10); + expand_round!(enc_keys, dec_keys, 6, 0x20); + expand_round!(enc_keys, dec_keys, 7, 0x40); + expand_round!(enc_keys, dec_keys, 8, 0x80); + expand_round!(enc_keys, dec_keys, 9, 0x1B); + expand_round!(enc_keys, dec_keys, 10, 0x36); + + (enc_keys, dec_keys) + } +} diff --git a/rust/vendor/aes/src/ni/aes128/test_expand.rs b/rust/vendor/aes/src/ni/aes128/test_expand.rs new file mode 100644 index 0000000..38744e6 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes128/test_expand.rs @@ -0,0 +1,107 @@ +use super::expand::expand; +use crate::ni::utils::check; + +#[test] +fn test() { + let enc_keys = expand(&[0x00; 16]).0; + check( + &enc_keys, + &[ + [0x0000000000000000, 0x0000000000000000], + [0x6263636362636363, 0x6263636362636363], + [0x9b9898c9f9fbfbaa, 0x9b9898c9f9fbfbaa], + [0x90973450696ccffa, 0xf2f457330b0fac99], + [0xee06da7b876a1581, 0x759e42b27e91ee2b], + [0x7f2e2b88f8443e09, 0x8dda7cbbf34b9290], + [0xec614b851425758c, 0x99ff09376ab49ba7], + [0x217517873550620b, 0xacaf6b3cc61bf09b], + [0x0ef903333ba96138, 0x97060a04511dfa9f], + [0xb1d4d8e28a7db9da, 0x1d7bb3de4c664941], + [0xb4ef5bcb3e92e211, 0x23e951cf6f8f188e], + ], + ); + + let enc_keys = expand(&[0xff; 16]).0; + check( + &enc_keys, + &[ + [0xffffffffffffffff, 0xffffffffffffffff], + [0xe8e9e9e917161616, 0xe8e9e9e917161616], + [0xadaeae19bab8b80f, 0x525151e6454747f0], + [0x090e2277b3b69a78, 0xe1e7cb9ea4a08c6e], + [0xe16abd3e52dc2746, 0xb33becd8179b60b6], + [0xe5baf3ceb766d488, 0x045d385013c658e6], + [0x71d07db3c6b6a93b, 0xc2eb916bd12dc98d], + [0xe90d208d2fbb89b6, 0xed5018dd3c7dd150], + [0x96337366b988fad0, 0x54d8e20d68a5335d], + [0x8bf03f233278c5f3, 0x66a027fe0e0514a3], + [0xd60a3588e472f07b, 0x82d2d7858cd7c326], + ], + ); + + let enc_keys = expand(&[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, + ]) + .0; + check( + &enc_keys, + &[ + [0x0001020304050607, 0x08090a0b0c0d0e0f], + [0xd6aa74fdd2af72fa, 0xdaa678f1d6ab76fe], + [0xb692cf0b643dbdf1, 0xbe9bc5006830b3fe], + [0xb6ff744ed2c2c9bf, 0x6c590cbf0469bf41], + [0x47f7f7bc95353e03, 0xf96c32bcfd058dfd], + [0x3caaa3e8a99f9deb, 0x50f3af57adf622aa], + [0x5e390f7df7a69296, 0xa7553dc10aa31f6b], + [0x14f9701ae35fe28c, 0x440adf4d4ea9c026], + [0x47438735a41c65b9, 0xe016baf4aebf7ad2], + [0x549932d1f0855768, 0x1093ed9cbe2c974e], + [0x13111d7fe3944a17, 0xf307a78b4d2b30c5], + ], + ); + + let enc_keys = 
expand(&[ + 0x69, 0x20, 0xe2, 0x99, 0xa5, 0x20, 0x2a, 0x6d, 0x65, 0x6e, 0x63, 0x68, 0x69, 0x74, 0x6f, + 0x2a, + ]) + .0; + check( + &enc_keys, + &[ + [0x6920e299a5202a6d, 0x656e636869746f2a], + [0xfa8807605fa82d0d, 0x3ac64e6553b2214f], + [0xcf75838d90ddae80, 0xaa1be0e5f9a9c1aa], + [0x180d2f1488d08194, 0x22cb6171db62a0db], + [0xbaed96ad323d1739, 0x10f67648cb94d693], + [0x881b4ab2ba265d8b, 0xaad02bc36144fd50], + [0xb34f195d096944d6, 0xa3b96f15c2fd9245], + [0xa7007778ae6933ae, 0x0dd05cbbcf2dcefe], + [0xff8bccf251e2ff5c, 0x5c32a3e7931f6d19], + [0x24b7182e7555e772, 0x29674495ba78298c], + [0xae127cdadb479ba8, 0xf220df3d4858f6b1], + ], + ); + + let enc_keys = expand(&[ + 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, + 0x3c, + ]) + .0; + check( + &enc_keys, + &[ + [0x2b7e151628aed2a6, 0xabf7158809cf4f3c], + [0xa0fafe1788542cb1, 0x23a339392a6c7605], + [0xf2c295f27a96b943, 0x5935807a7359f67f], + [0x3d80477d4716fe3e, 0x1e237e446d7a883b], + [0xef44a541a8525b7f, 0xb671253bdb0bad00], + [0xd4d1c6f87c839d87, 0xcaf2b8bc11f915bc], + [0x6d88a37a110b3efd, 0xdbf98641ca0093fd], + [0x4e54f70e5f5fc9f3, 0x84a64fb24ea6dc4f], + [0xead27321b58dbad2, 0x312bf5607f8d292f], + [0xac7766f319fadc21, 0x28d12941575c006e], + [0xd014f9a8c9ee2589, 0xe13f0cc8b6630ca6], + ], + ); +} diff --git a/rust/vendor/aes/src/ni/aes192.rs b/rust/vendor/aes/src/ni/aes192.rs new file mode 100644 index 0000000..fb64289 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes192.rs @@ -0,0 +1,169 @@ +use super::{ + arch::*, + utils::{aesdec8, aesdeclast8, aesenc8, aesenclast8, load8, store8, xor8, U128x8}, +}; +use crate::{Block, ParBlocks}; +use cipher::{ + consts::{U16, U24, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; + +mod expand; +#[cfg(test)] +mod test_expand; + +/// AES-192 round keys +type RoundKeys = [__m128i; 13]; + +/// AES-192 block cipher +#[derive(Clone)] +pub struct Aes192 { + encrypt_keys: RoundKeys, + decrypt_keys: RoundKeys, +} + +impl Aes192 { + #[inline(always)] + pub(crate) fn encrypt8(&self, mut blocks: U128x8) -> U128x8 { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni192_encrypt8(keys: &RoundKeys, blocks: &mut U128x8) { + xor8(blocks, keys[0]); + aesenc8(blocks, keys[1]); + aesenc8(blocks, keys[2]); + aesenc8(blocks, keys[3]); + aesenc8(blocks, keys[4]); + aesenc8(blocks, keys[5]); + aesenc8(blocks, keys[6]); + aesenc8(blocks, keys[7]); + aesenc8(blocks, keys[8]); + aesenc8(blocks, keys[9]); + aesenc8(blocks, keys[10]); + aesenc8(blocks, keys[11]); + aesenclast8(blocks, keys[12]); + } + unsafe { aesni192_encrypt8(&self.encrypt_keys, &mut blocks) }; + blocks + } + + #[inline(always)] + pub(crate) fn encrypt(&self, block: __m128i) -> __m128i { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni192_encrypt1(keys: &RoundKeys, mut block: __m128i) -> __m128i { + block = _mm_xor_si128(block, keys[0]); + block = _mm_aesenc_si128(block, keys[1]); + block = _mm_aesenc_si128(block, keys[2]); + block = _mm_aesenc_si128(block, keys[3]); + block = _mm_aesenc_si128(block, keys[4]); + block = _mm_aesenc_si128(block, keys[5]); + block = _mm_aesenc_si128(block, keys[6]); + block = _mm_aesenc_si128(block, keys[7]); + block = _mm_aesenc_si128(block, keys[8]); + block = _mm_aesenc_si128(block, keys[9]); + block = _mm_aesenc_si128(block, keys[10]); + block = _mm_aesenc_si128(block, keys[11]); + _mm_aesenclast_si128(block, keys[12]) + } + unsafe { aesni192_encrypt1(&self.encrypt_keys, block) } + } +} + +impl NewBlockCipher 
for Aes192 { + type KeySize = U24; + + #[inline] + fn new(key: &GenericArray<u8, U24>) -> Self { + let key = unsafe { &*(key as *const _ as *const [u8; 24]) }; + let (encrypt_keys, decrypt_keys) = expand::expand(key); + Self { + encrypt_keys, + decrypt_keys, + } + } +} + +impl BlockCipher for Aes192 { + type BlockSize = U16; + type ParBlocks = U8; +} + +impl BlockEncrypt for Aes192 { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + unsafe { + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let b = self.encrypt(b); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + let b = self.encrypt8(load8(blocks)); + store8(blocks, b); + } +} + +impl BlockDecrypt for Aes192 { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes192_decrypt1(block: &mut Block, keys: &RoundKeys) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + + b = _mm_xor_si128(b, keys[12]); + b = _mm_aesdec_si128(b, keys[11]); + b = _mm_aesdec_si128(b, keys[10]); + b = _mm_aesdec_si128(b, keys[9]); + b = _mm_aesdec_si128(b, keys[8]); + b = _mm_aesdec_si128(b, keys[7]); + b = _mm_aesdec_si128(b, keys[6]); + b = _mm_aesdec_si128(b, keys[5]); + b = _mm_aesdec_si128(b, keys[4]); + b = _mm_aesdec_si128(b, keys[3]); + b = _mm_aesdec_si128(b, keys[2]); + b = _mm_aesdec_si128(b, keys[1]); + b = _mm_aesdeclast_si128(b, keys[0]); + + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + + unsafe { aes192_decrypt1(block, &self.decrypt_keys) } + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes192_decrypt8(blocks: &mut ParBlocks, keys: &RoundKeys) { + let mut b = load8(blocks); + xor8(&mut b, keys[12]); + aesdec8(&mut b, keys[11]); + aesdec8(&mut b, keys[10]); + aesdec8(&mut b, keys[9]); + aesdec8(&mut b, keys[8]); + aesdec8(&mut b, keys[7]); + aesdec8(&mut b, keys[6]); + aesdec8(&mut b, keys[5]); + aesdec8(&mut b, keys[4]); + aesdec8(&mut b, keys[3]); + aesdec8(&mut b, keys[2]); + aesdec8(&mut b, keys[1]); + aesdeclast8(&mut b, keys[0]); + store8(blocks, b); + } + + unsafe { aes192_decrypt8(blocks, &self.decrypt_keys) } + } +} + +opaque_debug::implement!(Aes192); diff --git a/rust/vendor/aes/src/ni/aes192/expand.rs b/rust/vendor/aes/src/ni/aes192/expand.rs new file mode 100644 index 0000000..797b986 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes192/expand.rs @@ -0,0 +1,108 @@ +use super::RoundKeys; +use crate::ni::arch::*; + +use core::{mem, ptr}; + +macro_rules! expand_round { + ($t1:expr, $t3:expr, $round:expr) => {{ + let mut t1 = $t1; + let mut t2; + let mut t3 = $t3; + let mut t4; + + t2 = _mm_aeskeygenassist_si128(t3, $round); + t2 = _mm_shuffle_epi32(t2, 0x55); + t4 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t1 = _mm_xor_si128(t1, t2); + t2 = _mm_shuffle_epi32(t1, 0xff); + t4 = _mm_slli_si128(t3, 0x4); + t3 = _mm_xor_si128(t3, t4); + t3 = _mm_xor_si128(t3, t2); + + (t1, t3) + }}; +} + +macro_rules! 
shuffle { + ($a:expr, $b:expr, $imm:expr) => { + mem::transmute::<_, __m128i>(_mm_shuffle_pd(mem::transmute($a), mem::transmute($b), $imm)) + }; +} + +#[inline(always)] +pub(super) fn expand(key: &[u8; 24]) -> (RoundKeys, RoundKeys) { + unsafe { + let mut enc_keys: RoundKeys = mem::zeroed(); + let mut dec_keys: RoundKeys = mem::zeroed(); + + macro_rules! store { + ($i:expr, $k:expr) => { + enc_keys[$i] = $k; + dec_keys[$i] = _mm_aesimc_si128($k); + }; + } + + // we are being extra pedantic here to remove out-of-bound access. + // this should be optimized out into movups, movsd sequence + // note that unaligned load MUST be used here, even though we read + // from the array (compiler missoptimizes aligned load) + let (k0, k1l) = { + let mut t = [0u8; 32]; + ptr::write(t.as_mut_ptr() as *mut [u8; 24], *key); + + // Safety: `loadu` supports unaligned loads + #[allow(clippy::cast_ptr_alignment)] + ( + _mm_loadu_si128(t.as_ptr() as *const __m128i), + _mm_loadu_si128(t.as_ptr().offset(16) as *const __m128i), + ) + }; + + enc_keys[0] = k0; + dec_keys[0] = k0; + + let (k1_2, k2r) = expand_round!(k0, k1l, 0x01); + let k1 = shuffle!(k1l, k1_2, 0); + let k2 = shuffle!(k1_2, k2r, 1); + store!(1, k1); + store!(2, k2); + + let (k3, k4l) = expand_round!(k1_2, k2r, 0x02); + store!(3, k3); + + let (k4_5, k5r) = expand_round!(k3, k4l, 0x04); + let k4 = shuffle!(k4l, k4_5, 0); + let k5 = shuffle!(k4_5, k5r, 1); + store!(4, k4); + store!(5, k5); + + let (k6, k7l) = expand_round!(k4_5, k5r, 0x08); + store!(6, k6); + + let (k7_8, k8r) = expand_round!(k6, k7l, 0x10); + let k7 = shuffle!(k7l, k7_8, 0); + let k8 = shuffle!(k7_8, k8r, 1); + store!(7, k7); + store!(8, k8); + + let (k9, k10l) = expand_round!(k7_8, k8r, 0x20); + store!(9, k9); + + let (k10_11, k11r) = expand_round!(k9, k10l, 0x40); + let k10 = shuffle!(k10l, k10_11, 0); + let k11 = shuffle!(k10_11, k11r, 1); + store!(10, k10); + store!(11, k11); + + let (k12, _) = expand_round!(k10_11, k11r, 0x80); + enc_keys[12] = k12; + dec_keys[12] = k12; + + (enc_keys, dec_keys) + } +} diff --git a/rust/vendor/aes/src/ni/aes192/test_expand.rs b/rust/vendor/aes/src/ni/aes192/test_expand.rs new file mode 100644 index 0000000..7811d4c --- /dev/null +++ b/rust/vendor/aes/src/ni/aes192/test_expand.rs @@ -0,0 +1,93 @@ +use super::expand::expand; +use crate::ni::utils::check; + +#[test] +fn test() { + let enc_keys = expand(&[0x00; 24]).0; + check( + &enc_keys, + &[ + [0x0000000000000000, 0x0000000000000000], + [0x0000000000000000, 0x6263636362636363], + [0x6263636362636363, 0x6263636362636363], + [0x9b9898c9f9fbfbaa, 0x9b9898c9f9fbfbaa], + [0x9b9898c9f9fbfbaa, 0x90973450696ccffa], + [0xf2f457330b0fac99, 0x90973450696ccffa], + [0xc81d19a9a171d653, 0x53858160588a2df9], + [0xc81d19a9a171d653, 0x7bebf49bda9a22c8], + [0x891fa3a8d1958e51, 0x198897f8b8f941ab], + [0xc26896f718f2b43f, 0x91ed1797407899c6], + [0x59f00e3ee1094f95, 0x83ecbc0f9b1e0830], + [0x0af31fa74a8b8661, 0x137b885ff272c7ca], + [0x432ac886d834c0b6, 0xd2c7df11984c5970], + ], + ); + + let enc_keys = expand(&[0xff; 24]).0; + check( + &enc_keys, + &[ + [0xffffffffffffffff, 0xffffffffffffffff], + [0xffffffffffffffff, 0xe8e9e9e917161616], + [0xe8e9e9e917161616, 0xe8e9e9e917161616], + [0xadaeae19bab8b80f, 0x525151e6454747f0], + [0xadaeae19bab8b80f, 0xc5c2d8ed7f7a60e2], + [0x2d2b3104686c76f4, 0xc5c2d8ed7f7a60e2], + [0x1712403f686820dd, 0x454311d92d2f672d], + [0xe8edbfc09797df22, 0x8f8cd3b7e7e4f36a], + [0xa2a7e2b38f88859e, 0x67653a5ef0f2e57c], + [0x2655c33bc1b13051, 0x6316d2e2ec9e577c], + [0x8bfb6d227b09885e, 
0x67919b1aa620ab4b], + [0xc53679a929a82ed5, 0xa25343f7d95acba9], + [0x598e482fffaee364, 0x3a989acd1330b418], + ], + ); + + let enc_keys = expand(&[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + ]) + .0; + check( + &enc_keys, + &[ + [0x0001020304050607, 0x08090a0b0c0d0e0f], + [0x1011121314151617, 0x5846f2f95c43f4fe], + [0x544afef55847f0fa, 0x4856e2e95c43f4fe], + [0x40f949b31cbabd4d, 0x48f043b810b7b342], + [0x58e151ab04a2a555, 0x7effb5416245080c], + [0x2ab54bb43a02f8f6, 0x62e3a95d66410c08], + [0xf501857297448d7e, 0xbdf1c6ca87f33e3c], + [0xe510976183519b69, 0x34157c9ea351f1e0], + [0x1ea0372a99530916, 0x7c439e77ff12051e], + [0xdd7e0e887e2fff68, 0x608fc842f9dcc154], + [0x859f5f237a8d5a3d, 0xc0c02952beefd63a], + [0xde601e7827bcdf2c, 0xa223800fd8aeda32], + [0xa4970a331a78dc09, 0xc418c271e3a41d5d], + ], + ); + + let enc_keys = expand(&[ + 0x8e, 0x73, 0xb0, 0xf7, 0xda, 0x0e, 0x64, 0x52, 0xc8, 0x10, 0xf3, 0x2b, 0x80, 0x90, 0x79, + 0xe5, 0x62, 0xf8, 0xea, 0xd2, 0x52, 0x2c, 0x6b, 0x7b, + ]) + .0; + check( + &enc_keys, + &[ + [0x8e73b0f7da0e6452, 0xc810f32b809079e5], + [0x62f8ead2522c6b7b, 0xfe0c91f72402f5a5], + [0xec12068e6c827f6b, 0x0e7a95b95c56fec2], + [0x4db7b4bd69b54118, 0x85a74796e92538fd], + [0xe75fad44bb095386, 0x485af05721efb14f], + [0xa448f6d94d6dce24, 0xaa326360113b30e6], + [0xa25e7ed583b1cf9a, 0x27f939436a94f767], + [0xc0a69407d19da4e1, 0xec1786eb6fa64971], + [0x485f703222cb8755, 0xe26d135233f0b7b3], + [0x40beeb282f18a259, 0x6747d26b458c553e], + [0xa7e1466c9411f1df, 0x821f750aad07d753], + [0xca4005388fcc5006, 0x282d166abc3ce7b5], + [0xe98ba06f448c773c, 0x8ecc720401002202], + ], + ); +} diff --git a/rust/vendor/aes/src/ni/aes256.rs b/rust/vendor/aes/src/ni/aes256.rs new file mode 100644 index 0000000..9a752c1 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes256.rs @@ -0,0 +1,177 @@ +use super::{ + arch::*, + utils::{aesdec8, aesdeclast8, aesenc8, aesenclast8, load8, store8, xor8, U128x8}, +}; +use crate::{Block, ParBlocks}; +use cipher::{ + consts::{U16, U32, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; + +mod expand; +#[cfg(test)] +mod test_expand; + +/// AES-256 round keys +type RoundKeys = [__m128i; 15]; + +/// AES-256 block cipher +#[derive(Clone)] +pub struct Aes256 { + encrypt_keys: RoundKeys, + decrypt_keys: RoundKeys, +} + +impl Aes256 { + #[inline(always)] + pub(crate) fn encrypt8(&self, mut blocks: U128x8) -> U128x8 { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni256_encrypt8(keys: &RoundKeys, blocks: &mut U128x8) { + xor8(blocks, keys[0]); + aesenc8(blocks, keys[1]); + aesenc8(blocks, keys[2]); + aesenc8(blocks, keys[3]); + aesenc8(blocks, keys[4]); + aesenc8(blocks, keys[5]); + aesenc8(blocks, keys[6]); + aesenc8(blocks, keys[7]); + aesenc8(blocks, keys[8]); + aesenc8(blocks, keys[9]); + aesenc8(blocks, keys[10]); + aesenc8(blocks, keys[11]); + aesenc8(blocks, keys[12]); + aesenc8(blocks, keys[13]); + aesenclast8(blocks, keys[14]); + } + unsafe { aesni256_encrypt8(&self.encrypt_keys, &mut blocks) }; + blocks + } + + #[inline(always)] + pub(crate) fn encrypt(&self, block: __m128i) -> __m128i { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni256_encrypt1(keys: &RoundKeys, mut block: __m128i) -> __m128i { + block = _mm_xor_si128(block, keys[0]); + block = _mm_aesenc_si128(block, keys[1]); + block = _mm_aesenc_si128(block, keys[2]); + block = _mm_aesenc_si128(block, keys[3]); + block = 
_mm_aesenc_si128(block, keys[4]); + block = _mm_aesenc_si128(block, keys[5]); + block = _mm_aesenc_si128(block, keys[6]); + block = _mm_aesenc_si128(block, keys[7]); + block = _mm_aesenc_si128(block, keys[8]); + block = _mm_aesenc_si128(block, keys[9]); + block = _mm_aesenc_si128(block, keys[10]); + block = _mm_aesenc_si128(block, keys[11]); + block = _mm_aesenc_si128(block, keys[12]); + block = _mm_aesenc_si128(block, keys[13]); + _mm_aesenclast_si128(block, keys[14]) + } + unsafe { aesni256_encrypt1(&self.encrypt_keys, block) } + } +} + +impl NewBlockCipher for Aes256 { + type KeySize = U32; + + #[inline] + fn new(key: &GenericArray<u8, U32>) -> Self { + let key = unsafe { &*(key as *const _ as *const [u8; 32]) }; + let (encrypt_keys, decrypt_keys) = expand::expand(key); + Self { + encrypt_keys, + decrypt_keys, + } + } +} + +impl BlockCipher for Aes256 { + type BlockSize = U16; + type ParBlocks = U8; +} + +impl BlockEncrypt for Aes256 { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + unsafe { + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let b = self.encrypt(b); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + let b = self.encrypt8(load8(blocks)); + store8(blocks, b); + } +} + +impl BlockDecrypt for Aes256 { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes256_decrypt1(block: &mut Block, keys: &RoundKeys) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + + b = _mm_xor_si128(b, keys[14]); + b = _mm_aesdec_si128(b, keys[13]); + b = _mm_aesdec_si128(b, keys[12]); + b = _mm_aesdec_si128(b, keys[11]); + b = _mm_aesdec_si128(b, keys[10]); + b = _mm_aesdec_si128(b, keys[9]); + b = _mm_aesdec_si128(b, keys[8]); + b = _mm_aesdec_si128(b, keys[7]); + b = _mm_aesdec_si128(b, keys[6]); + b = _mm_aesdec_si128(b, keys[5]); + b = _mm_aesdec_si128(b, keys[4]); + b = _mm_aesdec_si128(b, keys[3]); + b = _mm_aesdec_si128(b, keys[2]); + b = _mm_aesdec_si128(b, keys[1]); + b = _mm_aesdeclast_si128(b, keys[0]); + + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + + unsafe { aes256_decrypt1(block, &self.decrypt_keys) } + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes256_decrypt8(blocks: &mut ParBlocks, keys: &RoundKeys) { + let mut b = load8(blocks); + xor8(&mut b, keys[14]); + aesdec8(&mut b, keys[13]); + aesdec8(&mut b, keys[12]); + aesdec8(&mut b, keys[11]); + aesdec8(&mut b, keys[10]); + aesdec8(&mut b, keys[9]); + aesdec8(&mut b, keys[8]); + aesdec8(&mut b, keys[7]); + aesdec8(&mut b, keys[6]); + aesdec8(&mut b, keys[5]); + aesdec8(&mut b, keys[4]); + aesdec8(&mut b, keys[3]); + aesdec8(&mut b, keys[2]); + aesdec8(&mut b, keys[1]); + aesdeclast8(&mut b, keys[0]); + store8(blocks, b); + } + + unsafe { aes256_decrypt8(blocks, &self.decrypt_keys) } + } +} + +opaque_debug::implement!(Aes256); diff --git a/rust/vendor/aes/src/ni/aes256/expand.rs b/rust/vendor/aes/src/ni/aes256/expand.rs new file mode 100644 index 0000000..88b8558 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes256/expand.rs @@ -0,0 +1,89 @@ +use 
super::RoundKeys; +use crate::ni::arch::*; + +use core::mem; + +macro_rules! expand_round { + ($enc_keys:expr, $dec_keys:expr, $pos:expr, $round:expr) => { + let mut t1 = $enc_keys[$pos - 2]; + let mut t2; + let mut t3 = $enc_keys[$pos - 1]; + let mut t4; + + t2 = _mm_aeskeygenassist_si128(t3, $round); + t2 = _mm_shuffle_epi32(t2, 0xff); + t4 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t1 = _mm_xor_si128(t1, t2); + + $enc_keys[$pos] = t1; + $dec_keys[$pos] = _mm_aesimc_si128(t1); + + t4 = _mm_aeskeygenassist_si128(t1, 0x00); + t2 = _mm_shuffle_epi32(t4, 0xaa); + t4 = _mm_slli_si128(t3, 0x4); + t3 = _mm_xor_si128(t3, t4); + t4 = _mm_slli_si128(t4, 0x4); + t3 = _mm_xor_si128(t3, t4); + t4 = _mm_slli_si128(t4, 0x4); + t3 = _mm_xor_si128(t3, t4); + t3 = _mm_xor_si128(t3, t2); + + $enc_keys[$pos + 1] = t3; + $dec_keys[$pos + 1] = _mm_aesimc_si128(t3); + }; +} + +macro_rules! expand_round_last { + ($enc_keys:expr, $dec_keys:expr, $pos:expr, $round:expr) => { + let mut t1 = $enc_keys[$pos - 2]; + let mut t2; + let t3 = $enc_keys[$pos - 1]; + let mut t4; + + t2 = _mm_aeskeygenassist_si128(t3, $round); + t2 = _mm_shuffle_epi32(t2, 0xff); + t4 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t1 = _mm_xor_si128(t1, t2); + + $enc_keys[$pos] = t1; + $dec_keys[$pos] = t1; + }; +} + +#[inline(always)] +pub(super) fn expand(key: &[u8; 32]) -> (RoundKeys, RoundKeys) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + unsafe { + let mut enc_keys: RoundKeys = mem::zeroed(); + let mut dec_keys: RoundKeys = mem::zeroed(); + + let kp = key.as_ptr() as *const __m128i; + let k1 = _mm_loadu_si128(kp); + let k2 = _mm_loadu_si128(kp.offset(1)); + enc_keys[0] = k1; + dec_keys[0] = k1; + enc_keys[1] = k2; + dec_keys[1] = _mm_aesimc_si128(k2); + + expand_round!(enc_keys, dec_keys, 2, 0x01); + expand_round!(enc_keys, dec_keys, 4, 0x02); + expand_round!(enc_keys, dec_keys, 6, 0x04); + expand_round!(enc_keys, dec_keys, 8, 0x08); + expand_round!(enc_keys, dec_keys, 10, 0x10); + expand_round!(enc_keys, dec_keys, 12, 0x20); + expand_round_last!(enc_keys, dec_keys, 14, 0x40); + + (enc_keys, dec_keys) + } +} diff --git a/rust/vendor/aes/src/ni/aes256/test_expand.rs b/rust/vendor/aes/src/ni/aes256/test_expand.rs new file mode 100644 index 0000000..52e728f --- /dev/null +++ b/rust/vendor/aes/src/ni/aes256/test_expand.rs @@ -0,0 +1,103 @@ +use super::expand::expand; +use crate::ni::utils::check; + +#[test] +fn test() { + let enc_keys = expand(&[0x00; 32]).0; + check( + &enc_keys, + &[ + [0x0000000000000000, 0x0000000000000000], + [0x0000000000000000, 0x0000000000000000], + [0x6263636362636363, 0x6263636362636363], + [0xaafbfbfbaafbfbfb, 0xaafbfbfbaafbfbfb], + [0x6f6c6ccf0d0f0fac, 0x6f6c6ccf0d0f0fac], + [0x7d8d8d6ad7767691, 0x7d8d8d6ad7767691], + [0x5354edc15e5be26d, 0x31378ea23c38810e], + [0x968a81c141fcf750, 0x3c717a3aeb070cab], + [0x9eaa8f28c0f16d45, 0xf1c6e3e7cdfe62e9], + [0x2b312bdf6acddc8f, 0x56bca6b5bdbbaa1e], + [0x6406fd52a4f79017, 0x553173f098cf1119], + [0x6dbba90b07767584, 0x51cad331ec71792f], + [0xe7b0e89c4347788b, 0x16760b7b8eb91a62], + [0x74ed0ba1739b7e25, 0x2251ad14ce20d43b], + [0x10f80a1753bf729c, 0x45c979e7cb706385], + ], + ); + + let enc_keys = expand(&[0xff; 32]).0; + check( + &enc_keys, + &[ + 
[0xffffffffffffffff, 0xffffffffffffffff], + [0xffffffffffffffff, 0xffffffffffffffff], + [0xe8e9e9e917161616, 0xe8e9e9e917161616], + [0x0fb8b8b8f0474747, 0x0fb8b8b8f0474747], + [0x4a4949655d5f5f73, 0xb5b6b69aa2a0a08c], + [0x355858dcc51f1f9b, 0xcaa7a7233ae0e064], + [0xafa80ae5f2f75596, 0x4741e30ce5e14380], + [0xeca0421129bf5d8a, 0xe318faa9d9f81acd], + [0xe60ab7d014fde246, 0x53bc014ab65d42ca], + [0xa2ec6e658b5333ef, 0x684bc946b1b3d38b], + [0x9b6c8a188f91685e, 0xdc2d69146a702bde], + [0xa0bd9f782beeac97, 0x43a565d1f216b65a], + [0xfc22349173b35ccf, 0xaf9e35dbc5ee1e05], + [0x0695ed132d7b4184, 0x6ede24559cc8920f], + [0x546d424f27de1e80, 0x88402b5b4dae355e], + ], + ); + + let enc_keys = expand(&[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, + 0x1e, 0x1f, + ]) + .0; + check( + &enc_keys, + &[ + [0x0001020304050607, 0x08090a0b0c0d0e0f], + [0x1011121314151617, 0x18191a1b1c1d1e1f], + [0xa573c29fa176c498, 0xa97fce93a572c09c], + [0x1651a8cd0244beda, 0x1a5da4c10640bade], + [0xae87dff00ff11b68, 0xa68ed5fb03fc1567], + [0x6de1f1486fa54f92, 0x75f8eb5373b8518d], + [0xc656827fc9a79917, 0x6f294cec6cd5598b], + [0x3de23a75524775e7, 0x27bf9eb45407cf39], + [0x0bdc905fc27b0948, 0xad5245a4c1871c2f], + [0x45f5a66017b2d387, 0x300d4d33640a820a], + [0x7ccff71cbeb4fe54, 0x13e6bbf0d261a7df], + [0xf01afafee7a82979, 0xd7a5644ab3afe640], + [0x2541fe719bf50025, 0x8813bbd55a721c0a], + [0x4e5a6699a9f24fe0, 0x7e572baacdf8cdea], + [0x24fc79ccbf0979e9, 0x371ac23c6d68de36], + ], + ); + + let enc_keys = expand(&[ + 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, + 0x81, 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, + 0xdf, 0xf4, + ]) + .0; + check( + &enc_keys, + &[ + [0x603deb1015ca71be, 0x2b73aef0857d7781], + [0x1f352c073b6108d7, 0x2d9810a30914dff4], + [0x9ba354118e6925af, 0xa51a8b5f2067fcde], + [0xa8b09c1a93d194cd, 0xbe49846eb75d5b9a], + [0xd59aecb85bf3c917, 0xfee94248de8ebe96], + [0xb5a9328a2678a647, 0x983122292f6c79b3], + [0x812c81addadf48ba, 0x24360af2fab8b464], + [0x98c5bfc9bebd198e, 0x268c3ba709e04214], + [0x68007bacb2df3316, 0x96e939e46c518d80], + [0xc814e20476a9fb8a, 0x5025c02d59c58239], + [0xde1369676ccc5a71, 0xfa2563959674ee15], + [0x5886ca5d2e2f31d7, 0x7e0af1fa27cf73c3], + [0x749c47ab18501dda, 0xe2757e4f7401905a], + [0xcafaaae3e4d59b34, 0x9adf6acebd10190d], + [0xfe4890d1e6188d0b, 0x046df344706c631e], + ], + ); +} diff --git a/rust/vendor/aes/src/ni/ctr.rs b/rust/vendor/aes/src/ni/ctr.rs new file mode 100644 index 0000000..cb78910 --- /dev/null +++ b/rust/vendor/aes/src/ni/ctr.rs @@ -0,0 +1,229 @@ +//! AES in counter mode (a.k.a. 
AES-CTR) + +// TODO(tarcieri): support generic CTR API + +#![allow(clippy::unreadable_literal)] + +use super::arch::*; +use core::mem; + +use super::{Aes128, Aes192, Aes256}; +use crate::BLOCK_SIZE; +use cipher::{ + consts::U16, + errors::{LoopError, OverflowError}, + generic_array::GenericArray, + BlockCipher, FromBlockCipher, SeekNum, StreamCipher, StreamCipherSeek, +}; + +const PAR_BLOCKS: usize = 8; +const PAR_BLOCKS_SIZE: usize = PAR_BLOCKS * BLOCK_SIZE; + +#[inline(always)] +pub fn xor(buf: &mut [u8], key: &[u8]) { + debug_assert_eq!(buf.len(), key.len()); + for (a, b) in buf.iter_mut().zip(key) { + *a ^= *b; + } +} + +#[inline(always)] +fn xor_block8(buf: &mut [u8], ctr: [__m128i; 8]) { + debug_assert_eq!(buf.len(), PAR_BLOCKS_SIZE); + + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + unsafe { + // compiler should unroll this loop + for i in 0..8 { + let ptr = buf.as_mut_ptr().offset(16 * i) as *mut __m128i; + let data = _mm_loadu_si128(ptr); + let data = _mm_xor_si128(data, ctr[i as usize]); + _mm_storeu_si128(ptr, data); + } + } +} + +#[inline(always)] +fn swap_bytes(v: __m128i) -> __m128i { + unsafe { + let mask = _mm_set_epi64x(0x08090a0b0c0d0e0f, 0x0001020304050607); + _mm_shuffle_epi8(v, mask) + } +} + +#[inline(always)] +fn inc_be(v: __m128i) -> __m128i { + unsafe { _mm_add_epi64(v, _mm_set_epi64x(1, 0)) } +} + +#[inline(always)] +fn load(val: &GenericArray<u8, U16>) -> __m128i { + // Safety: `loadu` supports unaligned loads + #[allow(clippy::cast_ptr_alignment)] + unsafe { + _mm_loadu_si128(val.as_ptr() as *const __m128i) + } +} + +macro_rules! impl_ctr { + ($name:ident, $cipher:ty, $doc:expr) => { + #[doc=$doc] + #[derive(Clone)] + #[cfg_attr(docsrs, doc(cfg(feature = "ctr")))] + pub struct $name { + nonce: __m128i, + ctr: __m128i, + cipher: $cipher, + block: [u8; BLOCK_SIZE], + pos: u8, + } + + impl $name { + #[inline(always)] + fn gen_block(&mut self) { + let block = self.cipher.encrypt(swap_bytes(self.ctr)); + self.block = unsafe { mem::transmute(block) } + } + + #[inline(always)] + fn next_block(&mut self) -> __m128i { + let block = swap_bytes(self.ctr); + self.ctr = inc_be(self.ctr); + self.cipher.encrypt(block) + } + + #[inline(always)] + fn next_block8(&mut self) -> [__m128i; 8] { + let mut ctr = self.ctr; + let mut block8: [__m128i; 8] = unsafe { mem::zeroed() }; + for i in 0..8 { + block8[i] = swap_bytes(ctr); + ctr = inc_be(ctr); + } + self.ctr = ctr; + + self.cipher.encrypt8(block8) + } + + #[inline(always)] + fn get_u64_ctr(&self) -> u64 { + let (ctr, nonce) = unsafe { + ( + mem::transmute::<__m128i, [u64; 2]>(self.ctr)[1], + mem::transmute::<__m128i, [u64; 2]>(self.nonce)[1], + ) + }; + ctr.wrapping_sub(nonce) + } + + /// Check if provided data will not overflow counter + #[inline(always)] + fn check_data_len(&self, data: &[u8]) -> Result<(), LoopError> { + let bs = BLOCK_SIZE; + let leftover_bytes = bs - self.pos as usize; + if data.len() < leftover_bytes { + return Ok(()); + } + let blocks = 1 + (data.len() - leftover_bytes) / bs; + self.get_u64_ctr() + .checked_add(blocks as u64) + .ok_or(LoopError) + .map(|_| ()) + } + } + + impl FromBlockCipher for $name { + type BlockCipher = $cipher; + type NonceSize = <$cipher as BlockCipher>::BlockSize; + + fn from_block_cipher( + cipher: $cipher, + nonce: &GenericArray<u8, Self::NonceSize>, + ) -> Self { + let nonce = swap_bytes(load(nonce)); + Self { + nonce, + ctr: nonce, + cipher, + block: [0u8; BLOCK_SIZE], + pos: 0, + } + } + } + + impl StreamCipher for $name { + 
#[inline] + fn try_apply_keystream(&mut self, mut data: &mut [u8]) -> Result<(), LoopError> { + self.check_data_len(data)?; + let bs = BLOCK_SIZE; + let pos = self.pos as usize; + debug_assert!(bs > pos); + + if pos != 0 { + if data.len() < bs - pos { + let n = pos + data.len(); + xor(data, &self.block[pos..n]); + self.pos = n as u8; + return Ok(()); + } else { + let (l, r) = data.split_at_mut(bs - pos); + data = r; + xor(l, &self.block[pos..]); + self.ctr = inc_be(self.ctr); + } + } + + let mut chunks = data.chunks_exact_mut(PAR_BLOCKS_SIZE); + for chunk in &mut chunks { + xor_block8(chunk, self.next_block8()); + } + data = chunks.into_remainder(); + + let mut chunks = data.chunks_exact_mut(bs); + for chunk in &mut chunks { + let block = self.next_block(); + + unsafe { + let t = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + let res = _mm_xor_si128(block, t); + _mm_storeu_si128(chunk.as_mut_ptr() as *mut __m128i, res); + } + } + + let rem = chunks.into_remainder(); + self.pos = rem.len() as u8; + if !rem.is_empty() { + self.gen_block(); + for (a, b) in rem.iter_mut().zip(&self.block) { + *a ^= *b; + } + } + + Ok(()) + } + } + + impl StreamCipherSeek for $name { + fn try_current_pos<T: SeekNum>(&self) -> Result<T, OverflowError> { + T::from_block_byte(self.get_u64_ctr(), self.pos, BLOCK_SIZE as u8) + } + + fn try_seek<T: SeekNum>(&mut self, pos: T) -> Result<(), LoopError> { + let res: (u64, u8) = pos.to_block_byte(BLOCK_SIZE as u8)?; + self.ctr = unsafe { _mm_add_epi64(self.nonce, _mm_set_epi64x(res.0 as i64, 0)) }; + self.pos = res.1; + if self.pos != 0 { + self.gen_block() + } + Ok(()) + } + } + + opaque_debug::implement!($name); + }; +} + +impl_ctr!(Aes128Ctr, Aes128, "AES-128 in CTR mode"); +impl_ctr!(Aes192Ctr, Aes192, "AES-192 in CTR mode"); +impl_ctr!(Aes256Ctr, Aes256, "AES-256 in CTR mode"); diff --git a/rust/vendor/aes/src/ni/hazmat.rs b/rust/vendor/aes/src/ni/hazmat.rs new file mode 100644 index 0000000..5188ad7 --- /dev/null +++ b/rust/vendor/aes/src/ni/hazmat.rs @@ -0,0 +1,86 @@ +//! Low-level "hazmat" AES functions: AES-NI support. +//! +//! Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256` +//! implementations in this crate, but instead provides raw AES-NI accelerated +//! access to the AES round function gated under the `hazmat` crate feature. + +use super::{ + arch::*, + utils::{load8, store8}, +}; +use crate::{Block, ParBlocks}; + +/// AES cipher (encrypt) round function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn cipher_round(block: &mut Block, round_key: &Block) { + // Safety: `loadu` and `storeu` support unaligned access + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let k = _mm_loadu_si128(round_key.as_ptr() as *const __m128i); + let out = _mm_aesenc_si128(b, k); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out); +} + +/// AES cipher (encrypt) round function: parallel version. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + let xmm_keys = load8(round_keys); + let mut xmm_blocks = load8(blocks); + + for i in 0..8 { + xmm_blocks[i] = _mm_aesenc_si128(xmm_blocks[i], xmm_keys[i]); + } + + store8(blocks, xmm_blocks); +} + +/// AES cipher (encrypt) round function. 
+#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { + // Safety: `loadu` and `storeu` support unaligned access + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let k = _mm_loadu_si128(round_key.as_ptr() as *const __m128i); + let out = _mm_aesdec_si128(b, k); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out); +} + +/// AES cipher (encrypt) round function: parallel version. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + let xmm_keys = load8(round_keys); + let mut xmm_blocks = load8(blocks); + + for i in 0..8 { + xmm_blocks[i] = _mm_aesdec_si128(xmm_blocks[i], xmm_keys[i]); + } + + store8(blocks, xmm_blocks); +} + +/// AES mix columns function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn mix_columns(block: &mut Block) { + // Safety: `loadu` and `storeu` support unaligned access + let mut state = _mm_loadu_si128(block.as_ptr() as *const __m128i); + + // Emulate mix columns by performing three inverse mix columns operations + state = _mm_aesimc_si128(state); + state = _mm_aesimc_si128(state); + state = _mm_aesimc_si128(state); + + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, state); +} + +/// AES inverse mix columns function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn inv_mix_columns(block: &mut Block) { + // Safety: `loadu` and `storeu` support unaligned access + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let out = _mm_aesimc_si128(b); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out); +} diff --git a/rust/vendor/aes/src/ni/utils.rs b/rust/vendor/aes/src/ni/utils.rs new file mode 100644 index 0000000..1fc3403 --- /dev/null +++ b/rust/vendor/aes/src/ni/utils.rs @@ -0,0 +1,90 @@ +//! 
Utility functions + +// TODO(tarcieri): check performance impact / generated assembly changes +#![allow(clippy::needless_range_loop)] + +use super::arch::*; +use crate::ParBlocks; + +pub type U128x8 = [__m128i; 8]; + +#[cfg(test)] +pub(crate) fn check(a: &[__m128i], b: &[[u64; 2]]) { + for (v1, v2) in a.iter().zip(b) { + let t1: [u64; 2] = unsafe { core::mem::transmute(*v1) }; + let t2 = [v2[0].to_be(), v2[1].to_be()]; + assert_eq!(t1, t2); + } +} + +#[inline(always)] +pub(crate) fn load8(blocks: &ParBlocks) -> U128x8 { + unsafe { + [ + _mm_loadu_si128(blocks[0].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[1].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[2].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[3].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[4].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[5].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[6].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[7].as_ptr() as *const __m128i), + ] + } +} + +#[inline(always)] +pub(crate) fn store8(blocks: &mut ParBlocks, b: U128x8) { + unsafe { + _mm_storeu_si128(blocks[0].as_mut_ptr() as *mut __m128i, b[0]); + _mm_storeu_si128(blocks[1].as_mut_ptr() as *mut __m128i, b[1]); + _mm_storeu_si128(blocks[2].as_mut_ptr() as *mut __m128i, b[2]); + _mm_storeu_si128(blocks[3].as_mut_ptr() as *mut __m128i, b[3]); + _mm_storeu_si128(blocks[4].as_mut_ptr() as *mut __m128i, b[4]); + _mm_storeu_si128(blocks[5].as_mut_ptr() as *mut __m128i, b[5]); + _mm_storeu_si128(blocks[6].as_mut_ptr() as *mut __m128i, b[6]); + _mm_storeu_si128(blocks[7].as_mut_ptr() as *mut __m128i, b[7]); + } +} + +#[inline(always)] +pub(crate) fn xor8(b: &mut U128x8, key: __m128i) { + unsafe { + b[0] = _mm_xor_si128(b[0], key); + b[1] = _mm_xor_si128(b[1], key); + b[2] = _mm_xor_si128(b[2], key); + b[3] = _mm_xor_si128(b[3], key); + b[4] = _mm_xor_si128(b[4], key); + b[5] = _mm_xor_si128(b[5], key); + b[6] = _mm_xor_si128(b[6], key); + b[7] = _mm_xor_si128(b[7], key); + } +} + +#[inline(always)] +pub(crate) fn aesenc8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesenc_si128(buffer[i], key) }; + } +} + +#[inline(always)] +pub(crate) fn aesenclast8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesenclast_si128(buffer[i], key) }; + } +} + +#[inline(always)] +pub(crate) fn aesdec8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesdec_si128(buffer[i], key) }; + } +} + +#[inline(always)] +pub(crate) fn aesdeclast8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesdeclast_si128(buffer[i], key) }; + } +} diff --git a/rust/vendor/aes/src/soft.rs b/rust/vendor/aes/src/soft.rs new file mode 100644 index 0000000..1b51d22 --- /dev/null +++ b/rust/vendor/aes/src/soft.rs @@ -0,0 +1,127 @@ +//! AES block cipher constant-time implementation. +//! +//! The implementation uses a technique called [fixslicing][1], an improved +//! form of bitslicing which represents ciphers in a way which enables +//! very efficient constant-time implementations in software. +//! +//! 
[1]: https://eprint.iacr.org/2020/1123.pdf + +#![deny(unsafe_code)] + +#[cfg_attr(not(target_pointer_width = "64"), path = "soft/fixslice32.rs")] +#[cfg_attr(target_pointer_width = "64", path = "soft/fixslice64.rs")] +pub(crate) mod fixslice; + +#[cfg(feature = "ctr")] +mod ctr; + +#[cfg(feature = "ctr")] +pub use self::ctr::{Aes128Ctr, Aes192Ctr, Aes256Ctr}; + +use crate::{Block, ParBlocks}; +use cipher::{ + consts::{U16, U24, U32, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; +use fixslice::{FixsliceKeys128, FixsliceKeys192, FixsliceKeys256, FIXSLICE_BLOCKS}; + +macro_rules! define_aes_impl { + ( + $name:ident, + $key_size:ty, + $fixslice_keys:ty, + $fixslice_key_schedule:path, + $fixslice_decrypt:path, + $fixslice_encrypt:path, + $doc:expr + ) => { + #[doc=$doc] + #[derive(Clone)] + pub struct $name { + keys: $fixslice_keys, + } + + impl NewBlockCipher for $name { + type KeySize = $key_size; + + #[inline] + fn new(key: &GenericArray<u8, $key_size>) -> Self { + Self { + keys: $fixslice_key_schedule(key), + } + } + } + + impl BlockCipher for $name { + type BlockSize = U16; + type ParBlocks = U8; + } + + impl BlockEncrypt for $name { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + let mut blocks = [Block::default(); FIXSLICE_BLOCKS]; + blocks[0].copy_from_slice(block); + $fixslice_encrypt(&self.keys, &mut blocks); + block.copy_from_slice(&blocks[0]); + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + for chunk in blocks.chunks_mut(FIXSLICE_BLOCKS) { + $fixslice_encrypt(&self.keys, chunk); + } + } + } + + impl BlockDecrypt for $name { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + let mut blocks = [Block::default(); FIXSLICE_BLOCKS]; + blocks[0].copy_from_slice(block); + $fixslice_decrypt(&self.keys, &mut blocks); + block.copy_from_slice(&blocks[0]); + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + for chunk in blocks.chunks_mut(FIXSLICE_BLOCKS) { + $fixslice_decrypt(&self.keys, chunk); + } + } + } + + opaque_debug::implement!($name); + }; +} + +define_aes_impl!( + Aes128, + U16, + FixsliceKeys128, + fixslice::aes128_key_schedule, + fixslice::aes128_decrypt, + fixslice::aes128_encrypt, + "AES-128 block cipher instance" +); + +define_aes_impl!( + Aes192, + U24, + FixsliceKeys192, + fixslice::aes192_key_schedule, + fixslice::aes192_decrypt, + fixslice::aes192_encrypt, + "AES-192 block cipher instance" +); + +define_aes_impl!( + Aes256, + U32, + FixsliceKeys256, + fixslice::aes256_key_schedule, + fixslice::aes256_decrypt, + fixslice::aes256_encrypt, + "AES-256 block cipher instance" +); diff --git a/rust/vendor/aes/src/soft/ctr.rs b/rust/vendor/aes/src/soft/ctr.rs new file mode 100644 index 0000000..a288ab1 --- /dev/null +++ b/rust/vendor/aes/src/soft/ctr.rs @@ -0,0 +1,17 @@ +//! AES in counter mode (a.k.a. 
AES-CTR) + +// TODO(tarcieri): support generic CTR API + +use super::{Aes128, Aes192, Aes256}; + +/// AES-128 in CTR mode +#[cfg_attr(docsrs, doc(cfg(feature = "ctr")))] +pub type Aes128Ctr = ::ctr::Ctr64BE<Aes128>; + +/// AES-192 in CTR mode +#[cfg_attr(docsrs, doc(cfg(feature = "ctr")))] +pub type Aes192Ctr = ::ctr::Ctr64BE<Aes192>; + +/// AES-256 in CTR mode +#[cfg_attr(docsrs, doc(cfg(feature = "ctr")))] +pub type Aes256Ctr = ::ctr::Ctr64BE<Aes256>; diff --git a/rust/vendor/aes/src/soft/fixslice32.rs b/rust/vendor/aes/src/soft/fixslice32.rs new file mode 100644 index 0000000..5dc4834 --- /dev/null +++ b/rust/vendor/aes/src/soft/fixslice32.rs @@ -0,0 +1,1485 @@ +//! Fixsliced implementations of AES-128, AES-192 and AES-256 (32-bit) +//! adapted from the C implementation +//! +//! All implementations are fully bitsliced and do not rely on any +//! Look-Up Table (LUT). +//! +//! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details. +//! +//! # Author (original C code) +//! +//! Alexandre Adomnicai, Nanyang Technological University, Singapore +//! <alexandre.adomnicai@ntu.edu.sg> +//! +//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission. + +#![allow(clippy::unreadable_literal)] + +use crate::Block; +use cipher::{ + consts::{U16, U24, U32}, + generic_array::GenericArray, +}; +use core::convert::TryInto; + +/// AES block batch size for this implementation +pub(crate) const FIXSLICE_BLOCKS: usize = 2; + +/// AES-128 round keys +pub(crate) type FixsliceKeys128 = [u32; 88]; + +/// AES-192 round keys +pub(crate) type FixsliceKeys192 = [u32; 104]; + +/// AES-256 round keys +pub(crate) type FixsliceKeys256 = [u32; 120]; + +/// 256-bit internal state +pub(crate) type State = [u32; 8]; + +/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation. +pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys128 { + let mut rkeys = [0u32; 88]; + + bitslice(&mut rkeys[..8], key, key); + + let mut rk_off = 0; + for rcon in 0..10 { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + if rcon < 8 { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + } else { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4); + } + + xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3)); + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..88).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (8..72).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[72..80]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..11 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation. 
+pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys192 { + let mut rkeys = [0u32; 104]; + let mut tmp = [0u32; 8]; + + bitslice(&mut rkeys[..8], &key[..16], &key[..16]); + bitslice(&mut tmp, &key[8..], &key[8..]); + + let mut rcon = 0; + let mut rk_off = 8; + + loop { + for i in 0..8 { + rkeys[rk_off + i] = + (0x0f0f0f0f & (tmp[i] >> 4)) | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4)); + } + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = rkeys[rk_off + i]; + ti ^= 0x30303030 & ror(tmp[i], ror_distance(1, 1)); + ti ^= 0xc0c0c0c0 & (ti << 2); + tmp[i] = ti; + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + for i in 0..8 { + let ui = tmp[i]; + let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) | (0xf0f0f0f0 & (ui << 4)); + ti ^= 0x03030303 & (ui >> 6); + tmp[i] = + ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6)); + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) + | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4)); + ti ^= 0x03030303 & ror(tmp[i], ror_distance(1, 3)); + rkeys[rk_off + i] = + ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6)); + } + rk_off += 8; + + if rcon >= 8 { + break; + } + + for i in 0..8 { + let ui = rkeys[(rk_off - 8) + i]; + let mut ti = rkeys[(rk_off - 16) + i]; + ti ^= 0x30303030 & (ui >> 2); + ti ^= 0xc0c0c0c0 & (ti << 2); + tmp[i] = ti; + } + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..104).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (0..96).step_by(32) { + inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]); + inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]); + } + } + + // Account for NOTs removed from sub_bytes + for i in 1..13 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation. 
+pub(crate) fn aes256_key_schedule(key: &GenericArray<u8, U32>) -> FixsliceKeys256 { + let mut rkeys = [0u32; 120]; + + bitslice(&mut rkeys[..8], &key[..16], &key[..16]); + bitslice(&mut rkeys[8..16], &key[16..], &key[16..]); + + let mut rk_off = 8; + + let mut rcon = 0; + loop { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3)); + rcon += 1; + + if rcon == 7 { + break; + } + + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3)); + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..120).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (8..104).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[104..112]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..15 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[80..]); + inv_sub_bytes(&mut state); + + #[cfg(not(feature = "compact"))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 72; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + + if rk_off == 80 { + break; + } + + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(feature = "compact"))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[80..]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[96..]); + inv_sub_bytes(&mut state); + + let mut rk_off = 88; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + if rk_off == 96 { + break; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[96..]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[112..]); + inv_sub_bytes(&mut state); + + #[cfg(not(feature = "compact"))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 104; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + + if rk_off == 112 { + break; + } + + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(feature = "compact"))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[112..]); + + inv_bitslice(&state, blocks); +} + +/// Note that the 4 bitwise NOT (^= 0xffffffff) are accounted for here so that it is a true +/// inverse of 'sub_bytes'. +fn inv_sub_bytes(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let t23 = u0 ^ u3; + let t8 = u1 ^ t23; + let m2 = t23 & t8; + let t4 = u4 ^ t8; + let t22 = u1 ^ u3; + let t2 = u0 ^ u1; + let t1 = u3 ^ u4; + // t23 -> stack + let t9 = u7 ^ t1; + // t8 -> stack + let m7 = t22 & t9; + // t9 -> stack + let t24 = u4 ^ u7; + // m7 -> stack + let t10 = t2 ^ t24; + // u4 -> stack + let m14 = t2 & t10; + let r5 = u6 ^ u7; + // m2 -> stack + let t3 = t1 ^ r5; + // t2 -> stack + let t13 = t2 ^ r5; + let t19 = t22 ^ r5; + // t3 -> stack + let t17 = u2 ^ t19; + // t4 -> stack + let t25 = u2 ^ t1; + let r13 = u1 ^ u6; + // t25 -> stack + let t20 = t24 ^ r13; + // t17 -> stack + let m9 = t20 & t17; + // t20 -> stack + let r17 = u2 ^ u5; + // t22 -> stack + let t6 = t22 ^ r17; + // t13 -> stack + let m1 = t13 & t6; + let y5 = u0 ^ r17; + let m4 = t19 & y5; + let m5 = m4 ^ m1; + let m17 = m5 ^ t24; + let r18 = u5 ^ u6; + let t27 = t1 ^ r18; + let t15 = t10 ^ t27; + // t6 -> stack + let m11 = t1 & t15; + let m15 = m14 ^ m11; + let m21 = m17 ^ m15; + // t1 -> stack + // t4 <- stack + let m12 = t4 & t27; + let m13 = m12 ^ m11; + let t14 = t10 ^ r18; + let m3 = t14 ^ m1; + // m2 <- stack + let m16 = m3 ^ m2; + let m20 = m16 ^ m13; + // u4 <- stack + let r19 = u2 ^ u4; + let t16 = r13 ^ r19; + // t3 <- stack + let t26 = t3 ^ t16; + let m6 = t3 & t16; + let m8 = t26 ^ m6; + // t10 -> stack + // m7 <- stack + let m18 = m8 ^ m7; + let m22 = m18 ^ m13; + let m25 = m22 & m20; + let m26 = m21 ^ m25; + let m10 = m9 ^ m6; + let m19 = m10 ^ m15; + // t25 <- stack + let m23 = m19 ^ t25; + let m28 = m23 ^ m25; + let m24 = m22 ^ m23; + let m30 = m26 & m24; + let m39 = m23 ^ m30; + let m48 = m39 & y5; + let m57 = m39 & t19; + // m48 -> stack + let m36 = m24 ^ m25; + let m31 = m20 & m23; + let m27 = m20 ^ m21; + let m32 = m27 & m31; + let m29 = m28 & m27; + let m37 = m21 ^ m29; + // m39 -> stack + let m42 = m37 ^ m39; + let m52 = m42 
& t15; + // t27 -> stack + // t1 <- stack + let m61 = m42 & t1; + let p0 = m52 ^ m61; + let p16 = m57 ^ m61; + // m57 -> stack + // t20 <- stack + let m60 = m37 & t20; + // p16 -> stack + // t17 <- stack + let m51 = m37 & t17; + let m33 = m27 ^ m25; + let m38 = m32 ^ m33; + let m43 = m37 ^ m38; + let m49 = m43 & t16; + let p6 = m49 ^ m60; + let p13 = m49 ^ m51; + let m58 = m43 & t3; + // t9 <- stack + let m50 = m38 & t9; + // t22 <- stack + let m59 = m38 & t22; + // p6 -> stack + let p1 = m58 ^ m59; + let p7 = p0 ^ p1; + let m34 = m21 & m22; + let m35 = m24 & m34; + let m40 = m35 ^ m36; + let m41 = m38 ^ m40; + let m45 = m42 ^ m41; + // t27 <- stack + let m53 = m45 & t27; + let p8 = m50 ^ m53; + let p23 = p7 ^ p8; + // t4 <- stack + let m62 = m45 & t4; + let p14 = m49 ^ m62; + let s6 = p14 ^ p23; + // t10 <- stack + let m54 = m41 & t10; + let p2 = m54 ^ m62; + let p22 = p2 ^ p7; + let s0 = p13 ^ p22; + let p17 = m58 ^ p2; + let p15 = m54 ^ m59; + // t2 <- stack + let m63 = m41 & t2; + // m39 <- stack + let m44 = m39 ^ m40; + // p17 -> stack + // t6 <- stack + let m46 = m44 & t6; + let p5 = m46 ^ m51; + // p23 -> stack + let p18 = m63 ^ p5; + let p24 = p5 ^ p7; + // m48 <- stack + let p12 = m46 ^ m48; + let s3 = p12 ^ p22; + // t13 <- stack + let m55 = m44 & t13; + let p9 = m55 ^ m63; + // p16 <- stack + let s7 = p9 ^ p16; + // t8 <- stack + let m47 = m40 & t8; + let p3 = m47 ^ m50; + let p19 = p2 ^ p3; + let s5 = p19 ^ p24; + let p11 = p0 ^ p3; + let p26 = p9 ^ p11; + // t23 <- stack + let m56 = m40 & t23; + let p4 = m48 ^ m56; + // p6 <- stack + let p20 = p4 ^ p6; + let p29 = p15 ^ p20; + let s1 = p26 ^ p29; + // m57 <- stack + let p10 = m57 ^ p4; + let p27 = p10 ^ p18; + // p23 <- stack + let s4 = p23 ^ p27; + let p25 = p6 ^ p10; + let p28 = p11 ^ p25; + // p17 <- stack + let s2 = p17 ^ p28; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik. +/// +/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt> +/// +/// Note that the 4 bitwise NOT (^= 0xffffffff) are moved to the key schedule. 
+fn sub_bytes(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let y14 = u3 ^ u5; + let y13 = u0 ^ u6; + let y12 = y13 ^ y14; + let t1 = u4 ^ y12; + let y15 = t1 ^ u5; + let t2 = y12 & y15; + let y6 = y15 ^ u7; + let y20 = t1 ^ u1; + // y12 -> stack + let y9 = u0 ^ u3; + // y20 -> stack + let y11 = y20 ^ y9; + // y9 -> stack + let t12 = y9 & y11; + // y6 -> stack + let y7 = u7 ^ y11; + let y8 = u0 ^ u5; + let t0 = u1 ^ u2; + let y10 = y15 ^ t0; + // y15 -> stack + let y17 = y10 ^ y11; + // y14 -> stack + let t13 = y14 & y17; + let t14 = t13 ^ t12; + // y17 -> stack + let y19 = y10 ^ y8; + // y10 -> stack + let t15 = y8 & y10; + let t16 = t15 ^ t12; + let y16 = t0 ^ y11; + // y11 -> stack + let y21 = y13 ^ y16; + // y13 -> stack + let t7 = y13 & y16; + // y16 -> stack + let y18 = u0 ^ y16; + let y1 = t0 ^ u7; + let y4 = y1 ^ u3; + // u7 -> stack + let t5 = y4 & u7; + let t6 = t5 ^ t2; + let t18 = t6 ^ t16; + let t22 = t18 ^ y19; + let y2 = y1 ^ u0; + let t10 = y2 & y7; + let t11 = t10 ^ t7; + let t20 = t11 ^ t16; + let t24 = t20 ^ y18; + let y5 = y1 ^ u6; + let t8 = y5 & y1; + let t9 = t8 ^ t7; + let t19 = t9 ^ t14; + let t23 = t19 ^ y21; + let y3 = y5 ^ y8; + // y6 <- stack + let t3 = y3 & y6; + let t4 = t3 ^ t2; + // y20 <- stack + let t17 = t4 ^ y20; + let t21 = t17 ^ t14; + let t26 = t21 & t23; + let t27 = t24 ^ t26; + let t31 = t22 ^ t26; + let t25 = t21 ^ t22; + // y4 -> stack + let t28 = t25 & t27; + let t29 = t28 ^ t22; + let z14 = t29 & y2; + let z5 = t29 & y7; + let t30 = t23 ^ t24; + let t32 = t31 & t30; + let t33 = t32 ^ t24; + let t35 = t27 ^ t33; + let t36 = t24 & t35; + let t38 = t27 ^ t36; + let t39 = t29 & t38; + let t40 = t25 ^ t39; + let t43 = t29 ^ t40; + // y16 <- stack + let z3 = t43 & y16; + let tc12 = z3 ^ z5; + // tc12 -> stack + // y13 <- stack + let z12 = t43 & y13; + let z13 = t40 & y5; + let z4 = t40 & y1; + let tc6 = z3 ^ z4; + let t34 = t23 ^ t33; + let t37 = t36 ^ t34; + let t41 = t40 ^ t37; + // y10 <- stack + let z8 = t41 & y10; + let z17 = t41 & y8; + let t44 = t33 ^ t37; + // y15 <- stack + let z0 = t44 & y15; + // z17 -> stack + // y12 <- stack + let z9 = t44 & y12; + let z10 = t37 & y3; + let z1 = t37 & y6; + let tc5 = z1 ^ z0; + let tc11 = tc6 ^ tc5; + // y4 <- stack + let z11 = t33 & y4; + let t42 = t29 ^ t33; + let t45 = t42 ^ t41; + // y17 <- stack + let z7 = t45 & y17; + let tc8 = z7 ^ tc6; + // y14 <- stack + let z16 = t45 & y14; + // y11 <- stack + let z6 = t42 & y11; + let tc16 = z6 ^ tc8; + // z14 -> stack + // y9 <- stack + let z15 = t42 & y9; + let tc20 = z15 ^ tc16; + let tc1 = z15 ^ z16; + let tc2 = z10 ^ tc1; + let tc21 = tc2 ^ z11; + let tc3 = z9 ^ tc2; + let s0 = tc3 ^ tc16; + let s3 = tc3 ^ tc11; + let s1 = s3 ^ tc16; + let tc13 = z13 ^ tc1; + // u7 <- stack + let z2 = t33 & u7; + let tc4 = z0 ^ z2; + let tc7 = z12 ^ tc4; + let tc9 = z8 ^ tc7; + let tc10 = tc8 ^ tc9; + // z14 <- stack + let tc17 = z14 ^ tc10; + let s5 = tc21 ^ tc17; + let tc26 = tc17 ^ tc20; + // z17 <- stack + let s2 = tc26 ^ z17; + // tc12 <- stack + let tc14 = tc4 ^ tc12; + let tc18 = tc13 ^ tc14; + let s6 = tc10 ^ tc18; + let s7 = z12 ^ tc18; + let s4 = tc14 ^ s3; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + 
state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// NOT operations that are omitted in S-box +#[inline] +fn sub_bytes_nots(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + state[0] ^= 0xffffffff; + state[1] ^= 0xffffffff; + state[5] ^= 0xffffffff; + state[6] ^= 0xffffffff; +} + +/// Computation of the MixColumns transformation in the fixsliced representation, with different +/// rotations used according to the round number mod 4. +/// +/// Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm. +macro_rules! define_mix_columns { + ( + $name:ident, + $name_inv:ident, + $first_rotate:path, + $second_rotate:path + ) => { + #[rustfmt::skip] + fn $name(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + state[0] = b0 ^ c7 ^ $second_rotate(c0); + state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1); + state[2] = b2 ^ c1 ^ $second_rotate(c2); + state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3); + state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4); + state[5] = b5 ^ c4 ^ $second_rotate(c5); + state[6] = b6 ^ c5 ^ $second_rotate(c6); + state[7] = b7 ^ c6 ^ $second_rotate(c7); + } + + #[rustfmt::skip] + fn $name_inv(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + let (d0, d1, d2, d3, d4, d5, d6, d7) = ( + a0 ^ c7, + a1 ^ c0 ^ c7, + a2 ^ c1, + a3 ^ c2 ^ c7, + a4 ^ c3 ^ c7, + a5 ^ c4, + a6 ^ c5, + a7 ^ c6, + ); + let (e0, e1, e2, e3, e4, e5, e6, e7) = ( + c0 ^ d6, + c1 ^ d6 ^ d7, + c2 ^ d0 ^ d7, + c3 ^ d1 ^ d6, + c4 ^ d2 ^ d6 ^ d7, + c5 ^ d3 ^ d7, + c6 ^ d4, + c7 ^ d5, + ); + state[0] = d0 ^ e0 ^ $second_rotate(e0); + state[1] = d1 ^ e1 ^ $second_rotate(e1); + state[2] = d2 ^ e2 ^ $second_rotate(e2); + state[3] = d3 ^ e3 ^ $second_rotate(e3); + state[4] = d4 ^ e4 ^ $second_rotate(e4); + state[5] = d5 ^ e5 ^ $second_rotate(e5); + state[6] = d6 ^ e6 ^ $second_rotate(e6); + state[7] = d7 ^ e7 ^ $second_rotate(e7); + } + } +} + +define_mix_columns!( + mix_columns_0, + inv_mix_columns_0, + rotate_rows_1, + rotate_rows_2 +); + +define_mix_columns!( + mix_columns_1, + inv_mix_columns_1, + rotate_rows_and_columns_1_1, + rotate_rows_and_columns_2_2 +); + +#[cfg(not(feature = "compact"))] +define_mix_columns!( + mix_columns_2, + inv_mix_columns_2, + rotate_rows_and_columns_1_2, + rotate_rows_2 +); + +#[cfg(not(feature = "compact"))] +define_mix_columns!( + mix_columns_3, + inv_mix_columns_3, + rotate_rows_and_columns_1_3, + rotate_rows_and_columns_2_2 +); + +#[inline] +fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) { + let t = (*a ^ ((*a) >> shift)) & mask; + *a ^= t ^ (t << shift); +} + +#[inline] +fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) { 
+ let t = (*a ^ ((*b) >> shift)) & mask; + *a ^= t; + *b ^= t << shift; +} + +/// Applies ShiftRows once on an AES state (or key). +#[cfg(any(not(feature = "compact"), feature = "hazmat"))] +#[inline] +fn shift_rows_1(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x0c0f0300); + delta_swap_1(x, 2, 0x33003300); + } +} + +/// Applies ShiftRows twice on an AES state (or key). +#[inline] +fn shift_rows_2(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x0f000f00); + } +} + +/// Applies ShiftRows three times on an AES state (or key). +#[inline] +fn shift_rows_3(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x030f0c00); + delta_swap_1(x, 2, 0x33003300); + } +} + +#[inline(always)] +fn inv_shift_rows_1(state: &mut [u32]) { + shift_rows_3(state); +} + +#[inline(always)] +fn inv_shift_rows_2(state: &mut [u32]) { + shift_rows_2(state); +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +fn inv_shift_rows_3(state: &mut [u32]) { + shift_rows_1(state); +} + +/// XOR the columns after the S-box during the key schedule round function. +/// +/// The `idx_xor` parameter refers to the index of the previous round key that is +/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256, +/// respectively). +/// +/// The `idx_ror` parameter refers to the rotation value, which varies between the +/// different key schedules. +fn xor_columns(rkeys: &mut [u32], offset: usize, idx_xor: usize, idx_ror: u32) { + for i in 0..8 { + let off_i = offset + i; + let rk = rkeys[off_i - idx_xor] ^ (0x03030303 & ror(rkeys[off_i], idx_ror)); + rkeys[off_i] = + rk ^ (0xfcfcfcfc & (rk << 2)) ^ (0xf0f0f0f0 & (rk << 4)) ^ (0xc0c0c0c0 & (rk << 6)); + } +} + +/// Bitslice two 128-bit input blocks input0, input1 into a 256-bit internal state. +fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) { + debug_assert_eq!(output.len(), 8); + debug_assert_eq!(input0.len(), 16); + debug_assert_eq!(input1.len(), 16); + + // Bitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at an + // 8-bit index. 
AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the + // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition): + // b0 c1 c0 r1 r0 p2 p1 p0 + // + // The desired bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b0 + + // Interleave the columns on input (note the order of input) + // b0 c1 c0 __ __ __ __ __ => c1 c0 b0 __ __ __ __ __ + let mut t0 = u32::from_le_bytes(input0[0x00..0x04].try_into().unwrap()); + let mut t2 = u32::from_le_bytes(input0[0x04..0x08].try_into().unwrap()); + let mut t4 = u32::from_le_bytes(input0[0x08..0x0c].try_into().unwrap()); + let mut t6 = u32::from_le_bytes(input0[0x0c..0x10].try_into().unwrap()); + let mut t1 = u32::from_le_bytes(input1[0x00..0x04].try_into().unwrap()); + let mut t3 = u32::from_le_bytes(input1[0x04..0x08].try_into().unwrap()); + let mut t5 = u32::from_le_bytes(input1[0x08..0x0c].try_into().unwrap()); + let mut t7 = u32::from_le_bytes(input1[0x0c..0x10].try_into().unwrap()); + + // Bit Index Swap 5 <-> 0: + // __ __ b0 __ __ __ __ p0 => __ __ p0 __ __ __ __ b0 + let m0 = 0x55555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 6 <-> 1: + // __ c0 __ __ __ __ p1 __ => __ p1 __ __ __ __ c0 __ + let m1 = 0x33333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 7 <-> 2: + // c1 __ __ __ __ p2 __ __ => p2 __ __ __ __ c1 __ __ + let m2 = 0x0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + // Final bitsliced bit index, as desired: + // p2 p1 p0 r1 r0 c1 c0 b0 + output[0] = t0; + output[1] = t1; + output[2] = t2; + output[3] = t3; + output[4] = t4; + output[5] = t5; + output[6] = t6; + output[7] = t7; +} + +/// Un-bitslice a 256-bit internal state into two 128-bit blocks of output. +fn inv_bitslice(input: &[u32], output: &mut [Block]) { + debug_assert_eq!(input.len(), 8); + debug_assert_eq!(output.len(), 2); + + // Unbitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at + // an 8-bit index. 
AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the + // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition): + // b0 c1 c0 r1 r0 p2 p1 p0 + // + // The initially bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b0 + + let mut t0 = input[0]; + let mut t1 = input[1]; + let mut t2 = input[2]; + let mut t3 = input[3]; + let mut t4 = input[4]; + let mut t5 = input[5]; + let mut t6 = input[6]; + let mut t7 = input[7]; + + // TODO: these bit index swaps are identical to those in 'packing' + + // Bit Index Swap 5 <-> 0: + // __ __ p0 __ __ __ __ b0 => __ __ b0 __ __ __ __ p0 + let m0 = 0x55555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 6 <-> 1: + // __ p1 __ __ __ __ c0 __ => __ c0 __ __ __ __ p1 __ + let m1 = 0x33333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 7 <-> 2: + // p2 __ __ __ __ c1 __ __ => c1 __ __ __ __ p2 __ __ + let m2 = 0x0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + // De-interleave the columns on output (note the order of output) + // c1 c0 b0 __ __ __ __ __ => b0 c1 c0 __ __ __ __ __ + output[0][0x00..0x04].copy_from_slice(&t0.to_le_bytes()); + output[0][0x04..0x08].copy_from_slice(&t2.to_le_bytes()); + output[0][0x08..0x0c].copy_from_slice(&t4.to_le_bytes()); + output[0][0x0c..0x10].copy_from_slice(&t6.to_le_bytes()); + output[1][0x00..0x04].copy_from_slice(&t1.to_le_bytes()); + output[1][0x04..0x08].copy_from_slice(&t3.to_le_bytes()); + output[1][0x08..0x0c].copy_from_slice(&t5.to_le_bytes()); + output[1][0x0c..0x10].copy_from_slice(&t7.to_le_bytes()); + + // Final AES bit index, as desired: + // b0 c1 c0 r1 r0 p2 p1 p0 +} + +/// Copy 32-bytes within the provided slice to an 8-byte offset +fn memshift32(buffer: &mut [u32], src_offset: usize) { + debug_assert_eq!(src_offset % 8, 0); + + let dst_offset = src_offset + 8; + debug_assert!(dst_offset + 8 <= buffer.len()); + + for i in (0..8).rev() { + buffer[dst_offset + i] = buffer[src_offset + i]; + } +} + +/// XOR the round key to the internal state. The round keys are expected to be +/// pre-computed and to be packed in the fixsliced representation. 
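// A minimal sketch (written as an in-module test, illustrative only) of what the
// delta swaps used throughout this file actually do: the bits selected by `mask`
// trade places with the bits selected by `mask << shift`, and every other bit is
// left untouched.
#[cfg(test)]
#[test]
fn delta_swap_exchanges_masked_bit_groups() {
    // shift = 4, mask = low nibble: the two low nibbles of the word swap,
    // so 0xa5 becomes 0x5a.
    let mut x: u32 = 0x0000_00a5;
    delta_swap_1(&mut x, 4, 0x0000_000f);
    assert_eq!(x, 0x0000_005a);

    // The two-word variant exchanges the `mask` bits of `a` with the
    // `mask << shift` bits of `b`: here the (zero) low nibble of `a` swaps
    // with the second nibble of `b`.
    let (mut a, mut b) = (0x0000_0000_u32, 0x0000_00f0_u32);
    delta_swap_2(&mut a, &mut b, 4, 0x0000_000f);
    assert_eq!((a, b), (0x0000_000f, 0x0000_0000));
}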
+#[inline] +fn add_round_key(state: &mut State, rkey: &[u32]) { + debug_assert_eq!(rkey.len(), 8); + for (a, b) in state.iter_mut().zip(rkey) { + *a ^= b; + } +} + +#[inline(always)] +fn add_round_constant_bit(state: &mut [u32], bit: usize) { + state[bit] ^= 0x0000c000; +} + +#[inline(always)] +fn ror(x: u32, y: u32) -> u32 { + x.rotate_right(y) +} + +#[inline(always)] +fn ror_distance(rows: u32, cols: u32) -> u32 { + (rows << 3) + (cols << 1) +} + +#[inline(always)] +fn rotate_rows_1(x: u32) -> u32 { + ror(x, ror_distance(1, 0)) +} + +#[inline(always)] +fn rotate_rows_2(x: u32) -> u32 { + ror(x, ror_distance(2, 0)) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_1(x: u32) -> u32 { + (ror(x, ror_distance(1, 1)) & 0x3f3f3f3f) | + (ror(x, ror_distance(0, 1)) & 0xc0c0c0c0) +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_2(x: u32) -> u32 { + (ror(x, ror_distance(1, 2)) & 0x0f0f0f0f) | + (ror(x, ror_distance(0, 2)) & 0xf0f0f0f0) +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_3(x: u32) -> u32 { + (ror(x, ror_distance(1, 3)) & 0x03030303) | + (ror(x, ror_distance(0, 3)) & 0xfcfcfcfc) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_2_2(x: u32) -> u32 { + (ror(x, ror_distance(2, 2)) & 0x0f0f0f0f) | + (ror(x, ror_distance(1, 2)) & 0xf0f0f0f0) +} + +/// Low-level "hazmat" AES functions. +/// +/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256` +/// implementations in this crate, but instead provides raw access to +/// the AES round function gated under the `hazmat` crate feature. +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat { + use super::{ + bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0, + shift_rows_1, sub_bytes, sub_bytes_nots, State, + }; + use crate::{Block, ParBlocks}; + + /// XOR the `src` block into the `dst` block in-place. + fn xor_in_place(dst: &mut Block, src: &Block) { + for (a, b) in dst.iter_mut().zip(src.as_slice()) { + *a ^= *b; + } + } + + /// Perform a bitslice operation, loading a single block. + fn bitslice_block(block: &Block) -> State { + let mut state = State::default(); + bitslice(&mut state, block, block); + state + } + + /// Perform an inverse bitslice operation, extracting a single block. + fn inv_bitslice_block(block: &mut Block, state: &State) { + let mut out = [Block::default(); 2]; + inv_bitslice(state, &mut out); + block.copy_from_slice(&out[0]); + } + + /// AES cipher (encrypt) round function. + #[inline] + pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) { + let mut state = bitslice_block(block); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + xor_in_place(block, round_key); + } + + /// AES cipher (encrypt) round function: parallel version. + #[inline] + pub(crate) fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for (chunk, keys) in blocks.chunks_exact_mut(2).zip(round_keys.chunks_exact(2)) { + let mut state = State::default(); + bitslice(&mut state, &chunk[0], &chunk[1]); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + inv_bitslice(&state, chunk); + + for i in 0..2 { + xor_in_place(&mut chunk[i], &keys[i]); + } + } + } + + /// AES cipher (encrypt) round function. 
+ #[inline] + pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { + let mut state = bitslice_block(block); + sub_bytes_nots(&mut state); + inv_sub_bytes(&mut state); + inv_shift_rows_1(&mut state); + inv_mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + xor_in_place(block, round_key); + } + + /// AES cipher (encrypt) round function: parallel version. + #[inline] + pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for (chunk, keys) in blocks.chunks_exact_mut(2).zip(round_keys.chunks_exact(2)) { + let mut state = State::default(); + bitslice(&mut state, &chunk[0], &chunk[1]); + sub_bytes_nots(&mut state); + inv_sub_bytes(&mut state); + inv_shift_rows_1(&mut state); + inv_mix_columns_0(&mut state); + inv_bitslice(&state, chunk); + + for i in 0..2 { + xor_in_place(&mut chunk[i], &keys[i]); + } + } + } + + /// AES mix columns function. + #[inline] + pub(crate) fn mix_columns(block: &mut Block) { + let mut state = bitslice_block(block); + mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + } + + /// AES inverse mix columns function. + #[inline] + pub(crate) fn inv_mix_columns(block: &mut Block) { + let mut state = bitslice_block(block); + inv_mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + } +} diff --git a/rust/vendor/aes/src/soft/fixslice64.rs b/rust/vendor/aes/src/soft/fixslice64.rs new file mode 100644 index 0000000..18315b7 --- /dev/null +++ b/rust/vendor/aes/src/soft/fixslice64.rs @@ -0,0 +1,1540 @@ +//! Fixsliced implementations of AES-128, AES-192 and AES-256 (64-bit) +//! adapted from the C implementation. +//! +//! All implementations are fully bitsliced and do not rely on any +//! Look-Up Table (LUT). +//! +//! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details. +//! +//! # Author (original C code) +//! +//! Alexandre Adomnicai, Nanyang Technological University, Singapore +//! <alexandre.adomnicai@ntu.edu.sg> +//! +//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission. + +#![allow(clippy::unreadable_literal)] + +use crate::Block; +use cipher::{ + consts::{U16, U24, U32}, + generic_array::GenericArray, +}; + +/// AES block batch size for this implementation +pub(crate) const FIXSLICE_BLOCKS: usize = 4; + +/// AES-128 round keys +pub(crate) type FixsliceKeys128 = [u64; 88]; + +/// AES-192 round keys +pub(crate) type FixsliceKeys192 = [u64; 104]; + +/// AES-256 round keys +pub(crate) type FixsliceKeys256 = [u64; 120]; + +/// 512-bit internal state +pub(crate) type State = [u64; 8]; + +/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation. 
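// Sizing note (illustrative, in-module test): each round key occupies 8 bitsliced
// 64-bit words, and the schedules below store the initial round key plus one key
// per round, which is where the array lengths above come from.
#[cfg(test)]
#[test]
fn round_key_array_lengths_match_round_counts() {
    assert_eq!(88, 8 * (10 + 1)); // FixsliceKeys128: AES-128, 10 rounds
    assert_eq!(104, 8 * (12 + 1)); // FixsliceKeys192: AES-192, 12 rounds
    assert_eq!(120, 8 * (14 + 1)); // FixsliceKeys256: AES-256, 14 rounds
}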
+pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys128 { + let mut rkeys = [0u64; 88]; + + bitslice(&mut rkeys[..8], key, key, key, key); + + let mut rk_off = 0; + for rcon in 0..10 { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + if rcon < 8 { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + } else { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4); + } + + xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3)); + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..88).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (8..72).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[72..80]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..11 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation. +pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys192 { + let mut rkeys = [0u64; 104]; + let mut tmp = [0u64; 8]; + + bitslice( + &mut rkeys[..8], + &key[..16], + &key[..16], + &key[..16], + &key[..16], + ); + bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]); + + let mut rcon = 0; + let mut rk_off = 8; + + loop { + for i in 0..8 { + rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8)) + | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); + } + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = rkeys[rk_off + i]; + ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1)); + ti ^= 0xf000f000f000f000 & (ti << 4); + tmp[i] = ti; + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + for i in 0..8 { + let ui = tmp[i]; + let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) + | (0xff00ff00ff00ff00 & (ui << 8)); + ti ^= 0x000f000f000f000f & (ui >> 12); + tmp[i] = ti + ^ (0xfff0fff0fff0fff0 & (ti << 4)) + ^ (0xff00ff00ff00ff00 & (ti << 8)) + ^ (0xf000f000f000f000 & (ti << 12)); + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) + | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); + ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3)); + rkeys[rk_off + i] = ti + ^ (0xfff0fff0fff0fff0 & (ti << 4)) + ^ (0xff00ff00ff00ff00 & (ti << 8)) + ^ (0xf000f000f000f000 & (ti << 12)); + } + rk_off += 8; + + if rcon >= 8 { + break; + } + + for i in 0..8 { + let ui = rkeys[(rk_off - 8) + i]; + let mut ti = rkeys[(rk_off - 16) + i]; + ti ^= 0x0f000f000f000f00 & (ui >> 4); + ti ^= 0xf000f000f000f000 & (ti << 4); + tmp[i] = ti; + } + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..104).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 
8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (0..96).step_by(32) { + inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]); + inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]); + } + } + + // Account for NOTs removed from sub_bytes + for i in 1..13 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation. +pub(crate) fn aes256_key_schedule(key: &GenericArray<u8, U32>) -> FixsliceKeys256 { + let mut rkeys = [0u64; 120]; + + bitslice( + &mut rkeys[..8], + &key[..16], + &key[..16], + &key[..16], + &key[..16], + ); + bitslice( + &mut rkeys[8..16], + &key[16..], + &key[16..], + &key[16..], + &key[16..], + ); + + let mut rk_off = 8; + + let mut rcon = 0; + loop { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3)); + rcon += 1; + + if rcon == 7 { + break; + } + + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3)); + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..120).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (8..104).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[104..112]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..15 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[80..]); + inv_sub_bytes(&mut state); + + #[cfg(not(feature = "compact"))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 72; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
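// Usage sketch (illustrative, in-module test): the fixsliced primitives in this
// file always process FIXSLICE_BLOCKS = 4 blocks at a time. The key schedule
// above feeds the encryption routine defined just below and the decryption
// routine above, and the two round-trip.
#[cfg(test)]
#[test]
fn aes128_fixsliced_four_block_round_trip() {
    let key: GenericArray<u8, U16> = [0u8; 16].into();
    let rkeys = aes128_key_schedule(&key);

    let mut blocks = [Block::default(); FIXSLICE_BLOCKS];
    let plaintext = blocks;

    aes128_encrypt(&rkeys, &mut blocks);
    assert_ne!(blocks, plaintext);

    aes128_decrypt(&rkeys, &mut blocks);
    assert_eq!(blocks, plaintext);
}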
+pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + + if rk_off == 80 { + break; + } + + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(feature = "compact"))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[80..]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[96..]); + inv_sub_bytes(&mut state); + + let mut rk_off = 88; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + if rk_off == 96 { + break; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[96..]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[112..]); + inv_sub_bytes(&mut state); + + #[cfg(not(feature = "compact"))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 104; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + + if rk_off == 112 { + break; + } + + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(feature = "compact"))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[112..]); + + inv_bitslice(&state, blocks); +} + +/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so that it is a true +/// inverse of 'sub_bytes'. +fn inv_sub_bytes(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let t23 = u0 ^ u3; + let t8 = u1 ^ t23; + let m2 = t23 & t8; + let t4 = u4 ^ t8; + let t22 = u1 ^ u3; + let t2 = u0 ^ u1; + let t1 = u3 ^ u4; + // t23 -> stack + let t9 = u7 ^ t1; + // t8 -> stack + let m7 = t22 & t9; + // t9 -> stack + let t24 = u4 ^ u7; + // m7 -> stack + let t10 = t2 ^ t24; + // u4 -> stack + let m14 = t2 & t10; + let r5 = u6 ^ u7; + // m2 -> stack + let t3 = t1 ^ r5; + // t2 -> stack + let t13 = t2 ^ r5; + let t19 = t22 ^ r5; + // t3 -> stack + let t17 = u2 ^ t19; + // t4 -> stack + let t25 = u2 ^ t1; + let r13 = u1 ^ u6; + // t25 -> stack + let t20 = t24 ^ r13; + // t17 -> stack + let m9 = t20 & t17; + // t20 -> stack + let r17 = u2 ^ u5; + // t22 -> stack + let t6 = t22 ^ r17; + // t13 -> stack + let m1 = t13 & t6; + let y5 = u0 ^ r17; + let m4 = t19 & y5; + let m5 = m4 ^ m1; + let m17 = m5 ^ t24; + let r18 = u5 ^ u6; + let t27 = t1 ^ r18; + let t15 = t10 ^ t27; + // t6 -> stack + let m11 = t1 & t15; + let m15 = m14 ^ m11; + let m21 = m17 ^ m15; + // t1 -> stack + // t4 <- stack + let m12 = t4 & t27; + let m13 = m12 ^ m11; + let t14 = t10 ^ r18; + let m3 = t14 ^ m1; + // m2 <- stack + let m16 = m3 ^ m2; + let m20 = m16 ^ m13; + // u4 <- stack + let r19 = u2 ^ u4; + let t16 = r13 ^ r19; + // t3 <- stack + let t26 = t3 ^ t16; + let m6 = t3 & t16; + let m8 = t26 ^ m6; + // t10 -> stack + // m7 <- stack + let m18 = m8 ^ m7; + let m22 = m18 ^ m13; + let m25 = m22 & m20; + let m26 = m21 ^ m25; + let m10 = m9 ^ m6; + let m19 = m10 ^ m15; + // t25 <- stack + let m23 = m19 ^ t25; + let m28 = m23 ^ m25; + let m24 = m22 ^ m23; + let m30 = m26 & m24; + let m39 = m23 ^ m30; + let m48 = m39 & y5; + let m57 = m39 & t19; + // m48 -> stack + let m36 = m24 ^ m25; + let m31 = m20 & m23; + let m27 = m20 ^ m21; + let m32 = m27 & m31; + let m29 = m28 & m27; + let m37 = m21 ^ m29; + // m39 -> stack + let 
m42 = m37 ^ m39; + let m52 = m42 & t15; + // t27 -> stack + // t1 <- stack + let m61 = m42 & t1; + let p0 = m52 ^ m61; + let p16 = m57 ^ m61; + // m57 -> stack + // t20 <- stack + let m60 = m37 & t20; + // p16 -> stack + // t17 <- stack + let m51 = m37 & t17; + let m33 = m27 ^ m25; + let m38 = m32 ^ m33; + let m43 = m37 ^ m38; + let m49 = m43 & t16; + let p6 = m49 ^ m60; + let p13 = m49 ^ m51; + let m58 = m43 & t3; + // t9 <- stack + let m50 = m38 & t9; + // t22 <- stack + let m59 = m38 & t22; + // p6 -> stack + let p1 = m58 ^ m59; + let p7 = p0 ^ p1; + let m34 = m21 & m22; + let m35 = m24 & m34; + let m40 = m35 ^ m36; + let m41 = m38 ^ m40; + let m45 = m42 ^ m41; + // t27 <- stack + let m53 = m45 & t27; + let p8 = m50 ^ m53; + let p23 = p7 ^ p8; + // t4 <- stack + let m62 = m45 & t4; + let p14 = m49 ^ m62; + let s6 = p14 ^ p23; + // t10 <- stack + let m54 = m41 & t10; + let p2 = m54 ^ m62; + let p22 = p2 ^ p7; + let s0 = p13 ^ p22; + let p17 = m58 ^ p2; + let p15 = m54 ^ m59; + // t2 <- stack + let m63 = m41 & t2; + // m39 <- stack + let m44 = m39 ^ m40; + // p17 -> stack + // t6 <- stack + let m46 = m44 & t6; + let p5 = m46 ^ m51; + // p23 -> stack + let p18 = m63 ^ p5; + let p24 = p5 ^ p7; + // m48 <- stack + let p12 = m46 ^ m48; + let s3 = p12 ^ p22; + // t13 <- stack + let m55 = m44 & t13; + let p9 = m55 ^ m63; + // p16 <- stack + let s7 = p9 ^ p16; + // t8 <- stack + let m47 = m40 & t8; + let p3 = m47 ^ m50; + let p19 = p2 ^ p3; + let s5 = p19 ^ p24; + let p11 = p0 ^ p3; + let p26 = p9 ^ p11; + // t23 <- stack + let m56 = m40 & t23; + let p4 = m48 ^ m56; + // p6 <- stack + let p20 = p4 ^ p6; + let p29 = p15 ^ p20; + let s1 = p26 ^ p29; + // m57 <- stack + let p10 = m57 ^ p4; + let p27 = p10 ^ p18; + // p23 <- stack + let s4 = p23 ^ p27; + let p25 = p6 ^ p10; + let p28 = p11 ^ p25; + // p17 <- stack + let s2 = p17 ^ p28; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik. +/// +/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt> +/// +/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule. 
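// Composition sketch (illustrative, in-module test): the complete forward S-box
// is `sub_bytes` (defined just below) followed by `sub_bytes_nots`, the four NOTs
// that were moved into the key schedule. `inv_sub_bytes` above already accounts
// for those NOTs and inverts `sub_bytes` directly, and `sub_bytes_nots` is its
// own inverse, so the inverse of the complete S-box is `sub_bytes_nots` followed
// by `inv_sub_bytes`, which is how the hazmat `equiv_inv_cipher_round` below
// applies it.
#[cfg(test)]
#[test]
fn bitsliced_sbox_round_trips() {
    let original: [u64; 8] = [0x0123_4567_89ab_cdef; 8];
    let mut state = original;

    sub_bytes(&mut state);      // shared S-box circuit
    sub_bytes_nots(&mut state); // complete forward S-box

    sub_bytes_nots(&mut state); // undo the NOTs...
    inv_sub_bytes(&mut state);  // ...then invert the shared circuit

    assert_eq!(state, original);
}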
+fn sub_bytes(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let y14 = u3 ^ u5; + let y13 = u0 ^ u6; + let y12 = y13 ^ y14; + let t1 = u4 ^ y12; + let y15 = t1 ^ u5; + let t2 = y12 & y15; + let y6 = y15 ^ u7; + let y20 = t1 ^ u1; + // y12 -> stack + let y9 = u0 ^ u3; + // y20 -> stack + let y11 = y20 ^ y9; + // y9 -> stack + let t12 = y9 & y11; + // y6 -> stack + let y7 = u7 ^ y11; + let y8 = u0 ^ u5; + let t0 = u1 ^ u2; + let y10 = y15 ^ t0; + // y15 -> stack + let y17 = y10 ^ y11; + // y14 -> stack + let t13 = y14 & y17; + let t14 = t13 ^ t12; + // y17 -> stack + let y19 = y10 ^ y8; + // y10 -> stack + let t15 = y8 & y10; + let t16 = t15 ^ t12; + let y16 = t0 ^ y11; + // y11 -> stack + let y21 = y13 ^ y16; + // y13 -> stack + let t7 = y13 & y16; + // y16 -> stack + let y18 = u0 ^ y16; + let y1 = t0 ^ u7; + let y4 = y1 ^ u3; + // u7 -> stack + let t5 = y4 & u7; + let t6 = t5 ^ t2; + let t18 = t6 ^ t16; + let t22 = t18 ^ y19; + let y2 = y1 ^ u0; + let t10 = y2 & y7; + let t11 = t10 ^ t7; + let t20 = t11 ^ t16; + let t24 = t20 ^ y18; + let y5 = y1 ^ u6; + let t8 = y5 & y1; + let t9 = t8 ^ t7; + let t19 = t9 ^ t14; + let t23 = t19 ^ y21; + let y3 = y5 ^ y8; + // y6 <- stack + let t3 = y3 & y6; + let t4 = t3 ^ t2; + // y20 <- stack + let t17 = t4 ^ y20; + let t21 = t17 ^ t14; + let t26 = t21 & t23; + let t27 = t24 ^ t26; + let t31 = t22 ^ t26; + let t25 = t21 ^ t22; + // y4 -> stack + let t28 = t25 & t27; + let t29 = t28 ^ t22; + let z14 = t29 & y2; + let z5 = t29 & y7; + let t30 = t23 ^ t24; + let t32 = t31 & t30; + let t33 = t32 ^ t24; + let t35 = t27 ^ t33; + let t36 = t24 & t35; + let t38 = t27 ^ t36; + let t39 = t29 & t38; + let t40 = t25 ^ t39; + let t43 = t29 ^ t40; + // y16 <- stack + let z3 = t43 & y16; + let tc12 = z3 ^ z5; + // tc12 -> stack + // y13 <- stack + let z12 = t43 & y13; + let z13 = t40 & y5; + let z4 = t40 & y1; + let tc6 = z3 ^ z4; + let t34 = t23 ^ t33; + let t37 = t36 ^ t34; + let t41 = t40 ^ t37; + // y10 <- stack + let z8 = t41 & y10; + let z17 = t41 & y8; + let t44 = t33 ^ t37; + // y15 <- stack + let z0 = t44 & y15; + // z17 -> stack + // y12 <- stack + let z9 = t44 & y12; + let z10 = t37 & y3; + let z1 = t37 & y6; + let tc5 = z1 ^ z0; + let tc11 = tc6 ^ tc5; + // y4 <- stack + let z11 = t33 & y4; + let t42 = t29 ^ t33; + let t45 = t42 ^ t41; + // y17 <- stack + let z7 = t45 & y17; + let tc8 = z7 ^ tc6; + // y14 <- stack + let z16 = t45 & y14; + // y11 <- stack + let z6 = t42 & y11; + let tc16 = z6 ^ tc8; + // z14 -> stack + // y9 <- stack + let z15 = t42 & y9; + let tc20 = z15 ^ tc16; + let tc1 = z15 ^ z16; + let tc2 = z10 ^ tc1; + let tc21 = tc2 ^ z11; + let tc3 = z9 ^ tc2; + let s0 = tc3 ^ tc16; + let s3 = tc3 ^ tc11; + let s1 = s3 ^ tc16; + let tc13 = z13 ^ tc1; + // u7 <- stack + let z2 = t33 & u7; + let tc4 = z0 ^ z2; + let tc7 = z12 ^ tc4; + let tc9 = z8 ^ tc7; + let tc10 = tc8 ^ tc9; + // z14 <- stack + let tc17 = z14 ^ tc10; + let s5 = tc21 ^ tc17; + let tc26 = tc17 ^ tc20; + // z17 <- stack + let s2 = tc26 ^ z17; + // tc12 <- stack + let tc14 = tc4 ^ tc12; + let tc18 = tc13 ^ tc14; + let s6 = tc10 ^ tc18; + let s7 = z12 ^ tc18; + let s4 = tc14 ^ s3; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + 
state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// NOT operations that are omitted in S-box +#[inline] +fn sub_bytes_nots(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + state[0] ^= 0xffffffffffffffff; + state[1] ^= 0xffffffffffffffff; + state[5] ^= 0xffffffffffffffff; + state[6] ^= 0xffffffffffffffff; +} + +/// Computation of the MixColumns transformation in the fixsliced representation, with different +/// rotations used according to the round number mod 4. +/// +/// Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm. +macro_rules! define_mix_columns { + ( + $name:ident, + $name_inv:ident, + $first_rotate:path, + $second_rotate:path + ) => { + #[rustfmt::skip] + fn $name(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + state[0] = b0 ^ c7 ^ $second_rotate(c0); + state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1); + state[2] = b2 ^ c1 ^ $second_rotate(c2); + state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3); + state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4); + state[5] = b5 ^ c4 ^ $second_rotate(c5); + state[6] = b6 ^ c5 ^ $second_rotate(c6); + state[7] = b7 ^ c6 ^ $second_rotate(c7); + } + + #[rustfmt::skip] + fn $name_inv(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + let (d0, d1, d2, d3, d4, d5, d6, d7) = ( + a0 ^ c7, + a1 ^ c0 ^ c7, + a2 ^ c1, + a3 ^ c2 ^ c7, + a4 ^ c3 ^ c7, + a5 ^ c4, + a6 ^ c5, + a7 ^ c6, + ); + let (e0, e1, e2, e3, e4, e5, e6, e7) = ( + c0 ^ d6, + c1 ^ d6 ^ d7, + c2 ^ d0 ^ d7, + c3 ^ d1 ^ d6, + c4 ^ d2 ^ d6 ^ d7, + c5 ^ d3 ^ d7, + c6 ^ d4, + c7 ^ d5, + ); + state[0] = d0 ^ e0 ^ $second_rotate(e0); + state[1] = d1 ^ e1 ^ $second_rotate(e1); + state[2] = d2 ^ e2 ^ $second_rotate(e2); + state[3] = d3 ^ e3 ^ $second_rotate(e3); + state[4] = d4 ^ e4 ^ $second_rotate(e4); + state[5] = d5 ^ e5 ^ $second_rotate(e5); + state[6] = d6 ^ e6 ^ $second_rotate(e6); + state[7] = d7 ^ e7 ^ $second_rotate(e7); + } + } +} + +define_mix_columns!( + mix_columns_0, + inv_mix_columns_0, + rotate_rows_1, + rotate_rows_2 +); + +define_mix_columns!( + mix_columns_1, + inv_mix_columns_1, + rotate_rows_and_columns_1_1, + rotate_rows_and_columns_2_2 +); + +#[cfg(not(feature = "compact"))] +define_mix_columns!( + mix_columns_2, + inv_mix_columns_2, + rotate_rows_and_columns_1_2, + rotate_rows_2 +); + +#[cfg(not(feature = "compact"))] +define_mix_columns!( + mix_columns_3, + inv_mix_columns_3, + rotate_rows_and_columns_1_3, + rotate_rows_and_columns_2_2 +); + +#[inline] +fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) { + let t = (*a ^ ((*a) >> shift)) & mask; + *a ^= t ^ (t << shift); +} + +#[inline] +fn delta_swap_2(a: &mut u64, b: 
&mut u64, shift: u32, mask: u64) { + let t = (*a ^ ((*b) >> shift)) & mask; + *a ^= t; + *b ^= t << shift; +} + +/// Applies ShiftRows once on an AES state (or key). +#[cfg(any(not(feature = "compact"), feature = "hazmat"))] +#[inline] +fn shift_rows_1(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x00f000ff000f0000); + delta_swap_1(x, 4, 0x0f0f00000f0f0000); + } +} + +/// Applies ShiftRows twice on an AES state (or key). +#[inline] +fn shift_rows_2(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x00ff000000ff0000); + } +} + +/// Applies ShiftRows three times on an AES state (or key). +#[inline] +fn shift_rows_3(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x000f00ff00f00000); + delta_swap_1(x, 4, 0x0f0f00000f0f0000); + } +} + +#[inline(always)] +fn inv_shift_rows_1(state: &mut [u64]) { + shift_rows_3(state); +} + +#[inline(always)] +fn inv_shift_rows_2(state: &mut [u64]) { + shift_rows_2(state); +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +fn inv_shift_rows_3(state: &mut [u64]) { + shift_rows_1(state); +} + +/// XOR the columns after the S-box during the key schedule round function. +/// +/// The `idx_xor` parameter refers to the index of the previous round key that is +/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256, +/// respectively). +/// +/// The `idx_ror` parameter refers to the rotation value, which varies between the +/// different key schedules. +fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) { + for i in 0..8 { + let off_i = offset + i; + let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror)); + rkeys[off_i] = rk + ^ (0xfff0fff0fff0fff0 & (rk << 4)) + ^ (0xff00ff00ff00ff00 & (rk << 8)) + ^ (0xf000f000f000f000 & (rk << 12)); + } +} + +/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state. +fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) { + debug_assert_eq!(output.len(), 8); + debug_assert_eq!(input0.len(), 16); + debug_assert_eq!(input1.len(), 16); + debug_assert_eq!(input2.len(), 16); + debug_assert_eq!(input3.len(), 16); + + // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a + // 9-bit index. 
AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the + // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition): + // b1 b0 c1 c0 r1 r0 p2 p1 p0 + // + // The desired bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + + #[rustfmt::skip] + fn read_reordered(input: &[u8]) -> u64 { + (u64::from(input[0x0]) ) | + (u64::from(input[0x1]) << 0x10) | + (u64::from(input[0x2]) << 0x20) | + (u64::from(input[0x3]) << 0x30) | + (u64::from(input[0x8]) << 0x08) | + (u64::from(input[0x9]) << 0x18) | + (u64::from(input[0xa]) << 0x28) | + (u64::from(input[0xb]) << 0x38) + } + + // Reorder each block's bytes on input + // __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __ + // Reorder by relabeling (note the order of input) + // b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __ + let mut t0 = read_reordered(&input0[0x00..0x0c]); + let mut t4 = read_reordered(&input0[0x04..0x10]); + let mut t1 = read_reordered(&input1[0x00..0x0c]); + let mut t5 = read_reordered(&input1[0x04..0x10]); + let mut t2 = read_reordered(&input2[0x00..0x0c]); + let mut t6 = read_reordered(&input2[0x04..0x10]); + let mut t3 = read_reordered(&input3[0x00..0x0c]); + let mut t7 = read_reordered(&input3[0x04..0x10]); + + // Bit Index Swap 6 <-> 0: + // __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0 + let m0 = 0x5555555555555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 7 <-> 1: + // __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __ + let m1 = 0x3333333333333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 8 <-> 2: + // c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __ + let m2 = 0x0f0f0f0f0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + // Final bitsliced bit index, as desired: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + output[0] = t0; + output[1] = t1; + output[2] = t2; + output[3] = t3; + output[4] = t4; + output[5] = t5; + output[6] = t6; + output[7] = t7; +} + +/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output. +fn inv_bitslice(input: &[u64], output: &mut [Block]) { + debug_assert_eq!(input.len(), 8); + debug_assert_eq!(output.len(), 4); + + // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at + // a 9-bit index. 
AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the + // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition): + // b1 b0 c1 c0 r1 r0 p2 p1 p0 + // + // The initially bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + + let mut t0 = input[0]; + let mut t1 = input[1]; + let mut t2 = input[2]; + let mut t3 = input[3]; + let mut t4 = input[4]; + let mut t5 = input[5]; + let mut t6 = input[6]; + let mut t7 = input[7]; + + // TODO: these bit index swaps are identical to those in 'packing' + + // Bit Index Swap 6 <-> 0: + // __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0 + let m0 = 0x5555555555555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 7 <-> 1: + // __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __ + let m1 = 0x3333333333333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 8 <-> 2: + // p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __ + let m2 = 0x0f0f0f0f0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + #[rustfmt::skip] + fn write_reordered(columns: u64, output: &mut [u8]) { + output[0x0] = (columns ) as u8; + output[0x1] = (columns >> 0x10) as u8; + output[0x2] = (columns >> 0x20) as u8; + output[0x3] = (columns >> 0x30) as u8; + output[0x8] = (columns >> 0x08) as u8; + output[0x9] = (columns >> 0x18) as u8; + output[0xa] = (columns >> 0x28) as u8; + output[0xb] = (columns >> 0x38) as u8; + } + + // Reorder by relabeling (note the order of output) + // c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __ + // Reorder each block's bytes on output + // __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __ + write_reordered(t0, &mut output[0][0x00..0x0c]); + write_reordered(t4, &mut output[0][0x04..0x10]); + write_reordered(t1, &mut output[1][0x00..0x0c]); + write_reordered(t5, &mut output[1][0x04..0x10]); + write_reordered(t2, &mut output[2][0x00..0x0c]); + write_reordered(t6, &mut output[2][0x04..0x10]); + write_reordered(t3, &mut output[3][0x00..0x0c]); + write_reordered(t7, &mut output[3][0x04..0x10]); + + // Final AES bit index, as desired: + // b1 b0 c1 c0 r1 r0 p2 p1 p0 +} + +/// Copy 32-bytes within the provided slice to an 8-byte offset +fn memshift32(buffer: &mut [u64], src_offset: usize) { + debug_assert_eq!(src_offset % 8, 0); + + let dst_offset = src_offset + 8; + debug_assert!(dst_offset + 8 <= buffer.len()); + + for i in (0..8).rev() { + buffer[dst_offset + i] = buffer[src_offset + i]; + } +} + +/// XOR the round key to the internal state. The round keys are expected to be +/// pre-computed and to be packed in the fixsliced representation. 
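// Packing sketch (illustrative, in-module test): `bitslice` and `inv_bitslice`
// above are exact inverses, so packing four blocks into the 512-bit state and
// unpacking it again is lossless. The block values here are arbitrary.
#[cfg(test)]
#[test]
fn bitslice_inv_bitslice_round_trip() {
    let blocks: [Block; 4] = [
        Block::from([0x00u8; 16]),
        Block::from([0x55u8; 16]),
        Block::from([0xaau8; 16]),
        Block::from([0xffu8; 16]),
    ];

    let mut state = State::default();
    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    let mut unpacked = [Block::default(); 4];
    inv_bitslice(&state, &mut unpacked);

    assert_eq!(unpacked, blocks);
}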
+#[inline] +fn add_round_key(state: &mut State, rkey: &[u64]) { + debug_assert_eq!(rkey.len(), 8); + for (a, b) in state.iter_mut().zip(rkey) { + *a ^= b; + } +} + +#[inline(always)] +fn add_round_constant_bit(state: &mut [u64], bit: usize) { + state[bit] ^= 0x00000000f0000000; +} + +#[inline(always)] +fn ror(x: u64, y: u32) -> u64 { + x.rotate_right(y) +} + +#[inline(always)] +fn ror_distance(rows: u32, cols: u32) -> u32 { + (rows << 4) + (cols << 2) +} + +#[inline(always)] +fn rotate_rows_1(x: u64) -> u64 { + ror(x, ror_distance(1, 0)) +} + +#[inline(always)] +fn rotate_rows_2(x: u64) -> u64 { + ror(x, ror_distance(2, 0)) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_1(x: u64) -> u64 { + (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) | + (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000) +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_2(x: u64) -> u64 { + (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) | + (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00) +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_3(x: u64) -> u64 { + (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) | + (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_2_2(x: u64) -> u64 { + (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) | + (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00) +} + +/// Low-level "hazmat" AES functions. +/// +/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256` +/// implementations in this crate, but instead provides raw access to +/// the AES round function gated under the `hazmat` crate feature. +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat { + use super::{ + bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0, + shift_rows_1, sub_bytes, sub_bytes_nots, State, + }; + use crate::{Block, ParBlocks}; + + /// XOR the `src` block into the `dst` block in-place. + fn xor_in_place(dst: &mut Block, src: &Block) { + for (a, b) in dst.iter_mut().zip(src.as_slice()) { + *a ^= *b; + } + } + + /// Perform a bitslice operation, loading a single block. + fn bitslice_block(block: &Block) -> State { + let mut state = State::default(); + bitslice(&mut state, block, block, block, block); + state + } + + /// Perform an inverse bitslice operation, extracting a single block. + fn inv_bitslice_block(block: &mut Block, state: &State) { + let mut out = [Block::default(); 4]; + inv_bitslice(state, &mut out); + block.copy_from_slice(&out[0]); + } + + /// AES cipher (encrypt) round function. + #[inline] + pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) { + let mut state = bitslice_block(block); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + xor_in_place(block, round_key); + } + + /// AES cipher (encrypt) round function: parallel version. 
+ #[inline] + pub(crate) fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) { + let mut state = State::default(); + bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + inv_bitslice(&state, chunk); + + for i in 0..4 { + xor_in_place(&mut chunk[i], &keys[i]); + } + } + } + + /// AES cipher (encrypt) round function. + #[inline] + pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { + let mut state = State::default(); + bitslice(&mut state, &block, &block, &block, &block); + sub_bytes_nots(&mut state); + inv_sub_bytes(&mut state); + inv_shift_rows_1(&mut state); + inv_mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + xor_in_place(block, round_key); + } + + /// AES cipher (encrypt) round function: parallel version. + #[inline] + pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) { + let mut state = State::default(); + bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]); + sub_bytes_nots(&mut state); + inv_sub_bytes(&mut state); + inv_shift_rows_1(&mut state); + inv_mix_columns_0(&mut state); + inv_bitslice(&state, chunk); + + for i in 0..4 { + xor_in_place(&mut chunk[i], &keys[i]); + } + } + } + + /// AES mix columns function. + #[inline] + pub(crate) fn mix_columns(block: &mut Block) { + let mut state = bitslice_block(block); + mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + } + + /// AES inverse mix columns function. + #[inline] + pub(crate) fn inv_mix_columns(block: &mut Block) { + let mut state = bitslice_block(block); + inv_mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + } +} |
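Callers normally reach this fixsliced software backend through the crate's block-cipher API rather than the `pub(crate)` helpers above. The sketch below shows that entry point; it assumes the usual RustCrypto `cipher` v0.3-era traits (`NewBlockCipher`, `BlockEncrypt`, `BlockDecrypt`) and re-exports, so treat the exact import paths as assumptions about this vendored version rather than something stated in the diff.

use aes::cipher::{generic_array::GenericArray, BlockDecrypt, BlockEncrypt, NewBlockCipher};
use aes::{Aes128, Block};

fn main() {
    let key = GenericArray::from([0u8; 16]);
    let cipher = Aes128::new(&key);

    let mut block = Block::from([0u8; 16]);
    let plaintext = block;

    // On targets without a hardware AES code path, these calls are served by
    // the bitsliced/fixsliced implementation above.
    cipher.encrypt_block(&mut block);
    cipher.decrypt_block(&mut block);
    assert_eq!(block, plaintext);
}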