author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 17:39:49 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 17:39:49 +0000
commit | a0aa2307322cd47bbf416810ac0292925e03be87 (patch)
tree | 37076262a026c4b48c8a0e84f44ff9187556ca35 /rust/vendor/aes/src
parent | Initial commit. (diff)
download | suricata-a0aa2307322cd47bbf416810ac0292925e03be87.tar.xz, suricata-a0aa2307322cd47bbf416810ac0292925e03be87.zip
Adding upstream version 1:7.0.3. (upstream/1%7.0.3)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'rust/vendor/aes/src')
25 files changed, 5945 insertions, 0 deletions
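The diff below vendors the RustCrypto `aes` crate into `rust/vendor/aes/src`. For orientation before the per-file contents, here is a minimal sketch of how a consumer uses the crate's block-cipher API; it mirrors the usage example in the vendored `lib.rs` further down. The `main` wrapper and the all-zero key are purely illustrative.

```rust
use aes::cipher::{generic_array::GenericArray, BlockDecrypt, BlockEncrypt, NewBlockCipher};
use aes::{Aes128, Block};

fn main() {
    // All-zero key, for illustration only.
    let key = GenericArray::from_slice(&[0u8; 16]);
    let cipher = Aes128::new(key);

    let mut block = Block::default();
    let original = block.clone();

    cipher.encrypt_block(&mut block); // encrypt the 16-byte block in place
    cipher.decrypt_block(&mut block); // decrypt it back
    assert_eq!(block, original);
}
```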
diff --git a/rust/vendor/aes/src/armv8.rs b/rust/vendor/aes/src/armv8.rs new file mode 100644 index 0000000..187ac1b --- /dev/null +++ b/rust/vendor/aes/src/armv8.rs @@ -0,0 +1,376 @@ +//! AES block cipher implementation using the ARMv8 Cryptography Extensions. +//! +//! Based on this C intrinsics implementation: +//! <https://github.com/noloader/AES-Intrinsics/blob/master/aes-arm.c> +//! +//! Original C written and placed in public domain by Jeffrey Walton. +//! Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and +//! Barry O'Rourke for the mbedTLS project. + +#![allow(clippy::needless_range_loop)] + +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat; + +mod decrypt; +mod encrypt; +mod expand; + +use self::{ + decrypt::{decrypt, decrypt8}, + encrypt::{encrypt, encrypt8}, + expand::{expand_key, inv_expanded_keys}, +}; +use crate::{Block, ParBlocks}; +use cipher::{ + consts::{U16, U24, U32, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; +use core::arch::aarch64::*; + +macro_rules! define_aes_impl { + ( + $name:ident, + $name_enc:ident, + $name_dec:ident, + $key_size:ty, + $rounds:tt, + $doc:expr + ) => { + #[doc=$doc] + #[doc = "block cipher"] + #[derive(Clone)] + pub struct $name { + encrypt: $name_enc, + decrypt: $name_dec, + } + + impl NewBlockCipher for $name { + type KeySize = $key_size; + + #[inline] + fn new(key: &GenericArray<u8, $key_size>) -> Self { + let encrypt = $name_enc::new(key); + let decrypt = $name_dec::from(&encrypt); + Self { encrypt, decrypt } + } + } + + impl BlockCipher for $name { + type BlockSize = U16; + type ParBlocks = U8; + } + + impl BlockEncrypt for $name { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + self.encrypt.encrypt_block(block) + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + self.encrypt.encrypt_par_blocks(blocks) + } + } + + impl BlockDecrypt for $name { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + self.decrypt.decrypt_block(block) + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + self.decrypt.decrypt_par_blocks(blocks) + } + } + + #[doc=$doc] + #[doc = "block cipher (encrypt-only)"] + #[derive(Clone)] + pub struct $name_enc { + round_keys: [uint8x16_t; $rounds], + } + + impl NewBlockCipher for $name_enc { + type KeySize = $key_size; + + fn new(key: &GenericArray<u8, $key_size>) -> Self { + Self { + round_keys: expand_key(key.as_ref()), + } + } + } + + impl BlockCipher for $name_enc { + type BlockSize = U16; + type ParBlocks = U8; + } + + impl BlockEncrypt for $name_enc { + fn encrypt_block(&self, block: &mut Block) { + unsafe { encrypt(&self.round_keys, block) } + } + + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + unsafe { encrypt8(&self.round_keys, blocks) } + } + } + + #[doc=$doc] + #[doc = "block cipher (decrypt-only)"] + #[derive(Clone)] + pub struct $name_dec { + round_keys: [uint8x16_t; $rounds], + } + + impl NewBlockCipher for $name_dec { + type KeySize = $key_size; + + fn new(key: &GenericArray<u8, $key_size>) -> Self { + $name_enc::new(key).into() + } + } + + impl From<$name_enc> for $name_dec { + fn from(enc: $name_enc) -> $name_dec { + Self::from(&enc) + } + } + + impl From<&$name_enc> for $name_dec { + fn from(enc: &$name_enc) -> $name_dec { + let mut round_keys = enc.round_keys; + inv_expanded_keys(&mut round_keys); + Self { round_keys } + } + } + + impl BlockCipher for $name_dec { + type BlockSize = U16; + type ParBlocks = U8; + } + + impl BlockDecrypt for $name_dec { + 
fn decrypt_block(&self, block: &mut Block) { + unsafe { decrypt(&self.round_keys, block) } + } + + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + unsafe { decrypt8(&self.round_keys, blocks) } + } + } + + opaque_debug::implement!($name); + opaque_debug::implement!($name_enc); + opaque_debug::implement!($name_dec); + }; +} + +define_aes_impl!(Aes128, Aes128Enc, Aes128Dec, U16, 11, "AES-128"); +define_aes_impl!(Aes192, Aes192Enc, Aes192Dec, U24, 13, "AES-192"); +define_aes_impl!(Aes256, Aes256Enc, Aes256Dec, U32, 15, "AES-256"); + +#[cfg(test)] +mod tests { + use super::{decrypt, decrypt8, encrypt, encrypt8, expand_key, inv_expanded_keys, ParBlocks}; + use core::{arch::aarch64::*, convert::TryInto}; + use hex_literal::hex; + + /// FIPS 197, Appendix A.1: AES-128 Cipher Key + /// user input, unaligned buffer + const AES128_KEY: [u8; 16] = hex!("2b7e151628aed2a6abf7158809cf4f3c"); + + /// FIPS 197 Appendix A.1: Expansion of a 128-bit Cipher Key + /// library controlled, aligned buffer + const AES128_EXP_KEYS: [[u8; 16]; 11] = [ + AES128_KEY, + hex!("a0fafe1788542cb123a339392a6c7605"), + hex!("f2c295f27a96b9435935807a7359f67f"), + hex!("3d80477d4716fe3e1e237e446d7a883b"), + hex!("ef44a541a8525b7fb671253bdb0bad00"), + hex!("d4d1c6f87c839d87caf2b8bc11f915bc"), + hex!("6d88a37a110b3efddbf98641ca0093fd"), + hex!("4e54f70e5f5fc9f384a64fb24ea6dc4f"), + hex!("ead27321b58dbad2312bf5607f8d292f"), + hex!("ac7766f319fadc2128d12941575c006e"), + hex!("d014f9a8c9ee2589e13f0cc8b6630ca6"), + ]; + + /// Inverse expanded keys for [`AES128_EXPANDED_KEYS`] + const AES128_EXP_INVKEYS: [[u8; 16]; 11] = [ + hex!("d014f9a8c9ee2589e13f0cc8b6630ca6"), + hex!("0c7b5a631319eafeb0398890664cfbb4"), + hex!("df7d925a1f62b09da320626ed6757324"), + hex!("12c07647c01f22c7bc42d2f37555114a"), + hex!("6efcd876d2df54807c5df034c917c3b9"), + hex!("6ea30afcbc238cf6ae82a4b4b54a338d"), + hex!("90884413d280860a12a128421bc89739"), + hex!("7c1f13f74208c219c021ae480969bf7b"), + hex!("cc7505eb3e17d1ee82296c51c9481133"), + hex!("2b3708a7f262d405bc3ebdbf4b617d62"), + AES128_KEY, + ]; + + /// FIPS 197, Appendix A.2: AES-192 Cipher Key + /// user input, unaligned buffer + const AES192_KEY: [u8; 24] = hex!("8e73b0f7da0e6452c810f32b809079e562f8ead2522c6b7b"); + + /// FIPS 197 Appendix A.2: Expansion of a 192-bit Cipher Key + /// library controlled, aligned buffer + const AES192_EXP_KEYS: [[u8; 16]; 13] = [ + hex!("8e73b0f7da0e6452c810f32b809079e5"), + hex!("62f8ead2522c6b7bfe0c91f72402f5a5"), + hex!("ec12068e6c827f6b0e7a95b95c56fec2"), + hex!("4db7b4bd69b5411885a74796e92538fd"), + hex!("e75fad44bb095386485af05721efb14f"), + hex!("a448f6d94d6dce24aa326360113b30e6"), + hex!("a25e7ed583b1cf9a27f939436a94f767"), + hex!("c0a69407d19da4e1ec1786eb6fa64971"), + hex!("485f703222cb8755e26d135233f0b7b3"), + hex!("40beeb282f18a2596747d26b458c553e"), + hex!("a7e1466c9411f1df821f750aad07d753"), + hex!("ca4005388fcc5006282d166abc3ce7b5"), + hex!("e98ba06f448c773c8ecc720401002202"), + ]; + + /// FIPS 197, Appendix A.3: AES-256 Cipher Key + /// user input, unaligned buffer + const AES256_KEY: [u8; 32] = + hex!("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"); + + /// FIPS 197 Appendix A.3: Expansion of a 256-bit Cipher Key + /// library controlled, aligned buffer + const AES256_EXP_KEYS: [[u8; 16]; 15] = [ + hex!("603deb1015ca71be2b73aef0857d7781"), + hex!("1f352c073b6108d72d9810a30914dff4"), + hex!("9ba354118e6925afa51a8b5f2067fcde"), + hex!("a8b09c1a93d194cdbe49846eb75d5b9a"), + hex!("d59aecb85bf3c917fee94248de8ebe96"), + 
hex!("b5a9328a2678a647983122292f6c79b3"), + hex!("812c81addadf48ba24360af2fab8b464"), + hex!("98c5bfc9bebd198e268c3ba709e04214"), + hex!("68007bacb2df331696e939e46c518d80"), + hex!("c814e20476a9fb8a5025c02d59c58239"), + hex!("de1369676ccc5a71fa2563959674ee15"), + hex!("5886ca5d2e2f31d77e0af1fa27cf73c3"), + hex!("749c47ab18501ddae2757e4f7401905a"), + hex!("cafaaae3e4d59b349adf6acebd10190d"), + hex!("fe4890d1e6188d0b046df344706c631e"), + ]; + + /// FIPS 197, Appendix B input + /// user input, unaligned buffer + const INPUT: [u8; 16] = hex!("3243f6a8885a308d313198a2e0370734"); + + /// FIPS 197, Appendix B output + const EXPECTED: [u8; 16] = hex!("3925841d02dc09fbdc118597196a0b32"); + + fn load_expanded_keys<const N: usize>(input: [[u8; 16]; N]) -> [uint8x16_t; N] { + let mut output = [unsafe { vdupq_n_u8(0) }; N]; + + for (src, dst) in input.iter().zip(output.iter_mut()) { + *dst = unsafe { vld1q_u8(src.as_ptr()) } + } + + output + } + + fn store_expanded_keys<const N: usize>(input: [uint8x16_t; N]) -> [[u8; 16]; N] { + let mut output = [[0u8; 16]; N]; + + for (src, dst) in input.iter().zip(output.iter_mut()) { + unsafe { vst1q_u8(dst.as_mut_ptr(), *src) } + } + + output + } + + #[test] + fn aes128_key_expansion() { + let ek = expand_key(&AES128_KEY); + assert_eq!(store_expanded_keys(ek), AES128_EXP_KEYS); + } + + #[test] + fn aes128_key_expansion_inv() { + let mut ek = load_expanded_keys(AES128_EXP_KEYS); + inv_expanded_keys(&mut ek); + assert_eq!(store_expanded_keys(ek), AES128_EXP_INVKEYS); + } + + #[test] + fn aes192_key_expansion() { + let ek = expand_key(&AES192_KEY); + assert_eq!(store_expanded_keys(ek), AES192_EXP_KEYS); + } + + #[test] + fn aes256_key_expansion() { + let ek = expand_key(&AES256_KEY); + assert_eq!(store_expanded_keys(ek), AES256_EXP_KEYS); + } + + #[test] + fn aes128_encrypt() { + // Intentionally misaligned block + let mut block = [0u8; 19]; + block[3..].copy_from_slice(&INPUT); + + unsafe { + encrypt( + &load_expanded_keys(AES128_EXP_KEYS), + (&mut block[3..]).try_into().unwrap(), + ) + }; + + assert_eq!(&block[3..], &EXPECTED); + } + + #[test] + fn aes128_encrypt8() { + let mut blocks = ParBlocks::default(); + + for block in &mut blocks { + block.copy_from_slice(&INPUT); + } + + unsafe { encrypt8(&load_expanded_keys(AES128_EXP_KEYS), &mut blocks) }; + + for block in &blocks { + assert_eq!(block.as_slice(), &EXPECTED); + } + } + + #[test] + fn aes128_decrypt() { + // Intentionally misaligned block + let mut block = [0u8; 19]; + block[3..].copy_from_slice(&EXPECTED); + + unsafe { + decrypt( + &load_expanded_keys(AES128_EXP_INVKEYS), + (&mut block[3..]).try_into().unwrap(), + ) + }; + + assert_eq!(&block[3..], &INPUT); + } + + #[test] + fn aes128_decrypt8() { + let mut blocks = ParBlocks::default(); + + for block in &mut blocks { + block.copy_from_slice(&EXPECTED); + } + + unsafe { decrypt8(&load_expanded_keys(AES128_EXP_INVKEYS), &mut blocks) }; + + for block in &blocks { + assert_eq!(block.as_slice(), &INPUT); + } + } +} diff --git a/rust/vendor/aes/src/armv8/decrypt.rs b/rust/vendor/aes/src/armv8/decrypt.rs new file mode 100644 index 0000000..cb84193 --- /dev/null +++ b/rust/vendor/aes/src/armv8/decrypt.rs @@ -0,0 +1,72 @@ +//! AES decryption support. + +use crate::{Block, ParBlocks}; +use core::arch::aarch64::*; + +/// Perform AES decryption using the given expanded keys. 
+#[target_feature(enable = "aes")] +#[target_feature(enable = "neon")] +pub(super) unsafe fn decrypt<const N: usize>(expanded_keys: &[uint8x16_t; N], block: &mut Block) { + let rounds = N - 1; + assert!(rounds == 10 || rounds == 12 || rounds == 14); + + let mut state = vld1q_u8(block.as_ptr()); + + for k in expanded_keys.iter().take(rounds - 1) { + // AES single round decryption + state = vaesdq_u8(state, *k); + + // AES inverse mix columns + state = vaesimcq_u8(state); + } + + // AES single round decryption + state = vaesdq_u8(state, expanded_keys[rounds - 1]); + + // Final add (bitwise XOR) + state = veorq_u8(state, expanded_keys[rounds]); + + vst1q_u8(block.as_mut_ptr(), state); +} + +/// Perform parallel AES decryption 8-blocks-at-a-time using the given expanded keys. +#[target_feature(enable = "aes")] +#[target_feature(enable = "neon")] +pub(super) unsafe fn decrypt8<const N: usize>( + expanded_keys: &[uint8x16_t; N], + blocks: &mut ParBlocks, +) { + let rounds = N - 1; + assert!(rounds == 10 || rounds == 12 || rounds == 14); + + let mut state = [ + vld1q_u8(blocks[0].as_ptr()), + vld1q_u8(blocks[1].as_ptr()), + vld1q_u8(blocks[2].as_ptr()), + vld1q_u8(blocks[3].as_ptr()), + vld1q_u8(blocks[4].as_ptr()), + vld1q_u8(blocks[5].as_ptr()), + vld1q_u8(blocks[6].as_ptr()), + vld1q_u8(blocks[7].as_ptr()), + ]; + + for k in expanded_keys.iter().take(rounds - 1) { + for i in 0..8 { + // AES single round decryption + state[i] = vaesdq_u8(state[i], *k); + + // AES inverse mix columns + state[i] = vaesimcq_u8(state[i]); + } + } + + for i in 0..8 { + // AES single round decryption + state[i] = vaesdq_u8(state[i], expanded_keys[rounds - 1]); + + // Final add (bitwise XOR) + state[i] = veorq_u8(state[i], expanded_keys[rounds]); + + vst1q_u8(blocks[i].as_mut_ptr(), state[i]); + } +} diff --git a/rust/vendor/aes/src/armv8/encrypt.rs b/rust/vendor/aes/src/armv8/encrypt.rs new file mode 100644 index 0000000..8464173 --- /dev/null +++ b/rust/vendor/aes/src/armv8/encrypt.rs @@ -0,0 +1,72 @@ +//! AES encryption support + +use crate::{Block, ParBlocks}; +use core::arch::aarch64::*; + +/// Perform AES encryption using the given expanded keys. +#[target_feature(enable = "aes")] +#[target_feature(enable = "neon")] +pub(super) unsafe fn encrypt<const N: usize>(expanded_keys: &[uint8x16_t; N], block: &mut Block) { + let rounds = N - 1; + assert!(rounds == 10 || rounds == 12 || rounds == 14); + + let mut state = vld1q_u8(block.as_ptr()); + + for k in expanded_keys.iter().take(rounds - 1) { + // AES single round encryption + state = vaeseq_u8(state, *k); + + // AES mix columns + state = vaesmcq_u8(state); + } + + // AES single round encryption + state = vaeseq_u8(state, expanded_keys[rounds - 1]); + + // Final add (bitwise XOR) + state = veorq_u8(state, expanded_keys[rounds]); + + vst1q_u8(block.as_mut_ptr(), state); +} + +/// Perform parallel AES encryption 8-blocks-at-a-time using the given expanded keys. 
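// Why eight blocks at once: each block's AESE/AESMC chain is a serial
// dependency, but the eight chains are independent, so interleaving them lets
// the CPU pipeline the AES instructions (the instruction-level parallelism
// noted in the crate-level docs) instead of stalling on a single block.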
+#[target_feature(enable = "aes")] +#[target_feature(enable = "neon")] +pub(super) unsafe fn encrypt8<const N: usize>( + expanded_keys: &[uint8x16_t; N], + blocks: &mut ParBlocks, +) { + let rounds = N - 1; + assert!(rounds == 10 || rounds == 12 || rounds == 14); + + let mut state = [ + vld1q_u8(blocks[0].as_ptr()), + vld1q_u8(blocks[1].as_ptr()), + vld1q_u8(blocks[2].as_ptr()), + vld1q_u8(blocks[3].as_ptr()), + vld1q_u8(blocks[4].as_ptr()), + vld1q_u8(blocks[5].as_ptr()), + vld1q_u8(blocks[6].as_ptr()), + vld1q_u8(blocks[7].as_ptr()), + ]; + + for k in expanded_keys.iter().take(rounds - 1) { + for i in 0..8 { + // AES single round encryption + state[i] = vaeseq_u8(state[i], *k); + + // AES mix columns + state[i] = vaesmcq_u8(state[i]); + } + } + + for i in 0..8 { + // AES single round encryption + state[i] = vaeseq_u8(state[i], expanded_keys[rounds - 1]); + + // Final add (bitwise XOR) + state[i] = veorq_u8(state[i], expanded_keys[rounds]); + + vst1q_u8(blocks[i].as_mut_ptr(), state[i]); + } +} diff --git a/rust/vendor/aes/src/armv8/expand.rs b/rust/vendor/aes/src/armv8/expand.rs new file mode 100644 index 0000000..2e26e39 --- /dev/null +++ b/rust/vendor/aes/src/armv8/expand.rs @@ -0,0 +1,77 @@ +//! AES key expansion support. + +use core::{arch::aarch64::*, convert::TryInto, mem, slice}; + +/// There are 4 AES words in a block. +const BLOCK_WORDS: usize = 4; + +/// The AES (nee Rijndael) notion of a word is always 32-bits, or 4-bytes. +const WORD_SIZE: usize = 4; + +/// AES round constants. +const ROUND_CONSTS: [u32; 10] = [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36]; + +/// AES key expansion +// TODO(tarcieri): big endian support? +#[inline] +pub(super) fn expand_key<const L: usize, const N: usize>(key: &[u8; L]) -> [uint8x16_t; N] { + assert!((L == 16 && N == 11) || (L == 24 && N == 13) || (L == 32 && N == 15)); + + let mut expanded_keys: [uint8x16_t; N] = unsafe { mem::zeroed() }; + + // TODO(tarcieri): construct expanded keys using `vreinterpretq_u8_u32` + let ek_words = unsafe { + slice::from_raw_parts_mut(expanded_keys.as_mut_ptr() as *mut u32, N * BLOCK_WORDS) + }; + + for (i, chunk) in key.chunks_exact(WORD_SIZE).enumerate() { + ek_words[i] = u32::from_ne_bytes(chunk.try_into().unwrap()); + } + + // From "The Rijndael Block Cipher" Section 4.1: + // > The number of columns of the Cipher Key is denoted by `Nk` and is + // > equal to the key length divided by 32 [bits]. + let nk = L / WORD_SIZE; + + for i in nk..(N * BLOCK_WORDS) { + let mut word = ek_words[i - 1]; + + if i % nk == 0 { + word = sub_word(word).rotate_right(8) ^ ROUND_CONSTS[i / nk - 1]; + } else if nk > 6 && i % nk == 4 { + word = sub_word(word) + } + + ek_words[i] = ek_words[i - nk] ^ word; + } + + expanded_keys +} + +/// Compute inverse expanded keys (for decryption). +/// +/// This is the reverse of the encryption keys, with the Inverse Mix Columns +/// operation applied to all but the first and last expanded key. +#[inline] +pub(super) fn inv_expanded_keys<const N: usize>(expanded_keys: &mut [uint8x16_t; N]) { + assert!(N == 11 || N == 13 || N == 15); + + for ek in expanded_keys.iter_mut().take(N - 1).skip(1) { + unsafe { *ek = vaesimcq_u8(*ek) } + } + + expanded_keys.reverse(); +} + +/// Sub bytes for a single AES word: used for key expansion. 
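// How the zero-key trick below works: `vaeseq_u8` computes
// SubBytes(ShiftRows(state ^ round_key)). With an all-zero round key the XOR
// is a no-op, and because the input word is broadcast to all four columns,
// every row holds four identical bytes, so ShiftRows is also a no-op. Lane 0
// of the result is therefore exactly SubWord(input).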
+#[inline(always)] +fn sub_word(input: u32) -> u32 { + unsafe { + let input = vreinterpretq_u8_u32(vdupq_n_u32(input)); + + // AES single round encryption (with a "round" key of all zeros) + let sub_input = vaeseq_u8(input, vdupq_n_u8(0)); + + vgetq_lane_u32(vreinterpretq_u32_u8(sub_input), 0) + } +} diff --git a/rust/vendor/aes/src/armv8/hazmat.rs b/rust/vendor/aes/src/armv8/hazmat.rs new file mode 100644 index 0000000..022db3a --- /dev/null +++ b/rust/vendor/aes/src/armv8/hazmat.rs @@ -0,0 +1,104 @@ +//! Low-level "hazmat" AES functions: ARMv8 Cryptography Extensions support. +//! +//! Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256` +//! implementations in this crate, but instead provides raw AES-NI accelerated +//! access to the AES round function gated under the `hazmat` crate feature. + +use crate::{Block, ParBlocks}; +use core::arch::aarch64::*; + +/// AES cipher (encrypt) round function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn cipher_round(block: &mut Block, round_key: &Block) { + let b = vld1q_u8(block.as_ptr()); + let k = vld1q_u8(round_key.as_ptr()); + + // AES single round encryption (all-zero round key, deferred until the end) + let mut state = vaeseq_u8(b, vdupq_n_u8(0)); + + // AES mix columns (the `vaeseq_u8` instruction otherwise omits this step) + state = vaesmcq_u8(state); + + // AES add round key (bitwise XOR) + state = veorq_u8(state, k); + + vst1q_u8(block.as_mut_ptr(), state); +} + +/// AES cipher (encrypt) round function: parallel version. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for i in 0..8 { + let mut state = vld1q_u8(blocks[i].as_ptr()); + + // AES single round encryption + state = vaeseq_u8(state, vdupq_n_u8(0)); + + // AES mix columns + state = vaesmcq_u8(state); + + // AES add round key (bitwise XOR) + state = veorq_u8(state, vld1q_u8(round_keys[i].as_ptr())); + + vst1q_u8(blocks[i].as_mut_ptr(), state); + } +} + +/// AES equivalent inverse cipher (decrypt) round function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { + let b = vld1q_u8(block.as_ptr()); + let k = vld1q_u8(round_key.as_ptr()); + + // AES single round decryption (all-zero round key, deferred until the end) + let mut state = vaesdq_u8(b, vdupq_n_u8(0)); + + // AES inverse mix columns (the `vaesdq_u8` instruction otherwise omits this step) + state = vaesimcq_u8(state); + + // AES add round key (bitwise XOR) + state = veorq_u8(state, k); + + vst1q_u8(block.as_mut_ptr(), state); +} + +/// AES equivalent inverse cipher (decrypt) round function: parallel version. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for i in 0..8 { + let mut state = vld1q_u8(blocks[i].as_ptr()); + + // AES single round decryption (all-zero round key, deferred until the end) + state = vaesdq_u8(state, vdupq_n_u8(0)); + + // AES inverse mix columns (the `vaesdq_u8` instruction otherwise omits this step) + state = vaesimcq_u8(state); + + // AES add round key (bitwise XOR) + state = veorq_u8(state, vld1q_u8(round_keys[i].as_ptr())); + + vst1q_u8(blocks[i].as_mut_ptr(), state); + } +} + +/// AES mix columns function. 
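// MixColumns treats each state column as a vector over GF(2^8) and multiplies
// it by the fixed circulant matrix with rows (02 03 01 01); the inverse step
// uses (0e 0b 0d 09). `vaesmcq_u8` / `vaesimcq_u8` apply these transforms to
// all four columns of the 128-bit block at once.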
+#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn mix_columns(block: &mut Block) { + let b = vld1q_u8(block.as_ptr()); + let out = vaesmcq_u8(b); + vst1q_u8(block.as_mut_ptr(), out); +} + +/// AES inverse mix columns function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn inv_mix_columns(block: &mut Block) { + let b = vld1q_u8(block.as_ptr()); + let out = vaesimcq_u8(b); + vst1q_u8(block.as_mut_ptr(), out); +} diff --git a/rust/vendor/aes/src/autodetect.rs b/rust/vendor/aes/src/autodetect.rs new file mode 100644 index 0000000..dbbdeab --- /dev/null +++ b/rust/vendor/aes/src/autodetect.rs @@ -0,0 +1,259 @@ +//! Autodetection support for hardware accelerated AES backends with fallback +//! to the fixsliced "soft" implementation. + +use crate::{soft, Block, ParBlocks}; +use cipher::{ + consts::{U16, U24, U32, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; +use core::mem::ManuallyDrop; + +#[cfg(all(target_arch = "aarch64", feature = "armv8"))] +use crate::armv8 as intrinsics; + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +use crate::ni as intrinsics; + +cpufeatures::new!(aes_intrinsics, "aes"); + +macro_rules! define_aes_impl { + ( + $name:tt, + $module:tt, + $key_size:ty, + $doc:expr + ) => { + #[doc=$doc] + pub struct $name { + inner: $module::Inner, + token: aes_intrinsics::InitToken, + } + + mod $module { + use super::{intrinsics, soft}; + use core::mem::ManuallyDrop; + + pub(super) union Inner { + pub(super) intrinsics: ManuallyDrop<intrinsics::$name>, + pub(super) soft: ManuallyDrop<soft::$name>, + } + } + + impl NewBlockCipher for $name { + type KeySize = $key_size; + + #[inline] + fn new(key: &GenericArray<u8, $key_size>) -> Self { + let (token, aesni_present) = aes_intrinsics::init_get(); + + let inner = if aesni_present { + $module::Inner { + intrinsics: ManuallyDrop::new(intrinsics::$name::new(key)), + } + } else { + $module::Inner { + soft: ManuallyDrop::new(soft::$name::new(key)), + } + }; + + Self { inner, token } + } + } + + impl Clone for $name { + fn clone(&self) -> Self { + let inner = if self.token.get() { + $module::Inner { + intrinsics: unsafe { self.inner.intrinsics.clone() }, + } + } else { + $module::Inner { + soft: unsafe { self.inner.soft.clone() }, + } + }; + + Self { + inner, + token: self.token, + } + } + } + + impl BlockCipher for $name { + type BlockSize = U16; + type ParBlocks = U8; + } + + impl BlockEncrypt for $name { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + if self.token.get() { + unsafe { self.inner.intrinsics.encrypt_block(block) } + } else { + unsafe { self.inner.soft.encrypt_block(block) } + } + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + if self.token.get() { + unsafe { self.inner.intrinsics.encrypt_par_blocks(blocks) } + } else { + unsafe { self.inner.soft.encrypt_par_blocks(blocks) } + } + } + } + + impl BlockDecrypt for $name { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + if self.token.get() { + unsafe { self.inner.intrinsics.decrypt_block(block) } + } else { + unsafe { self.inner.soft.decrypt_block(block) } + } + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + if self.token.get() { + unsafe { self.inner.intrinsics.decrypt_par_blocks(blocks) } + } else { + unsafe { self.inner.soft.decrypt_par_blocks(blocks) } + } + } + } + + opaque_debug::implement!($name); + }; +} + +define_aes_impl!(Aes128, aes128, 
U16, "AES-128 block cipher instance"); +define_aes_impl!(Aes192, aes192, U24, "AES-192 block cipher instance"); +define_aes_impl!(Aes256, aes256, U32, "AES-256 block cipher instance"); + +#[cfg(all(feature = "ctr", target_arch = "aarch64"))] +pub(crate) mod ctr { + use super::{Aes128, Aes192, Aes256}; + + /// AES-128 in CTR mode + pub type Aes128Ctr = ::ctr::Ctr64BE<Aes128>; + + /// AES-192 in CTR mode + pub type Aes192Ctr = ::ctr::Ctr64BE<Aes192>; + + /// AES-256 in CTR mode + pub type Aes256Ctr = ::ctr::Ctr64BE<Aes256>; +} + +#[cfg(all(feature = "ctr", any(target_arch = "x86_64", target_arch = "x86")))] +pub(crate) mod ctr { + use super::{Aes128, Aes192, Aes256}; + use crate::{ni, soft}; + use cipher::{ + errors::{LoopError, OverflowError}, + generic_array::GenericArray, + BlockCipher, FromBlockCipher, SeekNum, StreamCipher, StreamCipherSeek, + }; + use core::mem::ManuallyDrop; + + cpufeatures::new!(aes_ssse3_cpuid, "aes", "ssse3"); + + macro_rules! define_aes_ctr_impl { + ( + $name:tt, + $cipher:ident, + $module:tt, + $doc:expr + ) => { + #[doc=$doc] + #[cfg_attr(docsrs, doc(cfg(feature = "ctr")))] + pub struct $name { + inner: $module::Inner, + token: aes_ssse3_cpuid::InitToken, + } + + mod $module { + use crate::{ni, soft}; + use core::mem::ManuallyDrop; + + pub(super) union Inner { + pub(super) ni: ManuallyDrop<ni::$name>, + pub(super) soft: ManuallyDrop<soft::$name>, + } + } + + impl FromBlockCipher for $name { + type BlockCipher = $cipher; + type NonceSize = <$cipher as BlockCipher>::BlockSize; + + fn from_block_cipher( + cipher: $cipher, + nonce: &GenericArray<u8, Self::NonceSize>, + ) -> Self { + let (token, aesni_present) = aes_ssse3_cpuid::init_get(); + + let inner = if aesni_present { + let ni = ni::$name::from_block_cipher( + unsafe { (*cipher.inner.intrinsics).clone() }, + nonce, + ); + + $module::Inner { + ni: ManuallyDrop::new(ni), + } + } else { + let soft = soft::$name::from_block_cipher( + unsafe { (*cipher.inner.soft).clone() }, + nonce, + ); + + $module::Inner { + soft: ManuallyDrop::new(soft), + } + }; + + Self { inner, token } + } + } + + impl StreamCipher for $name { + #[inline] + fn try_apply_keystream(&mut self, data: &mut [u8]) -> Result<(), LoopError> { + if self.token.get() { + unsafe { (*self.inner.ni).try_apply_keystream(data) } + } else { + unsafe { (*self.inner.soft).try_apply_keystream(data) } + } + } + } + + impl StreamCipherSeek for $name { + #[inline] + fn try_current_pos<T: SeekNum>(&self) -> Result<T, OverflowError> { + if self.token.get() { + unsafe { (*self.inner.ni).try_current_pos() } + } else { + unsafe { (*self.inner.soft).try_current_pos() } + } + } + + #[inline] + fn try_seek<T: SeekNum>(&mut self, pos: T) -> Result<(), LoopError> { + if self.token.get() { + unsafe { (*self.inner.ni).try_seek(pos) } + } else { + unsafe { (*self.inner.soft).try_seek(pos) } + } + } + } + + opaque_debug::implement!($name); + }; + } + + define_aes_ctr_impl!(Aes128Ctr, Aes128, aes128ctr, "AES-128 in CTR mode"); + define_aes_ctr_impl!(Aes192Ctr, Aes192, aes192ctr, "AES-192 in CTR mode"); + define_aes_ctr_impl!(Aes256Ctr, Aes256, aes256ctr, "AES-256 in CTR mode"); +} diff --git a/rust/vendor/aes/src/hazmat.rs b/rust/vendor/aes/src/hazmat.rs new file mode 100644 index 0000000..ff628e5 --- /dev/null +++ b/rust/vendor/aes/src/hazmat.rs @@ -0,0 +1,166 @@ +//! ⚠️ Low-level "hazmat" AES functions. +//! +//! # ☢️️ WARNING: HAZARDOUS API ☢️ +//! +//! This module contains an extremely low-level cryptographic primitive +//! 
which is likewise extremely difficult to use correctly. +//! +//! There are very few valid uses cases for this API. It's intended to be used +//! for implementing well-reviewed higher-level constructions. +//! +//! We do NOT recommending using it to implement any algorithm which has not +//! received extensive peer review by cryptographers. + +use crate::{soft::fixslice::hazmat as soft, Block, ParBlocks}; + +#[cfg(all( + target_arch = "aarch64", + feature = "armv8", + not(feature = "force-soft") +))] +use crate::armv8::hazmat as intrinsics; + +#[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + not(feature = "force-soft") +))] +use crate::ni::hazmat as intrinsics; + +#[cfg(all( + any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "aarch64", feature = "armv8") + ), + not(feature = "force-soft") +))] +cpufeatures::new!(aes_intrinsics, "aes"); + +/// Execute the provided body if CPU intrinsics are available. +// TODO(tarcieri): more `cfg-if`-like macro with an else branch? +macro_rules! if_intrinsics_available { + ($body:expr) => {{ + #[cfg(all( + any( + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "aarch64", feature = "armv8") + ), + not(feature = "force-soft") + ))] + if aes_intrinsics::get() { + unsafe { $body } + return; + } + }}; +} + +/// ⚠️ AES cipher (encrypt) round function. +/// +/// This API performs the following steps as described in FIPS 197 Appendix C: +/// +/// - `s_box`: state after `SubBytes()` +/// - `s_row`: state after `ShiftRows()` +/// - `m_col`: state after `MixColumns()` +/// - `k_sch`: key schedule value for `round[r]` +/// +/// This series of operations is equivalent to the Intel AES-NI `AESENC` instruction. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn cipher_round(block: &mut Block, round_key: &Block) { + if_intrinsics_available! { + intrinsics::cipher_round(block, round_key) + } + + soft::cipher_round(block, round_key); +} + +/// ⚠️ AES cipher (encrypt) round function: parallel version. +/// +/// Equivalent to [`cipher_round`], but acts on 8 blocks-at-a-time, applying +/// the same number of round keys. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + if_intrinsics_available! { + intrinsics::cipher_round_par(blocks, round_keys) + } + + soft::cipher_round_par(blocks, round_keys); +} + +/// ⚠️ AES equivalent inverse cipher (decrypt) round function. +/// +/// This API performs the following steps as described in FIPS 197 Appendix C: +/// +/// - `is_box`: state after `InvSubBytes()` +/// - `is_row`: state after `InvShiftRows()` +/// - `im_col`: state after `InvMixColumns()` +/// - `ik_sch`: key schedule value for `round[r]` +/// +/// This series of operations is equivalent to the Intel AES-NI `AESDEC` instruction. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { + if_intrinsics_available! { + intrinsics::equiv_inv_cipher_round(block, round_key) + } + + soft::equiv_inv_cipher_round(block, round_key); +} + +/// ⚠️ AES equivalent inverse cipher (decrypt) round function: parallel version. 
+/// +/// Equivalent to [`equiv_inv_cipher_round`], but acts on 8 blocks-at-a-time, +/// applying the same number of round keys. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + if_intrinsics_available! { + intrinsics::equiv_inv_cipher_round_par(blocks, round_keys) + } + + soft::equiv_inv_cipher_round_par(blocks, round_keys); +} + +/// ⚠️ AES mix columns function. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn mix_columns(block: &mut Block) { + if_intrinsics_available! { + intrinsics::mix_columns(block) + } + + soft::mix_columns(block); +} + +/// ⚠️ AES inverse mix columns function. +/// +/// This function is equivalent to the Intel AES-NI `AESIMC` instruction. +/// +/// # ☢️️ WARNING: HAZARDOUS API ☢️ +/// +/// Use this function with great care! See the [module-level documentation][crate::hazmat] +/// for more information. +pub fn inv_mix_columns(block: &mut Block) { + if_intrinsics_available! { + intrinsics::inv_mix_columns(block) + } + + soft::inv_mix_columns(block); +} diff --git a/rust/vendor/aes/src/lib.rs b/rust/vendor/aes/src/lib.rs new file mode 100644 index 0000000..f33183c --- /dev/null +++ b/rust/vendor/aes/src/lib.rs @@ -0,0 +1,138 @@ +//! Pure Rust implementation of the Advanced Encryption Standard +//! (a.k.a. Rijndael) +//! +//! # Supported backends +//! This crate provides multiple backends including a portable pure Rust +//! backend as well as ones based on CPU intrinsics. +//! +//! By default, it performs runtime detection of CPU intrinsics and uses them +//! if they are available. +//! +//! ## "soft" portable backend +//! As a baseline implementation, this crate provides a constant-time pure Rust +//! implementation based on [fixslicing], a more advanced form of bitslicing +//! implemented entirely in terms of bitwise arithmetic with no use of any +//! lookup tables or data-dependent branches. +//! +//! Enabling the `compact` Cargo feature will reduce the code size of this +//! backend at the cost of decreased performance (using a modified form of +//! the fixslicing technique called "semi-fixslicing"). +//! +//! ## ARMv8 intrinsics (nightly-only) +//! On `aarch64` targets including `aarch64-apple-darwin` (Apple M1) and Linux +//! targets such as `aarch64-unknown-linux-gnu` and `aarch64-unknown-linux-musl`, +//! support for using AES intrinsics provided by the ARMv8 Cryptography Extensions +//! is available when using the nightly compiler, and can be enabled using the +//! `armv8` crate feature. +//! +//! On Linux and macOS, when the `armv8` feature is enabled support for AES +//! intrinsics is autodetected at runtime. On other platforms the `aes` +//! target feature must be enabled via RUSTFLAGS. +//! +//! ## `x86`/`x86_64` intrinsics (AES-NI) +//! By default this crate uses runtime detection on `i686`/`x86_64` targets +//! in order to determine if AES-NI is available, and if it is not, it will +//! fallback to using a constant-time software implementation. +//! +//! Passing `RUSTFLAGS=-Ctarget-feature=+aes,+ssse3` explicitly at compile-time +//! will override runtime detection and ensure that AES-NI is always used. +//! Programs built in this manner will crash with an illegal instruction on +//! CPUs which do not have AES-NI enabled. +//! +//! 
Note: runtime detection is not possible on SGX targets. Please use the +//! afforementioned `RUSTFLAGS` to leverage AES-NI on these targets. +//! +//! # Usage example +//! ``` +//! use aes::{Aes128, Block, ParBlocks}; +//! use aes::cipher::{ +//! BlockCipher, BlockEncrypt, BlockDecrypt, NewBlockCipher, +//! generic_array::GenericArray, +//! }; +//! +//! let key = GenericArray::from_slice(&[0u8; 16]); +//! let mut block = Block::default(); +//! let mut block8 = ParBlocks::default(); +//! +//! // Initialize cipher +//! let cipher = Aes128::new(&key); +//! +//! let block_copy = block.clone(); +//! +//! // Encrypt block in-place +//! cipher.encrypt_block(&mut block); +//! +//! // And decrypt it back +//! cipher.decrypt_block(&mut block); +//! assert_eq!(block, block_copy); +//! +//! // We can encrypt 8 blocks simultaneously using +//! // instruction-level parallelism +//! let block8_copy = block8.clone(); +//! cipher.encrypt_par_blocks(&mut block8); +//! cipher.decrypt_par_blocks(&mut block8); +//! assert_eq!(block8, block8_copy); +//! ``` +//! +//! For implementations of block cipher modes of operation see +//! [`block-modes`] crate. +//! +//! [fixslicing]: https://eprint.iacr.org/2020/1123.pdf +//! [AES-NI]: https://en.wikipedia.org/wiki/AES_instruction_set +//! [`block-modes`]: https://docs.rs/block-modes + +#![no_std] +#![cfg_attr( + all(feature = "armv8", target_arch = "aarch64"), + feature(stdsimd, aarch64_target_feature) +)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc( + html_logo_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo.svg", + html_favicon_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo.svg" +)] +#![warn(missing_docs, rust_2018_idioms)] + +#[cfg(feature = "hazmat")] +pub mod hazmat; + +mod soft; + +use cfg_if::cfg_if; + +cfg_if! { + if #[cfg(all(target_arch = "aarch64", feature = "armv8", not(feature = "force-soft")))] { + mod armv8; + mod autodetect; + pub use autodetect::{Aes128, Aes192, Aes256}; + + #[cfg(feature = "ctr")] + pub use autodetect::ctr::{Aes128Ctr, Aes192Ctr, Aes256Ctr}; + } else if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + not(feature = "force-soft") + ))] { + mod autodetect; + mod ni; + pub use autodetect::{Aes128, Aes192, Aes256}; + + #[cfg(feature = "ctr")] + pub use autodetect::ctr::{Aes128Ctr, Aes192Ctr, Aes256Ctr}; + } else { + pub use soft::{Aes128, Aes192, Aes256}; + + #[cfg(feature = "ctr")] + pub use soft::{Aes128Ctr, Aes192Ctr, Aes256Ctr}; + } +} + +pub use cipher::{self, BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher}; + +/// 128-bit AES block +pub type Block = cipher::generic_array::GenericArray<u8, cipher::consts::U16>; + +/// 8 x 128-bit AES blocks to be processed in parallel +pub type ParBlocks = cipher::generic_array::GenericArray<Block, cipher::consts::U8>; + +/// Size of an AES block (128-bits; 16-bytes) +pub const BLOCK_SIZE: usize = 16; diff --git a/rust/vendor/aes/src/ni.rs b/rust/vendor/aes/src/ni.rs new file mode 100644 index 0000000..56e8e1f --- /dev/null +++ b/rust/vendor/aes/src/ni.rs @@ -0,0 +1,45 @@ +//! AES block ciphers implementation using AES-NI instruction set. +//! +//! Ciphers functionality is accessed using `BlockCipher` trait from the +//! [`cipher`](https://docs.rs/cipher) crate. +//! +//! # CTR mode +//! In addition to core block cipher functionality this crate provides optimized +//! CTR mode implementation. This functionality requires additional `ssse3` +//! target feature and feature-gated behind `ctr` feature flag, which is enabled +//! 
by default. +//! +//! # Vulnerability +//! Lazy FP state restory vulnerability can allow local process to leak content +//! of the FPU register, in which round keys are stored. This vulnerability +//! can be mitigated at the operating system level by installing relevant +//! patches. (i.e. keep your OS updated!) More info: +//! - [Intel advisory](https://www.intel.com/content/www/us/en/security-center/advisory/intel-sa-00145.html) +//! - [Wikipedia](https://en.wikipedia.org/wiki/Lazy_FP_state_restore) +//! +//! # Related documents +//! - [Intel AES-NI whitepaper](https://software.intel.com/sites/default/files/article/165683/aes-wp-2012-09-22-v01.pdf) +//! - [Use of the AES Instruction Set](https://www.cosic.esat.kuleuven.be/ecrypt/AESday/slides/Use_of_the_AES_Instruction_Set.pdf) + +#[macro_use] +mod utils; + +mod aes128; +mod aes192; +mod aes256; + +#[cfg(feature = "ctr")] +mod ctr; + +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat; + +#[cfg(target_arch = "x86")] +use core::arch::x86 as arch; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64 as arch; + +pub use self::{aes128::Aes128, aes192::Aes192, aes256::Aes256}; + +#[cfg(feature = "ctr")] +pub use self::ctr::{Aes128Ctr, Aes192Ctr, Aes256Ctr}; diff --git a/rust/vendor/aes/src/ni/aes128.rs b/rust/vendor/aes/src/ni/aes128.rs new file mode 100644 index 0000000..f079fdd --- /dev/null +++ b/rust/vendor/aes/src/ni/aes128.rs @@ -0,0 +1,163 @@ +use super::{ + arch::*, + utils::{aesdec8, aesdeclast8, aesenc8, aesenclast8, load8, store8, xor8, U128x8}, +}; +use crate::{Block, ParBlocks}; +use cipher::{ + consts::{U16, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; + +mod expand; +#[cfg(test)] +mod test_expand; + +/// AES-128 round keys +type RoundKeys = [__m128i; 11]; + +/// AES-128 block cipher +#[derive(Clone)] +pub struct Aes128 { + encrypt_keys: RoundKeys, + decrypt_keys: RoundKeys, +} + +impl Aes128 { + #[inline(always)] + pub(crate) fn encrypt8(&self, mut blocks: U128x8) -> U128x8 { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni128_encrypt8(keys: &RoundKeys, blocks: &mut U128x8) { + xor8(blocks, keys[0]); + aesenc8(blocks, keys[1]); + aesenc8(blocks, keys[2]); + aesenc8(blocks, keys[3]); + aesenc8(blocks, keys[4]); + aesenc8(blocks, keys[5]); + aesenc8(blocks, keys[6]); + aesenc8(blocks, keys[7]); + aesenc8(blocks, keys[8]); + aesenc8(blocks, keys[9]); + aesenclast8(blocks, keys[10]); + } + unsafe { aesni128_encrypt8(&self.encrypt_keys, &mut blocks) }; + blocks + } + + #[inline(always)] + pub(crate) fn encrypt(&self, block: __m128i) -> __m128i { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni128_encrypt1(keys: &RoundKeys, mut block: __m128i) -> __m128i { + block = _mm_xor_si128(block, keys[0]); + block = _mm_aesenc_si128(block, keys[1]); + block = _mm_aesenc_si128(block, keys[2]); + block = _mm_aesenc_si128(block, keys[3]); + block = _mm_aesenc_si128(block, keys[4]); + block = _mm_aesenc_si128(block, keys[5]); + block = _mm_aesenc_si128(block, keys[6]); + block = _mm_aesenc_si128(block, keys[7]); + block = _mm_aesenc_si128(block, keys[8]); + block = _mm_aesenc_si128(block, keys[9]); + _mm_aesenclast_si128(block, keys[10]) + } + unsafe { aesni128_encrypt1(&self.encrypt_keys, block) } + } +} + +impl NewBlockCipher for Aes128 { + type KeySize = U16; + + #[inline] + fn new(key: &GenericArray<u8, U16>) -> Self { + let key = unsafe { &*(key as *const _ as *const [u8; 16]) }; + + let (encrypt_keys, decrypt_keys) = expand::expand(key); + + Self { + 
encrypt_keys, + decrypt_keys, + } + } +} + +impl BlockCipher for Aes128 { + type BlockSize = U16; + type ParBlocks = U8; +} + +impl BlockEncrypt for Aes128 { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + unsafe { + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let b = self.encrypt(b); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + let b = self.encrypt8(load8(blocks)); + store8(blocks, b); + } +} + +impl BlockDecrypt for Aes128 { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes128_decrypt1(block: &mut Block, keys: &RoundKeys) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + + b = _mm_xor_si128(b, keys[10]); + b = _mm_aesdec_si128(b, keys[9]); + b = _mm_aesdec_si128(b, keys[8]); + b = _mm_aesdec_si128(b, keys[7]); + b = _mm_aesdec_si128(b, keys[6]); + b = _mm_aesdec_si128(b, keys[5]); + b = _mm_aesdec_si128(b, keys[4]); + b = _mm_aesdec_si128(b, keys[3]); + b = _mm_aesdec_si128(b, keys[2]); + b = _mm_aesdec_si128(b, keys[1]); + b = _mm_aesdeclast_si128(b, keys[0]); + + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + + unsafe { aes128_decrypt1(block, &self.decrypt_keys) } + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes128_decrypt8(blocks: &mut ParBlocks, keys: &RoundKeys) { + let mut b = load8(blocks); + xor8(&mut b, keys[10]); + aesdec8(&mut b, keys[9]); + aesdec8(&mut b, keys[8]); + aesdec8(&mut b, keys[7]); + aesdec8(&mut b, keys[6]); + aesdec8(&mut b, keys[5]); + aesdec8(&mut b, keys[4]); + aesdec8(&mut b, keys[3]); + aesdec8(&mut b, keys[2]); + aesdec8(&mut b, keys[1]); + aesdeclast8(&mut b, keys[0]); + store8(blocks, b); + } + + unsafe { aes128_decrypt8(blocks, &self.decrypt_keys) } + } +} + +opaque_debug::implement!(Aes128); diff --git a/rust/vendor/aes/src/ni/aes128/expand.rs b/rust/vendor/aes/src/ni/aes128/expand.rs new file mode 100644 index 0000000..f7b65b6 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes128/expand.rs @@ -0,0 +1,53 @@ +use super::RoundKeys; +use crate::ni::arch::*; + +use core::mem; + +macro_rules! 
expand_round { + ($enc_keys:expr, $dec_keys:expr, $pos:expr, $round:expr) => { + let mut t1 = $enc_keys[$pos - 1]; + let mut t2; + let mut t3; + + t2 = _mm_aeskeygenassist_si128(t1, $round); + t2 = _mm_shuffle_epi32(t2, 0xff); + t3 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t3); + t3 = _mm_slli_si128(t3, 0x4); + t1 = _mm_xor_si128(t1, t3); + t3 = _mm_slli_si128(t3, 0x4); + t1 = _mm_xor_si128(t1, t3); + t1 = _mm_xor_si128(t1, t2); + + $enc_keys[$pos] = t1; + let t1 = if $pos != 10 { _mm_aesimc_si128(t1) } else { t1 }; + $dec_keys[$pos] = t1; + }; +} + +#[inline(always)] +pub(super) fn expand(key: &[u8; 16]) -> (RoundKeys, RoundKeys) { + unsafe { + let mut enc_keys: RoundKeys = mem::zeroed(); + let mut dec_keys: RoundKeys = mem::zeroed(); + + // Safety: `loadu` supports unaligned loads + #[allow(clippy::cast_ptr_alignment)] + let k = _mm_loadu_si128(key.as_ptr() as *const __m128i); + enc_keys[0] = k; + dec_keys[0] = k; + + expand_round!(enc_keys, dec_keys, 1, 0x01); + expand_round!(enc_keys, dec_keys, 2, 0x02); + expand_round!(enc_keys, dec_keys, 3, 0x04); + expand_round!(enc_keys, dec_keys, 4, 0x08); + expand_round!(enc_keys, dec_keys, 5, 0x10); + expand_round!(enc_keys, dec_keys, 6, 0x20); + expand_round!(enc_keys, dec_keys, 7, 0x40); + expand_round!(enc_keys, dec_keys, 8, 0x80); + expand_round!(enc_keys, dec_keys, 9, 0x1B); + expand_round!(enc_keys, dec_keys, 10, 0x36); + + (enc_keys, dec_keys) + } +} diff --git a/rust/vendor/aes/src/ni/aes128/test_expand.rs b/rust/vendor/aes/src/ni/aes128/test_expand.rs new file mode 100644 index 0000000..38744e6 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes128/test_expand.rs @@ -0,0 +1,107 @@ +use super::expand::expand; +use crate::ni::utils::check; + +#[test] +fn test() { + let enc_keys = expand(&[0x00; 16]).0; + check( + &enc_keys, + &[ + [0x0000000000000000, 0x0000000000000000], + [0x6263636362636363, 0x6263636362636363], + [0x9b9898c9f9fbfbaa, 0x9b9898c9f9fbfbaa], + [0x90973450696ccffa, 0xf2f457330b0fac99], + [0xee06da7b876a1581, 0x759e42b27e91ee2b], + [0x7f2e2b88f8443e09, 0x8dda7cbbf34b9290], + [0xec614b851425758c, 0x99ff09376ab49ba7], + [0x217517873550620b, 0xacaf6b3cc61bf09b], + [0x0ef903333ba96138, 0x97060a04511dfa9f], + [0xb1d4d8e28a7db9da, 0x1d7bb3de4c664941], + [0xb4ef5bcb3e92e211, 0x23e951cf6f8f188e], + ], + ); + + let enc_keys = expand(&[0xff; 16]).0; + check( + &enc_keys, + &[ + [0xffffffffffffffff, 0xffffffffffffffff], + [0xe8e9e9e917161616, 0xe8e9e9e917161616], + [0xadaeae19bab8b80f, 0x525151e6454747f0], + [0x090e2277b3b69a78, 0xe1e7cb9ea4a08c6e], + [0xe16abd3e52dc2746, 0xb33becd8179b60b6], + [0xe5baf3ceb766d488, 0x045d385013c658e6], + [0x71d07db3c6b6a93b, 0xc2eb916bd12dc98d], + [0xe90d208d2fbb89b6, 0xed5018dd3c7dd150], + [0x96337366b988fad0, 0x54d8e20d68a5335d], + [0x8bf03f233278c5f3, 0x66a027fe0e0514a3], + [0xd60a3588e472f07b, 0x82d2d7858cd7c326], + ], + ); + + let enc_keys = expand(&[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, + ]) + .0; + check( + &enc_keys, + &[ + [0x0001020304050607, 0x08090a0b0c0d0e0f], + [0xd6aa74fdd2af72fa, 0xdaa678f1d6ab76fe], + [0xb692cf0b643dbdf1, 0xbe9bc5006830b3fe], + [0xb6ff744ed2c2c9bf, 0x6c590cbf0469bf41], + [0x47f7f7bc95353e03, 0xf96c32bcfd058dfd], + [0x3caaa3e8a99f9deb, 0x50f3af57adf622aa], + [0x5e390f7df7a69296, 0xa7553dc10aa31f6b], + [0x14f9701ae35fe28c, 0x440adf4d4ea9c026], + [0x47438735a41c65b9, 0xe016baf4aebf7ad2], + [0x549932d1f0855768, 0x1093ed9cbe2c974e], + [0x13111d7fe3944a17, 0xf307a78b4d2b30c5], + ], + ); + + let enc_keys = 
expand(&[ + 0x69, 0x20, 0xe2, 0x99, 0xa5, 0x20, 0x2a, 0x6d, 0x65, 0x6e, 0x63, 0x68, 0x69, 0x74, 0x6f, + 0x2a, + ]) + .0; + check( + &enc_keys, + &[ + [0x6920e299a5202a6d, 0x656e636869746f2a], + [0xfa8807605fa82d0d, 0x3ac64e6553b2214f], + [0xcf75838d90ddae80, 0xaa1be0e5f9a9c1aa], + [0x180d2f1488d08194, 0x22cb6171db62a0db], + [0xbaed96ad323d1739, 0x10f67648cb94d693], + [0x881b4ab2ba265d8b, 0xaad02bc36144fd50], + [0xb34f195d096944d6, 0xa3b96f15c2fd9245], + [0xa7007778ae6933ae, 0x0dd05cbbcf2dcefe], + [0xff8bccf251e2ff5c, 0x5c32a3e7931f6d19], + [0x24b7182e7555e772, 0x29674495ba78298c], + [0xae127cdadb479ba8, 0xf220df3d4858f6b1], + ], + ); + + let enc_keys = expand(&[ + 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, + 0x3c, + ]) + .0; + check( + &enc_keys, + &[ + [0x2b7e151628aed2a6, 0xabf7158809cf4f3c], + [0xa0fafe1788542cb1, 0x23a339392a6c7605], + [0xf2c295f27a96b943, 0x5935807a7359f67f], + [0x3d80477d4716fe3e, 0x1e237e446d7a883b], + [0xef44a541a8525b7f, 0xb671253bdb0bad00], + [0xd4d1c6f87c839d87, 0xcaf2b8bc11f915bc], + [0x6d88a37a110b3efd, 0xdbf98641ca0093fd], + [0x4e54f70e5f5fc9f3, 0x84a64fb24ea6dc4f], + [0xead27321b58dbad2, 0x312bf5607f8d292f], + [0xac7766f319fadc21, 0x28d12941575c006e], + [0xd014f9a8c9ee2589, 0xe13f0cc8b6630ca6], + ], + ); +} diff --git a/rust/vendor/aes/src/ni/aes192.rs b/rust/vendor/aes/src/ni/aes192.rs new file mode 100644 index 0000000..fb64289 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes192.rs @@ -0,0 +1,169 @@ +use super::{ + arch::*, + utils::{aesdec8, aesdeclast8, aesenc8, aesenclast8, load8, store8, xor8, U128x8}, +}; +use crate::{Block, ParBlocks}; +use cipher::{ + consts::{U16, U24, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; + +mod expand; +#[cfg(test)] +mod test_expand; + +/// AES-192 round keys +type RoundKeys = [__m128i; 13]; + +/// AES-192 block cipher +#[derive(Clone)] +pub struct Aes192 { + encrypt_keys: RoundKeys, + decrypt_keys: RoundKeys, +} + +impl Aes192 { + #[inline(always)] + pub(crate) fn encrypt8(&self, mut blocks: U128x8) -> U128x8 { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni192_encrypt8(keys: &RoundKeys, blocks: &mut U128x8) { + xor8(blocks, keys[0]); + aesenc8(blocks, keys[1]); + aesenc8(blocks, keys[2]); + aesenc8(blocks, keys[3]); + aesenc8(blocks, keys[4]); + aesenc8(blocks, keys[5]); + aesenc8(blocks, keys[6]); + aesenc8(blocks, keys[7]); + aesenc8(blocks, keys[8]); + aesenc8(blocks, keys[9]); + aesenc8(blocks, keys[10]); + aesenc8(blocks, keys[11]); + aesenclast8(blocks, keys[12]); + } + unsafe { aesni192_encrypt8(&self.encrypt_keys, &mut blocks) }; + blocks + } + + #[inline(always)] + pub(crate) fn encrypt(&self, block: __m128i) -> __m128i { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni192_encrypt1(keys: &RoundKeys, mut block: __m128i) -> __m128i { + block = _mm_xor_si128(block, keys[0]); + block = _mm_aesenc_si128(block, keys[1]); + block = _mm_aesenc_si128(block, keys[2]); + block = _mm_aesenc_si128(block, keys[3]); + block = _mm_aesenc_si128(block, keys[4]); + block = _mm_aesenc_si128(block, keys[5]); + block = _mm_aesenc_si128(block, keys[6]); + block = _mm_aesenc_si128(block, keys[7]); + block = _mm_aesenc_si128(block, keys[8]); + block = _mm_aesenc_si128(block, keys[9]); + block = _mm_aesenc_si128(block, keys[10]); + block = _mm_aesenc_si128(block, keys[11]); + _mm_aesenclast_si128(block, keys[12]) + } + unsafe { aesni192_encrypt1(&self.encrypt_keys, block) } + } +} + +impl NewBlockCipher 
for Aes192 { + type KeySize = U24; + + #[inline] + fn new(key: &GenericArray<u8, U24>) -> Self { + let key = unsafe { &*(key as *const _ as *const [u8; 24]) }; + let (encrypt_keys, decrypt_keys) = expand::expand(key); + Self { + encrypt_keys, + decrypt_keys, + } + } +} + +impl BlockCipher for Aes192 { + type BlockSize = U16; + type ParBlocks = U8; +} + +impl BlockEncrypt for Aes192 { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + unsafe { + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let b = self.encrypt(b); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + let b = self.encrypt8(load8(blocks)); + store8(blocks, b); + } +} + +impl BlockDecrypt for Aes192 { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes192_decrypt1(block: &mut Block, keys: &RoundKeys) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + + b = _mm_xor_si128(b, keys[12]); + b = _mm_aesdec_si128(b, keys[11]); + b = _mm_aesdec_si128(b, keys[10]); + b = _mm_aesdec_si128(b, keys[9]); + b = _mm_aesdec_si128(b, keys[8]); + b = _mm_aesdec_si128(b, keys[7]); + b = _mm_aesdec_si128(b, keys[6]); + b = _mm_aesdec_si128(b, keys[5]); + b = _mm_aesdec_si128(b, keys[4]); + b = _mm_aesdec_si128(b, keys[3]); + b = _mm_aesdec_si128(b, keys[2]); + b = _mm_aesdec_si128(b, keys[1]); + b = _mm_aesdeclast_si128(b, keys[0]); + + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + + unsafe { aes192_decrypt1(block, &self.decrypt_keys) } + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes192_decrypt8(blocks: &mut ParBlocks, keys: &RoundKeys) { + let mut b = load8(blocks); + xor8(&mut b, keys[12]); + aesdec8(&mut b, keys[11]); + aesdec8(&mut b, keys[10]); + aesdec8(&mut b, keys[9]); + aesdec8(&mut b, keys[8]); + aesdec8(&mut b, keys[7]); + aesdec8(&mut b, keys[6]); + aesdec8(&mut b, keys[5]); + aesdec8(&mut b, keys[4]); + aesdec8(&mut b, keys[3]); + aesdec8(&mut b, keys[2]); + aesdec8(&mut b, keys[1]); + aesdeclast8(&mut b, keys[0]); + store8(blocks, b); + } + + unsafe { aes192_decrypt8(blocks, &self.decrypt_keys) } + } +} + +opaque_debug::implement!(Aes192); diff --git a/rust/vendor/aes/src/ni/aes192/expand.rs b/rust/vendor/aes/src/ni/aes192/expand.rs new file mode 100644 index 0000000..797b986 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes192/expand.rs @@ -0,0 +1,108 @@ +use super::RoundKeys; +use crate::ni::arch::*; + +use core::{mem, ptr}; + +macro_rules! expand_round { + ($t1:expr, $t3:expr, $round:expr) => {{ + let mut t1 = $t1; + let mut t2; + let mut t3 = $t3; + let mut t4; + + t2 = _mm_aeskeygenassist_si128(t3, $round); + t2 = _mm_shuffle_epi32(t2, 0x55); + t4 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t1 = _mm_xor_si128(t1, t2); + t2 = _mm_shuffle_epi32(t1, 0xff); + t4 = _mm_slli_si128(t3, 0x4); + t3 = _mm_xor_si128(t3, t4); + t3 = _mm_xor_si128(t3, t2); + + (t1, t3) + }}; +} + +macro_rules! 
shuffle { + ($a:expr, $b:expr, $imm:expr) => { + mem::transmute::<_, __m128i>(_mm_shuffle_pd(mem::transmute($a), mem::transmute($b), $imm)) + }; +} + +#[inline(always)] +pub(super) fn expand(key: &[u8; 24]) -> (RoundKeys, RoundKeys) { + unsafe { + let mut enc_keys: RoundKeys = mem::zeroed(); + let mut dec_keys: RoundKeys = mem::zeroed(); + + macro_rules! store { + ($i:expr, $k:expr) => { + enc_keys[$i] = $k; + dec_keys[$i] = _mm_aesimc_si128($k); + }; + } + + // we are being extra pedantic here to remove out-of-bound access. + // this should be optimized out into movups, movsd sequence + // note that unaligned load MUST be used here, even though we read + // from the array (compiler missoptimizes aligned load) + let (k0, k1l) = { + let mut t = [0u8; 32]; + ptr::write(t.as_mut_ptr() as *mut [u8; 24], *key); + + // Safety: `loadu` supports unaligned loads + #[allow(clippy::cast_ptr_alignment)] + ( + _mm_loadu_si128(t.as_ptr() as *const __m128i), + _mm_loadu_si128(t.as_ptr().offset(16) as *const __m128i), + ) + }; + + enc_keys[0] = k0; + dec_keys[0] = k0; + + let (k1_2, k2r) = expand_round!(k0, k1l, 0x01); + let k1 = shuffle!(k1l, k1_2, 0); + let k2 = shuffle!(k1_2, k2r, 1); + store!(1, k1); + store!(2, k2); + + let (k3, k4l) = expand_round!(k1_2, k2r, 0x02); + store!(3, k3); + + let (k4_5, k5r) = expand_round!(k3, k4l, 0x04); + let k4 = shuffle!(k4l, k4_5, 0); + let k5 = shuffle!(k4_5, k5r, 1); + store!(4, k4); + store!(5, k5); + + let (k6, k7l) = expand_round!(k4_5, k5r, 0x08); + store!(6, k6); + + let (k7_8, k8r) = expand_round!(k6, k7l, 0x10); + let k7 = shuffle!(k7l, k7_8, 0); + let k8 = shuffle!(k7_8, k8r, 1); + store!(7, k7); + store!(8, k8); + + let (k9, k10l) = expand_round!(k7_8, k8r, 0x20); + store!(9, k9); + + let (k10_11, k11r) = expand_round!(k9, k10l, 0x40); + let k10 = shuffle!(k10l, k10_11, 0); + let k11 = shuffle!(k10_11, k11r, 1); + store!(10, k10); + store!(11, k11); + + let (k12, _) = expand_round!(k10_11, k11r, 0x80); + enc_keys[12] = k12; + dec_keys[12] = k12; + + (enc_keys, dec_keys) + } +} diff --git a/rust/vendor/aes/src/ni/aes192/test_expand.rs b/rust/vendor/aes/src/ni/aes192/test_expand.rs new file mode 100644 index 0000000..7811d4c --- /dev/null +++ b/rust/vendor/aes/src/ni/aes192/test_expand.rs @@ -0,0 +1,93 @@ +use super::expand::expand; +use crate::ni::utils::check; + +#[test] +fn test() { + let enc_keys = expand(&[0x00; 24]).0; + check( + &enc_keys, + &[ + [0x0000000000000000, 0x0000000000000000], + [0x0000000000000000, 0x6263636362636363], + [0x6263636362636363, 0x6263636362636363], + [0x9b9898c9f9fbfbaa, 0x9b9898c9f9fbfbaa], + [0x9b9898c9f9fbfbaa, 0x90973450696ccffa], + [0xf2f457330b0fac99, 0x90973450696ccffa], + [0xc81d19a9a171d653, 0x53858160588a2df9], + [0xc81d19a9a171d653, 0x7bebf49bda9a22c8], + [0x891fa3a8d1958e51, 0x198897f8b8f941ab], + [0xc26896f718f2b43f, 0x91ed1797407899c6], + [0x59f00e3ee1094f95, 0x83ecbc0f9b1e0830], + [0x0af31fa74a8b8661, 0x137b885ff272c7ca], + [0x432ac886d834c0b6, 0xd2c7df11984c5970], + ], + ); + + let enc_keys = expand(&[0xff; 24]).0; + check( + &enc_keys, + &[ + [0xffffffffffffffff, 0xffffffffffffffff], + [0xffffffffffffffff, 0xe8e9e9e917161616], + [0xe8e9e9e917161616, 0xe8e9e9e917161616], + [0xadaeae19bab8b80f, 0x525151e6454747f0], + [0xadaeae19bab8b80f, 0xc5c2d8ed7f7a60e2], + [0x2d2b3104686c76f4, 0xc5c2d8ed7f7a60e2], + [0x1712403f686820dd, 0x454311d92d2f672d], + [0xe8edbfc09797df22, 0x8f8cd3b7e7e4f36a], + [0xa2a7e2b38f88859e, 0x67653a5ef0f2e57c], + [0x2655c33bc1b13051, 0x6316d2e2ec9e577c], + [0x8bfb6d227b09885e, 
0x67919b1aa620ab4b], + [0xc53679a929a82ed5, 0xa25343f7d95acba9], + [0x598e482fffaee364, 0x3a989acd1330b418], + ], + ); + + let enc_keys = expand(&[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + ]) + .0; + check( + &enc_keys, + &[ + [0x0001020304050607, 0x08090a0b0c0d0e0f], + [0x1011121314151617, 0x5846f2f95c43f4fe], + [0x544afef55847f0fa, 0x4856e2e95c43f4fe], + [0x40f949b31cbabd4d, 0x48f043b810b7b342], + [0x58e151ab04a2a555, 0x7effb5416245080c], + [0x2ab54bb43a02f8f6, 0x62e3a95d66410c08], + [0xf501857297448d7e, 0xbdf1c6ca87f33e3c], + [0xe510976183519b69, 0x34157c9ea351f1e0], + [0x1ea0372a99530916, 0x7c439e77ff12051e], + [0xdd7e0e887e2fff68, 0x608fc842f9dcc154], + [0x859f5f237a8d5a3d, 0xc0c02952beefd63a], + [0xde601e7827bcdf2c, 0xa223800fd8aeda32], + [0xa4970a331a78dc09, 0xc418c271e3a41d5d], + ], + ); + + let enc_keys = expand(&[ + 0x8e, 0x73, 0xb0, 0xf7, 0xda, 0x0e, 0x64, 0x52, 0xc8, 0x10, 0xf3, 0x2b, 0x80, 0x90, 0x79, + 0xe5, 0x62, 0xf8, 0xea, 0xd2, 0x52, 0x2c, 0x6b, 0x7b, + ]) + .0; + check( + &enc_keys, + &[ + [0x8e73b0f7da0e6452, 0xc810f32b809079e5], + [0x62f8ead2522c6b7b, 0xfe0c91f72402f5a5], + [0xec12068e6c827f6b, 0x0e7a95b95c56fec2], + [0x4db7b4bd69b54118, 0x85a74796e92538fd], + [0xe75fad44bb095386, 0x485af05721efb14f], + [0xa448f6d94d6dce24, 0xaa326360113b30e6], + [0xa25e7ed583b1cf9a, 0x27f939436a94f767], + [0xc0a69407d19da4e1, 0xec1786eb6fa64971], + [0x485f703222cb8755, 0xe26d135233f0b7b3], + [0x40beeb282f18a259, 0x6747d26b458c553e], + [0xa7e1466c9411f1df, 0x821f750aad07d753], + [0xca4005388fcc5006, 0x282d166abc3ce7b5], + [0xe98ba06f448c773c, 0x8ecc720401002202], + ], + ); +} diff --git a/rust/vendor/aes/src/ni/aes256.rs b/rust/vendor/aes/src/ni/aes256.rs new file mode 100644 index 0000000..9a752c1 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes256.rs @@ -0,0 +1,177 @@ +use super::{ + arch::*, + utils::{aesdec8, aesdeclast8, aesenc8, aesenclast8, load8, store8, xor8, U128x8}, +}; +use crate::{Block, ParBlocks}; +use cipher::{ + consts::{U16, U32, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; + +mod expand; +#[cfg(test)] +mod test_expand; + +/// AES-256 round keys +type RoundKeys = [__m128i; 15]; + +/// AES-256 block cipher +#[derive(Clone)] +pub struct Aes256 { + encrypt_keys: RoundKeys, + decrypt_keys: RoundKeys, +} + +impl Aes256 { + #[inline(always)] + pub(crate) fn encrypt8(&self, mut blocks: U128x8) -> U128x8 { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni256_encrypt8(keys: &RoundKeys, blocks: &mut U128x8) { + xor8(blocks, keys[0]); + aesenc8(blocks, keys[1]); + aesenc8(blocks, keys[2]); + aesenc8(blocks, keys[3]); + aesenc8(blocks, keys[4]); + aesenc8(blocks, keys[5]); + aesenc8(blocks, keys[6]); + aesenc8(blocks, keys[7]); + aesenc8(blocks, keys[8]); + aesenc8(blocks, keys[9]); + aesenc8(blocks, keys[10]); + aesenc8(blocks, keys[11]); + aesenc8(blocks, keys[12]); + aesenc8(blocks, keys[13]); + aesenclast8(blocks, keys[14]); + } + unsafe { aesni256_encrypt8(&self.encrypt_keys, &mut blocks) }; + blocks + } + + #[inline(always)] + pub(crate) fn encrypt(&self, block: __m128i) -> __m128i { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aesni256_encrypt1(keys: &RoundKeys, mut block: __m128i) -> __m128i { + block = _mm_xor_si128(block, keys[0]); + block = _mm_aesenc_si128(block, keys[1]); + block = _mm_aesenc_si128(block, keys[2]); + block = _mm_aesenc_si128(block, keys[3]); + block = 
_mm_aesenc_si128(block, keys[4]); + block = _mm_aesenc_si128(block, keys[5]); + block = _mm_aesenc_si128(block, keys[6]); + block = _mm_aesenc_si128(block, keys[7]); + block = _mm_aesenc_si128(block, keys[8]); + block = _mm_aesenc_si128(block, keys[9]); + block = _mm_aesenc_si128(block, keys[10]); + block = _mm_aesenc_si128(block, keys[11]); + block = _mm_aesenc_si128(block, keys[12]); + block = _mm_aesenc_si128(block, keys[13]); + _mm_aesenclast_si128(block, keys[14]) + } + unsafe { aesni256_encrypt1(&self.encrypt_keys, block) } + } +} + +impl NewBlockCipher for Aes256 { + type KeySize = U32; + + #[inline] + fn new(key: &GenericArray<u8, U32>) -> Self { + let key = unsafe { &*(key as *const _ as *const [u8; 32]) }; + let (encrypt_keys, decrypt_keys) = expand::expand(key); + Self { + encrypt_keys, + decrypt_keys, + } + } +} + +impl BlockCipher for Aes256 { + type BlockSize = U16; + type ParBlocks = U8; +} + +impl BlockEncrypt for Aes256 { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + unsafe { + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let b = self.encrypt(b); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + let b = self.encrypt8(load8(blocks)); + store8(blocks, b); + } +} + +impl BlockDecrypt for Aes256 { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes256_decrypt1(block: &mut Block, keys: &RoundKeys) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + let mut b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + + b = _mm_xor_si128(b, keys[14]); + b = _mm_aesdec_si128(b, keys[13]); + b = _mm_aesdec_si128(b, keys[12]); + b = _mm_aesdec_si128(b, keys[11]); + b = _mm_aesdec_si128(b, keys[10]); + b = _mm_aesdec_si128(b, keys[9]); + b = _mm_aesdec_si128(b, keys[8]); + b = _mm_aesdec_si128(b, keys[7]); + b = _mm_aesdec_si128(b, keys[6]); + b = _mm_aesdec_si128(b, keys[5]); + b = _mm_aesdec_si128(b, keys[4]); + b = _mm_aesdec_si128(b, keys[3]); + b = _mm_aesdec_si128(b, keys[2]); + b = _mm_aesdec_si128(b, keys[1]); + b = _mm_aesdeclast_si128(b, keys[0]); + + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, b); + } + + unsafe { aes256_decrypt1(block, &self.decrypt_keys) } + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + #[inline] + #[target_feature(enable = "aes")] + unsafe fn aes256_decrypt8(blocks: &mut ParBlocks, keys: &RoundKeys) { + let mut b = load8(blocks); + xor8(&mut b, keys[14]); + aesdec8(&mut b, keys[13]); + aesdec8(&mut b, keys[12]); + aesdec8(&mut b, keys[11]); + aesdec8(&mut b, keys[10]); + aesdec8(&mut b, keys[9]); + aesdec8(&mut b, keys[8]); + aesdec8(&mut b, keys[7]); + aesdec8(&mut b, keys[6]); + aesdec8(&mut b, keys[5]); + aesdec8(&mut b, keys[4]); + aesdec8(&mut b, keys[3]); + aesdec8(&mut b, keys[2]); + aesdec8(&mut b, keys[1]); + aesdeclast8(&mut b, keys[0]); + store8(blocks, b); + } + + unsafe { aes256_decrypt8(blocks, &self.decrypt_keys) } + } +} + +opaque_debug::implement!(Aes256); diff --git a/rust/vendor/aes/src/ni/aes256/expand.rs b/rust/vendor/aes/src/ni/aes256/expand.rs new file mode 100644 index 0000000..88b8558 --- /dev/null +++ b/rust/vendor/aes/src/ni/aes256/expand.rs @@ -0,0 +1,89 @@ +use 
super::RoundKeys; +use crate::ni::arch::*; + +use core::mem; + +macro_rules! expand_round { + ($enc_keys:expr, $dec_keys:expr, $pos:expr, $round:expr) => { + let mut t1 = $enc_keys[$pos - 2]; + let mut t2; + let mut t3 = $enc_keys[$pos - 1]; + let mut t4; + + t2 = _mm_aeskeygenassist_si128(t3, $round); + t2 = _mm_shuffle_epi32(t2, 0xff); + t4 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t1 = _mm_xor_si128(t1, t2); + + $enc_keys[$pos] = t1; + $dec_keys[$pos] = _mm_aesimc_si128(t1); + + t4 = _mm_aeskeygenassist_si128(t1, 0x00); + t2 = _mm_shuffle_epi32(t4, 0xaa); + t4 = _mm_slli_si128(t3, 0x4); + t3 = _mm_xor_si128(t3, t4); + t4 = _mm_slli_si128(t4, 0x4); + t3 = _mm_xor_si128(t3, t4); + t4 = _mm_slli_si128(t4, 0x4); + t3 = _mm_xor_si128(t3, t4); + t3 = _mm_xor_si128(t3, t2); + + $enc_keys[$pos + 1] = t3; + $dec_keys[$pos + 1] = _mm_aesimc_si128(t3); + }; +} + +macro_rules! expand_round_last { + ($enc_keys:expr, $dec_keys:expr, $pos:expr, $round:expr) => { + let mut t1 = $enc_keys[$pos - 2]; + let mut t2; + let t3 = $enc_keys[$pos - 1]; + let mut t4; + + t2 = _mm_aeskeygenassist_si128(t3, $round); + t2 = _mm_shuffle_epi32(t2, 0xff); + t4 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t1 = _mm_xor_si128(t1, t2); + + $enc_keys[$pos] = t1; + $dec_keys[$pos] = t1; + }; +} + +#[inline(always)] +pub(super) fn expand(key: &[u8; 32]) -> (RoundKeys, RoundKeys) { + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + unsafe { + let mut enc_keys: RoundKeys = mem::zeroed(); + let mut dec_keys: RoundKeys = mem::zeroed(); + + let kp = key.as_ptr() as *const __m128i; + let k1 = _mm_loadu_si128(kp); + let k2 = _mm_loadu_si128(kp.offset(1)); + enc_keys[0] = k1; + dec_keys[0] = k1; + enc_keys[1] = k2; + dec_keys[1] = _mm_aesimc_si128(k2); + + expand_round!(enc_keys, dec_keys, 2, 0x01); + expand_round!(enc_keys, dec_keys, 4, 0x02); + expand_round!(enc_keys, dec_keys, 6, 0x04); + expand_round!(enc_keys, dec_keys, 8, 0x08); + expand_round!(enc_keys, dec_keys, 10, 0x10); + expand_round!(enc_keys, dec_keys, 12, 0x20); + expand_round_last!(enc_keys, dec_keys, 14, 0x40); + + (enc_keys, dec_keys) + } +} diff --git a/rust/vendor/aes/src/ni/aes256/test_expand.rs b/rust/vendor/aes/src/ni/aes256/test_expand.rs new file mode 100644 index 0000000..52e728f --- /dev/null +++ b/rust/vendor/aes/src/ni/aes256/test_expand.rs @@ -0,0 +1,103 @@ +use super::expand::expand; +use crate::ni::utils::check; + +#[test] +fn test() { + let enc_keys = expand(&[0x00; 32]).0; + check( + &enc_keys, + &[ + [0x0000000000000000, 0x0000000000000000], + [0x0000000000000000, 0x0000000000000000], + [0x6263636362636363, 0x6263636362636363], + [0xaafbfbfbaafbfbfb, 0xaafbfbfbaafbfbfb], + [0x6f6c6ccf0d0f0fac, 0x6f6c6ccf0d0f0fac], + [0x7d8d8d6ad7767691, 0x7d8d8d6ad7767691], + [0x5354edc15e5be26d, 0x31378ea23c38810e], + [0x968a81c141fcf750, 0x3c717a3aeb070cab], + [0x9eaa8f28c0f16d45, 0xf1c6e3e7cdfe62e9], + [0x2b312bdf6acddc8f, 0x56bca6b5bdbbaa1e], + [0x6406fd52a4f79017, 0x553173f098cf1119], + [0x6dbba90b07767584, 0x51cad331ec71792f], + [0xe7b0e89c4347788b, 0x16760b7b8eb91a62], + [0x74ed0ba1739b7e25, 0x2251ad14ce20d43b], + [0x10f80a1753bf729c, 0x45c979e7cb706385], + ], + ); + + let enc_keys = expand(&[0xff; 32]).0; + check( + &enc_keys, + &[ + 
[0xffffffffffffffff, 0xffffffffffffffff], + [0xffffffffffffffff, 0xffffffffffffffff], + [0xe8e9e9e917161616, 0xe8e9e9e917161616], + [0x0fb8b8b8f0474747, 0x0fb8b8b8f0474747], + [0x4a4949655d5f5f73, 0xb5b6b69aa2a0a08c], + [0x355858dcc51f1f9b, 0xcaa7a7233ae0e064], + [0xafa80ae5f2f75596, 0x4741e30ce5e14380], + [0xeca0421129bf5d8a, 0xe318faa9d9f81acd], + [0xe60ab7d014fde246, 0x53bc014ab65d42ca], + [0xa2ec6e658b5333ef, 0x684bc946b1b3d38b], + [0x9b6c8a188f91685e, 0xdc2d69146a702bde], + [0xa0bd9f782beeac97, 0x43a565d1f216b65a], + [0xfc22349173b35ccf, 0xaf9e35dbc5ee1e05], + [0x0695ed132d7b4184, 0x6ede24559cc8920f], + [0x546d424f27de1e80, 0x88402b5b4dae355e], + ], + ); + + let enc_keys = expand(&[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, + 0x1e, 0x1f, + ]) + .0; + check( + &enc_keys, + &[ + [0x0001020304050607, 0x08090a0b0c0d0e0f], + [0x1011121314151617, 0x18191a1b1c1d1e1f], + [0xa573c29fa176c498, 0xa97fce93a572c09c], + [0x1651a8cd0244beda, 0x1a5da4c10640bade], + [0xae87dff00ff11b68, 0xa68ed5fb03fc1567], + [0x6de1f1486fa54f92, 0x75f8eb5373b8518d], + [0xc656827fc9a79917, 0x6f294cec6cd5598b], + [0x3de23a75524775e7, 0x27bf9eb45407cf39], + [0x0bdc905fc27b0948, 0xad5245a4c1871c2f], + [0x45f5a66017b2d387, 0x300d4d33640a820a], + [0x7ccff71cbeb4fe54, 0x13e6bbf0d261a7df], + [0xf01afafee7a82979, 0xd7a5644ab3afe640], + [0x2541fe719bf50025, 0x8813bbd55a721c0a], + [0x4e5a6699a9f24fe0, 0x7e572baacdf8cdea], + [0x24fc79ccbf0979e9, 0x371ac23c6d68de36], + ], + ); + + let enc_keys = expand(&[ + 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, + 0x81, 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, + 0xdf, 0xf4, + ]) + .0; + check( + &enc_keys, + &[ + [0x603deb1015ca71be, 0x2b73aef0857d7781], + [0x1f352c073b6108d7, 0x2d9810a30914dff4], + [0x9ba354118e6925af, 0xa51a8b5f2067fcde], + [0xa8b09c1a93d194cd, 0xbe49846eb75d5b9a], + [0xd59aecb85bf3c917, 0xfee94248de8ebe96], + [0xb5a9328a2678a647, 0x983122292f6c79b3], + [0x812c81addadf48ba, 0x24360af2fab8b464], + [0x98c5bfc9bebd198e, 0x268c3ba709e04214], + [0x68007bacb2df3316, 0x96e939e46c518d80], + [0xc814e20476a9fb8a, 0x5025c02d59c58239], + [0xde1369676ccc5a71, 0xfa2563959674ee15], + [0x5886ca5d2e2f31d7, 0x7e0af1fa27cf73c3], + [0x749c47ab18501dda, 0xe2757e4f7401905a], + [0xcafaaae3e4d59b34, 0x9adf6acebd10190d], + [0xfe4890d1e6188d0b, 0x046df344706c631e], + ], + ); +} diff --git a/rust/vendor/aes/src/ni/ctr.rs b/rust/vendor/aes/src/ni/ctr.rs new file mode 100644 index 0000000..cb78910 --- /dev/null +++ b/rust/vendor/aes/src/ni/ctr.rs @@ -0,0 +1,229 @@ +//! AES in counter mode (a.k.a. 
AES-CTR) + +// TODO(tarcieri): support generic CTR API + +#![allow(clippy::unreadable_literal)] + +use super::arch::*; +use core::mem; + +use super::{Aes128, Aes192, Aes256}; +use crate::BLOCK_SIZE; +use cipher::{ + consts::U16, + errors::{LoopError, OverflowError}, + generic_array::GenericArray, + BlockCipher, FromBlockCipher, SeekNum, StreamCipher, StreamCipherSeek, +}; + +const PAR_BLOCKS: usize = 8; +const PAR_BLOCKS_SIZE: usize = PAR_BLOCKS * BLOCK_SIZE; + +#[inline(always)] +pub fn xor(buf: &mut [u8], key: &[u8]) { + debug_assert_eq!(buf.len(), key.len()); + for (a, b) in buf.iter_mut().zip(key) { + *a ^= *b; + } +} + +#[inline(always)] +fn xor_block8(buf: &mut [u8], ctr: [__m128i; 8]) { + debug_assert_eq!(buf.len(), PAR_BLOCKS_SIZE); + + // Safety: `loadu` and `storeu` support unaligned access + #[allow(clippy::cast_ptr_alignment)] + unsafe { + // compiler should unroll this loop + for i in 0..8 { + let ptr = buf.as_mut_ptr().offset(16 * i) as *mut __m128i; + let data = _mm_loadu_si128(ptr); + let data = _mm_xor_si128(data, ctr[i as usize]); + _mm_storeu_si128(ptr, data); + } + } +} + +#[inline(always)] +fn swap_bytes(v: __m128i) -> __m128i { + unsafe { + let mask = _mm_set_epi64x(0x08090a0b0c0d0e0f, 0x0001020304050607); + _mm_shuffle_epi8(v, mask) + } +} + +#[inline(always)] +fn inc_be(v: __m128i) -> __m128i { + unsafe { _mm_add_epi64(v, _mm_set_epi64x(1, 0)) } +} + +#[inline(always)] +fn load(val: &GenericArray<u8, U16>) -> __m128i { + // Safety: `loadu` supports unaligned loads + #[allow(clippy::cast_ptr_alignment)] + unsafe { + _mm_loadu_si128(val.as_ptr() as *const __m128i) + } +} + +macro_rules! impl_ctr { + ($name:ident, $cipher:ty, $doc:expr) => { + #[doc=$doc] + #[derive(Clone)] + #[cfg_attr(docsrs, doc(cfg(feature = "ctr")))] + pub struct $name { + nonce: __m128i, + ctr: __m128i, + cipher: $cipher, + block: [u8; BLOCK_SIZE], + pos: u8, + } + + impl $name { + #[inline(always)] + fn gen_block(&mut self) { + let block = self.cipher.encrypt(swap_bytes(self.ctr)); + self.block = unsafe { mem::transmute(block) } + } + + #[inline(always)] + fn next_block(&mut self) -> __m128i { + let block = swap_bytes(self.ctr); + self.ctr = inc_be(self.ctr); + self.cipher.encrypt(block) + } + + #[inline(always)] + fn next_block8(&mut self) -> [__m128i; 8] { + let mut ctr = self.ctr; + let mut block8: [__m128i; 8] = unsafe { mem::zeroed() }; + for i in 0..8 { + block8[i] = swap_bytes(ctr); + ctr = inc_be(ctr); + } + self.ctr = ctr; + + self.cipher.encrypt8(block8) + } + + #[inline(always)] + fn get_u64_ctr(&self) -> u64 { + let (ctr, nonce) = unsafe { + ( + mem::transmute::<__m128i, [u64; 2]>(self.ctr)[1], + mem::transmute::<__m128i, [u64; 2]>(self.nonce)[1], + ) + }; + ctr.wrapping_sub(nonce) + } + + /// Check if provided data will not overflow counter + #[inline(always)] + fn check_data_len(&self, data: &[u8]) -> Result<(), LoopError> { + let bs = BLOCK_SIZE; + let leftover_bytes = bs - self.pos as usize; + if data.len() < leftover_bytes { + return Ok(()); + } + let blocks = 1 + (data.len() - leftover_bytes) / bs; + self.get_u64_ctr() + .checked_add(blocks as u64) + .ok_or(LoopError) + .map(|_| ()) + } + } + + impl FromBlockCipher for $name { + type BlockCipher = $cipher; + type NonceSize = <$cipher as BlockCipher>::BlockSize; + + fn from_block_cipher( + cipher: $cipher, + nonce: &GenericArray<u8, Self::NonceSize>, + ) -> Self { + let nonce = swap_bytes(load(nonce)); + Self { + nonce, + ctr: nonce, + cipher, + block: [0u8; BLOCK_SIZE], + pos: 0, + } + } + } + + impl StreamCipher for $name { + 
#[inline] + fn try_apply_keystream(&mut self, mut data: &mut [u8]) -> Result<(), LoopError> { + self.check_data_len(data)?; + let bs = BLOCK_SIZE; + let pos = self.pos as usize; + debug_assert!(bs > pos); + + if pos != 0 { + if data.len() < bs - pos { + let n = pos + data.len(); + xor(data, &self.block[pos..n]); + self.pos = n as u8; + return Ok(()); + } else { + let (l, r) = data.split_at_mut(bs - pos); + data = r; + xor(l, &self.block[pos..]); + self.ctr = inc_be(self.ctr); + } + } + + let mut chunks = data.chunks_exact_mut(PAR_BLOCKS_SIZE); + for chunk in &mut chunks { + xor_block8(chunk, self.next_block8()); + } + data = chunks.into_remainder(); + + let mut chunks = data.chunks_exact_mut(bs); + for chunk in &mut chunks { + let block = self.next_block(); + + unsafe { + let t = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + let res = _mm_xor_si128(block, t); + _mm_storeu_si128(chunk.as_mut_ptr() as *mut __m128i, res); + } + } + + let rem = chunks.into_remainder(); + self.pos = rem.len() as u8; + if !rem.is_empty() { + self.gen_block(); + for (a, b) in rem.iter_mut().zip(&self.block) { + *a ^= *b; + } + } + + Ok(()) + } + } + + impl StreamCipherSeek for $name { + fn try_current_pos<T: SeekNum>(&self) -> Result<T, OverflowError> { + T::from_block_byte(self.get_u64_ctr(), self.pos, BLOCK_SIZE as u8) + } + + fn try_seek<T: SeekNum>(&mut self, pos: T) -> Result<(), LoopError> { + let res: (u64, u8) = pos.to_block_byte(BLOCK_SIZE as u8)?; + self.ctr = unsafe { _mm_add_epi64(self.nonce, _mm_set_epi64x(res.0 as i64, 0)) }; + self.pos = res.1; + if self.pos != 0 { + self.gen_block() + } + Ok(()) + } + } + + opaque_debug::implement!($name); + }; +} + +impl_ctr!(Aes128Ctr, Aes128, "AES-128 in CTR mode"); +impl_ctr!(Aes192Ctr, Aes192, "AES-192 in CTR mode"); +impl_ctr!(Aes256Ctr, Aes256, "AES-256 in CTR mode"); diff --git a/rust/vendor/aes/src/ni/hazmat.rs b/rust/vendor/aes/src/ni/hazmat.rs new file mode 100644 index 0000000..5188ad7 --- /dev/null +++ b/rust/vendor/aes/src/ni/hazmat.rs @@ -0,0 +1,86 @@ +//! Low-level "hazmat" AES functions: AES-NI support. +//! +//! Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256` +//! implementations in this crate, but instead provides raw AES-NI accelerated +//! access to the AES round function gated under the `hazmat` crate feature. + +use super::{ + arch::*, + utils::{load8, store8}, +}; +use crate::{Block, ParBlocks}; + +/// AES cipher (encrypt) round function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn cipher_round(block: &mut Block, round_key: &Block) { + // Safety: `loadu` and `storeu` support unaligned access + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let k = _mm_loadu_si128(round_key.as_ptr() as *const __m128i); + let out = _mm_aesenc_si128(b, k); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out); +} + +/// AES cipher (encrypt) round function: parallel version. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + let xmm_keys = load8(round_keys); + let mut xmm_blocks = load8(blocks); + + for i in 0..8 { + xmm_blocks[i] = _mm_aesenc_si128(xmm_blocks[i], xmm_keys[i]); + } + + store8(blocks, xmm_blocks); +} + +/// AES cipher (encrypt) round function. 
+#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { + // Safety: `loadu` and `storeu` support unaligned access + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let k = _mm_loadu_si128(round_key.as_ptr() as *const __m128i); + let out = _mm_aesdec_si128(b, k); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out); +} + +/// AES cipher (encrypt) round function: parallel version. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + let xmm_keys = load8(round_keys); + let mut xmm_blocks = load8(blocks); + + for i in 0..8 { + xmm_blocks[i] = _mm_aesdec_si128(xmm_blocks[i], xmm_keys[i]); + } + + store8(blocks, xmm_blocks); +} + +/// AES mix columns function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn mix_columns(block: &mut Block) { + // Safety: `loadu` and `storeu` support unaligned access + let mut state = _mm_loadu_si128(block.as_ptr() as *const __m128i); + + // Emulate mix columns by performing three inverse mix columns operations + state = _mm_aesimc_si128(state); + state = _mm_aesimc_si128(state); + state = _mm_aesimc_si128(state); + + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, state); +} + +/// AES inverse mix columns function. +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "aes")] +pub(crate) unsafe fn inv_mix_columns(block: &mut Block) { + // Safety: `loadu` and `storeu` support unaligned access + let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let out = _mm_aesimc_si128(b); + _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out); +} diff --git a/rust/vendor/aes/src/ni/utils.rs b/rust/vendor/aes/src/ni/utils.rs new file mode 100644 index 0000000..1fc3403 --- /dev/null +++ b/rust/vendor/aes/src/ni/utils.rs @@ -0,0 +1,90 @@ +//! 
Utility functions + +// TODO(tarcieri): check performance impact / generated assembly changes +#![allow(clippy::needless_range_loop)] + +use super::arch::*; +use crate::ParBlocks; + +pub type U128x8 = [__m128i; 8]; + +#[cfg(test)] +pub(crate) fn check(a: &[__m128i], b: &[[u64; 2]]) { + for (v1, v2) in a.iter().zip(b) { + let t1: [u64; 2] = unsafe { core::mem::transmute(*v1) }; + let t2 = [v2[0].to_be(), v2[1].to_be()]; + assert_eq!(t1, t2); + } +} + +#[inline(always)] +pub(crate) fn load8(blocks: &ParBlocks) -> U128x8 { + unsafe { + [ + _mm_loadu_si128(blocks[0].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[1].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[2].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[3].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[4].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[5].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[6].as_ptr() as *const __m128i), + _mm_loadu_si128(blocks[7].as_ptr() as *const __m128i), + ] + } +} + +#[inline(always)] +pub(crate) fn store8(blocks: &mut ParBlocks, b: U128x8) { + unsafe { + _mm_storeu_si128(blocks[0].as_mut_ptr() as *mut __m128i, b[0]); + _mm_storeu_si128(blocks[1].as_mut_ptr() as *mut __m128i, b[1]); + _mm_storeu_si128(blocks[2].as_mut_ptr() as *mut __m128i, b[2]); + _mm_storeu_si128(blocks[3].as_mut_ptr() as *mut __m128i, b[3]); + _mm_storeu_si128(blocks[4].as_mut_ptr() as *mut __m128i, b[4]); + _mm_storeu_si128(blocks[5].as_mut_ptr() as *mut __m128i, b[5]); + _mm_storeu_si128(blocks[6].as_mut_ptr() as *mut __m128i, b[6]); + _mm_storeu_si128(blocks[7].as_mut_ptr() as *mut __m128i, b[7]); + } +} + +#[inline(always)] +pub(crate) fn xor8(b: &mut U128x8, key: __m128i) { + unsafe { + b[0] = _mm_xor_si128(b[0], key); + b[1] = _mm_xor_si128(b[1], key); + b[2] = _mm_xor_si128(b[2], key); + b[3] = _mm_xor_si128(b[3], key); + b[4] = _mm_xor_si128(b[4], key); + b[5] = _mm_xor_si128(b[5], key); + b[6] = _mm_xor_si128(b[6], key); + b[7] = _mm_xor_si128(b[7], key); + } +} + +#[inline(always)] +pub(crate) fn aesenc8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesenc_si128(buffer[i], key) }; + } +} + +#[inline(always)] +pub(crate) fn aesenclast8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesenclast_si128(buffer[i], key) }; + } +} + +#[inline(always)] +pub(crate) fn aesdec8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesdec_si128(buffer[i], key) }; + } +} + +#[inline(always)] +pub(crate) fn aesdeclast8(buffer: &mut U128x8, key: __m128i) { + for i in 0..8 { + buffer[i] = unsafe { _mm_aesdeclast_si128(buffer[i], key) }; + } +} diff --git a/rust/vendor/aes/src/soft.rs b/rust/vendor/aes/src/soft.rs new file mode 100644 index 0000000..1b51d22 --- /dev/null +++ b/rust/vendor/aes/src/soft.rs @@ -0,0 +1,127 @@ +//! AES block cipher constant-time implementation. +//! +//! The implementation uses a technique called [fixslicing][1], an improved +//! form of bitslicing which represents ciphers in a way which enables +//! very efficient constant-time implementations in software. +//! +//! 
[1]: https://eprint.iacr.org/2020/1123.pdf + +#![deny(unsafe_code)] + +#[cfg_attr(not(target_pointer_width = "64"), path = "soft/fixslice32.rs")] +#[cfg_attr(target_pointer_width = "64", path = "soft/fixslice64.rs")] +pub(crate) mod fixslice; + +#[cfg(feature = "ctr")] +mod ctr; + +#[cfg(feature = "ctr")] +pub use self::ctr::{Aes128Ctr, Aes192Ctr, Aes256Ctr}; + +use crate::{Block, ParBlocks}; +use cipher::{ + consts::{U16, U24, U32, U8}, + generic_array::GenericArray, + BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher, +}; +use fixslice::{FixsliceKeys128, FixsliceKeys192, FixsliceKeys256, FIXSLICE_BLOCKS}; + +macro_rules! define_aes_impl { + ( + $name:ident, + $key_size:ty, + $fixslice_keys:ty, + $fixslice_key_schedule:path, + $fixslice_decrypt:path, + $fixslice_encrypt:path, + $doc:expr + ) => { + #[doc=$doc] + #[derive(Clone)] + pub struct $name { + keys: $fixslice_keys, + } + + impl NewBlockCipher for $name { + type KeySize = $key_size; + + #[inline] + fn new(key: &GenericArray<u8, $key_size>) -> Self { + Self { + keys: $fixslice_key_schedule(key), + } + } + } + + impl BlockCipher for $name { + type BlockSize = U16; + type ParBlocks = U8; + } + + impl BlockEncrypt for $name { + #[inline] + fn encrypt_block(&self, block: &mut Block) { + let mut blocks = [Block::default(); FIXSLICE_BLOCKS]; + blocks[0].copy_from_slice(block); + $fixslice_encrypt(&self.keys, &mut blocks); + block.copy_from_slice(&blocks[0]); + } + + #[inline] + fn encrypt_par_blocks(&self, blocks: &mut ParBlocks) { + for chunk in blocks.chunks_mut(FIXSLICE_BLOCKS) { + $fixslice_encrypt(&self.keys, chunk); + } + } + } + + impl BlockDecrypt for $name { + #[inline] + fn decrypt_block(&self, block: &mut Block) { + let mut blocks = [Block::default(); FIXSLICE_BLOCKS]; + blocks[0].copy_from_slice(block); + $fixslice_decrypt(&self.keys, &mut blocks); + block.copy_from_slice(&blocks[0]); + } + + #[inline] + fn decrypt_par_blocks(&self, blocks: &mut ParBlocks) { + for chunk in blocks.chunks_mut(FIXSLICE_BLOCKS) { + $fixslice_decrypt(&self.keys, chunk); + } + } + } + + opaque_debug::implement!($name); + }; +} + +define_aes_impl!( + Aes128, + U16, + FixsliceKeys128, + fixslice::aes128_key_schedule, + fixslice::aes128_decrypt, + fixslice::aes128_encrypt, + "AES-128 block cipher instance" +); + +define_aes_impl!( + Aes192, + U24, + FixsliceKeys192, + fixslice::aes192_key_schedule, + fixslice::aes192_decrypt, + fixslice::aes192_encrypt, + "AES-192 block cipher instance" +); + +define_aes_impl!( + Aes256, + U32, + FixsliceKeys256, + fixslice::aes256_key_schedule, + fixslice::aes256_decrypt, + fixslice::aes256_encrypt, + "AES-256 block cipher instance" +); diff --git a/rust/vendor/aes/src/soft/ctr.rs b/rust/vendor/aes/src/soft/ctr.rs new file mode 100644 index 0000000..a288ab1 --- /dev/null +++ b/rust/vendor/aes/src/soft/ctr.rs @@ -0,0 +1,17 @@ +//! AES in counter mode (a.k.a. 
AES-CTR) + +// TODO(tarcieri): support generic CTR API + +use super::{Aes128, Aes192, Aes256}; + +/// AES-128 in CTR mode +#[cfg_attr(docsrs, doc(cfg(feature = "ctr")))] +pub type Aes128Ctr = ::ctr::Ctr64BE<Aes128>; + +/// AES-192 in CTR mode +#[cfg_attr(docsrs, doc(cfg(feature = "ctr")))] +pub type Aes192Ctr = ::ctr::Ctr64BE<Aes192>; + +/// AES-256 in CTR mode +#[cfg_attr(docsrs, doc(cfg(feature = "ctr")))] +pub type Aes256Ctr = ::ctr::Ctr64BE<Aes256>; diff --git a/rust/vendor/aes/src/soft/fixslice32.rs b/rust/vendor/aes/src/soft/fixslice32.rs new file mode 100644 index 0000000..5dc4834 --- /dev/null +++ b/rust/vendor/aes/src/soft/fixslice32.rs @@ -0,0 +1,1485 @@ +//! Fixsliced implementations of AES-128, AES-192 and AES-256 (32-bit) +//! adapted from the C implementation +//! +//! All implementations are fully bitsliced and do not rely on any +//! Look-Up Table (LUT). +//! +//! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details. +//! +//! # Author (original C code) +//! +//! Alexandre Adomnicai, Nanyang Technological University, Singapore +//! <alexandre.adomnicai@ntu.edu.sg> +//! +//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission. + +#![allow(clippy::unreadable_literal)] + +use crate::Block; +use cipher::{ + consts::{U16, U24, U32}, + generic_array::GenericArray, +}; +use core::convert::TryInto; + +/// AES block batch size for this implementation +pub(crate) const FIXSLICE_BLOCKS: usize = 2; + +/// AES-128 round keys +pub(crate) type FixsliceKeys128 = [u32; 88]; + +/// AES-192 round keys +pub(crate) type FixsliceKeys192 = [u32; 104]; + +/// AES-256 round keys +pub(crate) type FixsliceKeys256 = [u32; 120]; + +/// 256-bit internal state +pub(crate) type State = [u32; 8]; + +/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation. +pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys128 { + let mut rkeys = [0u32; 88]; + + bitslice(&mut rkeys[..8], key, key); + + let mut rk_off = 0; + for rcon in 0..10 { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + if rcon < 8 { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + } else { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4); + } + + xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3)); + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..88).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (8..72).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[72..80]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..11 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation. 
+pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys192 { + let mut rkeys = [0u32; 104]; + let mut tmp = [0u32; 8]; + + bitslice(&mut rkeys[..8], &key[..16], &key[..16]); + bitslice(&mut tmp, &key[8..], &key[8..]); + + let mut rcon = 0; + let mut rk_off = 8; + + loop { + for i in 0..8 { + rkeys[rk_off + i] = + (0x0f0f0f0f & (tmp[i] >> 4)) | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4)); + } + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = rkeys[rk_off + i]; + ti ^= 0x30303030 & ror(tmp[i], ror_distance(1, 1)); + ti ^= 0xc0c0c0c0 & (ti << 2); + tmp[i] = ti; + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + for i in 0..8 { + let ui = tmp[i]; + let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) | (0xf0f0f0f0 & (ui << 4)); + ti ^= 0x03030303 & (ui >> 6); + tmp[i] = + ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6)); + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) + | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4)); + ti ^= 0x03030303 & ror(tmp[i], ror_distance(1, 3)); + rkeys[rk_off + i] = + ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6)); + } + rk_off += 8; + + if rcon >= 8 { + break; + } + + for i in 0..8 { + let ui = rkeys[(rk_off - 8) + i]; + let mut ti = rkeys[(rk_off - 16) + i]; + ti ^= 0x30303030 & (ui >> 2); + ti ^= 0xc0c0c0c0 & (ti << 2); + tmp[i] = ti; + } + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..104).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (0..96).step_by(32) { + inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]); + inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]); + } + } + + // Account for NOTs removed from sub_bytes + for i in 1..13 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation. 
+pub(crate) fn aes256_key_schedule(key: &GenericArray<u8, U32>) -> FixsliceKeys256 { + let mut rkeys = [0u32; 120]; + + bitslice(&mut rkeys[..8], &key[..16], &key[..16]); + bitslice(&mut rkeys[8..16], &key[16..], &key[16..]); + + let mut rk_off = 8; + + let mut rcon = 0; + loop { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3)); + rcon += 1; + + if rcon == 7 { + break; + } + + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3)); + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..120).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (8..104).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[104..112]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..15 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[80..]); + inv_sub_bytes(&mut state); + + #[cfg(not(feature = "compact"))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 72; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + + if rk_off == 80 { + break; + } + + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(feature = "compact"))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[80..]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[96..]); + inv_sub_bytes(&mut state); + + let mut rk_off = 88; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + if rk_off == 96 { + break; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[96..]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[112..]); + inv_sub_bytes(&mut state); + + #[cfg(not(feature = "compact"))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 104; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + + if rk_off == 112 { + break; + } + + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(feature = "compact"))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[112..]); + + inv_bitslice(&state, blocks); +} + +/// Note that the 4 bitwise NOT (^= 0xffffffff) are accounted for here so that it is a true +/// inverse of 'sub_bytes'. +fn inv_sub_bytes(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let t23 = u0 ^ u3; + let t8 = u1 ^ t23; + let m2 = t23 & t8; + let t4 = u4 ^ t8; + let t22 = u1 ^ u3; + let t2 = u0 ^ u1; + let t1 = u3 ^ u4; + // t23 -> stack + let t9 = u7 ^ t1; + // t8 -> stack + let m7 = t22 & t9; + // t9 -> stack + let t24 = u4 ^ u7; + // m7 -> stack + let t10 = t2 ^ t24; + // u4 -> stack + let m14 = t2 & t10; + let r5 = u6 ^ u7; + // m2 -> stack + let t3 = t1 ^ r5; + // t2 -> stack + let t13 = t2 ^ r5; + let t19 = t22 ^ r5; + // t3 -> stack + let t17 = u2 ^ t19; + // t4 -> stack + let t25 = u2 ^ t1; + let r13 = u1 ^ u6; + // t25 -> stack + let t20 = t24 ^ r13; + // t17 -> stack + let m9 = t20 & t17; + // t20 -> stack + let r17 = u2 ^ u5; + // t22 -> stack + let t6 = t22 ^ r17; + // t13 -> stack + let m1 = t13 & t6; + let y5 = u0 ^ r17; + let m4 = t19 & y5; + let m5 = m4 ^ m1; + let m17 = m5 ^ t24; + let r18 = u5 ^ u6; + let t27 = t1 ^ r18; + let t15 = t10 ^ t27; + // t6 -> stack + let m11 = t1 & t15; + let m15 = m14 ^ m11; + let m21 = m17 ^ m15; + // t1 -> stack + // t4 <- stack + let m12 = t4 & t27; + let m13 = m12 ^ m11; + let t14 = t10 ^ r18; + let m3 = t14 ^ m1; + // m2 <- stack + let m16 = m3 ^ m2; + let m20 = m16 ^ m13; + // u4 <- stack + let r19 = u2 ^ u4; + let t16 = r13 ^ r19; + // t3 <- stack + let t26 = t3 ^ t16; + let m6 = t3 & t16; + let m8 = t26 ^ m6; + // t10 -> stack + // m7 <- stack + let m18 = m8 ^ m7; + let m22 = m18 ^ m13; + let m25 = m22 & m20; + let m26 = m21 ^ m25; + let m10 = m9 ^ m6; + let m19 = m10 ^ m15; + // t25 <- stack + let m23 = m19 ^ t25; + let m28 = m23 ^ m25; + let m24 = m22 ^ m23; + let m30 = m26 & m24; + let m39 = m23 ^ m30; + let m48 = m39 & y5; + let m57 = m39 & t19; + // m48 -> stack + let m36 = m24 ^ m25; + let m31 = m20 & m23; + let m27 = m20 ^ m21; + let m32 = m27 & m31; + let m29 = m28 & m27; + let m37 = m21 ^ m29; + // m39 -> stack + let m42 = m37 ^ m39; + let m52 = m42 
& t15; + // t27 -> stack + // t1 <- stack + let m61 = m42 & t1; + let p0 = m52 ^ m61; + let p16 = m57 ^ m61; + // m57 -> stack + // t20 <- stack + let m60 = m37 & t20; + // p16 -> stack + // t17 <- stack + let m51 = m37 & t17; + let m33 = m27 ^ m25; + let m38 = m32 ^ m33; + let m43 = m37 ^ m38; + let m49 = m43 & t16; + let p6 = m49 ^ m60; + let p13 = m49 ^ m51; + let m58 = m43 & t3; + // t9 <- stack + let m50 = m38 & t9; + // t22 <- stack + let m59 = m38 & t22; + // p6 -> stack + let p1 = m58 ^ m59; + let p7 = p0 ^ p1; + let m34 = m21 & m22; + let m35 = m24 & m34; + let m40 = m35 ^ m36; + let m41 = m38 ^ m40; + let m45 = m42 ^ m41; + // t27 <- stack + let m53 = m45 & t27; + let p8 = m50 ^ m53; + let p23 = p7 ^ p8; + // t4 <- stack + let m62 = m45 & t4; + let p14 = m49 ^ m62; + let s6 = p14 ^ p23; + // t10 <- stack + let m54 = m41 & t10; + let p2 = m54 ^ m62; + let p22 = p2 ^ p7; + let s0 = p13 ^ p22; + let p17 = m58 ^ p2; + let p15 = m54 ^ m59; + // t2 <- stack + let m63 = m41 & t2; + // m39 <- stack + let m44 = m39 ^ m40; + // p17 -> stack + // t6 <- stack + let m46 = m44 & t6; + let p5 = m46 ^ m51; + // p23 -> stack + let p18 = m63 ^ p5; + let p24 = p5 ^ p7; + // m48 <- stack + let p12 = m46 ^ m48; + let s3 = p12 ^ p22; + // t13 <- stack + let m55 = m44 & t13; + let p9 = m55 ^ m63; + // p16 <- stack + let s7 = p9 ^ p16; + // t8 <- stack + let m47 = m40 & t8; + let p3 = m47 ^ m50; + let p19 = p2 ^ p3; + let s5 = p19 ^ p24; + let p11 = p0 ^ p3; + let p26 = p9 ^ p11; + // t23 <- stack + let m56 = m40 & t23; + let p4 = m48 ^ m56; + // p6 <- stack + let p20 = p4 ^ p6; + let p29 = p15 ^ p20; + let s1 = p26 ^ p29; + // m57 <- stack + let p10 = m57 ^ p4; + let p27 = p10 ^ p18; + // p23 <- stack + let s4 = p23 ^ p27; + let p25 = p6 ^ p10; + let p28 = p11 ^ p25; + // p17 <- stack + let s2 = p17 ^ p28; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik. +/// +/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt> +/// +/// Note that the 4 bitwise NOT (^= 0xffffffff) are moved to the key schedule. 
+fn sub_bytes(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let y14 = u3 ^ u5; + let y13 = u0 ^ u6; + let y12 = y13 ^ y14; + let t1 = u4 ^ y12; + let y15 = t1 ^ u5; + let t2 = y12 & y15; + let y6 = y15 ^ u7; + let y20 = t1 ^ u1; + // y12 -> stack + let y9 = u0 ^ u3; + // y20 -> stack + let y11 = y20 ^ y9; + // y9 -> stack + let t12 = y9 & y11; + // y6 -> stack + let y7 = u7 ^ y11; + let y8 = u0 ^ u5; + let t0 = u1 ^ u2; + let y10 = y15 ^ t0; + // y15 -> stack + let y17 = y10 ^ y11; + // y14 -> stack + let t13 = y14 & y17; + let t14 = t13 ^ t12; + // y17 -> stack + let y19 = y10 ^ y8; + // y10 -> stack + let t15 = y8 & y10; + let t16 = t15 ^ t12; + let y16 = t0 ^ y11; + // y11 -> stack + let y21 = y13 ^ y16; + // y13 -> stack + let t7 = y13 & y16; + // y16 -> stack + let y18 = u0 ^ y16; + let y1 = t0 ^ u7; + let y4 = y1 ^ u3; + // u7 -> stack + let t5 = y4 & u7; + let t6 = t5 ^ t2; + let t18 = t6 ^ t16; + let t22 = t18 ^ y19; + let y2 = y1 ^ u0; + let t10 = y2 & y7; + let t11 = t10 ^ t7; + let t20 = t11 ^ t16; + let t24 = t20 ^ y18; + let y5 = y1 ^ u6; + let t8 = y5 & y1; + let t9 = t8 ^ t7; + let t19 = t9 ^ t14; + let t23 = t19 ^ y21; + let y3 = y5 ^ y8; + // y6 <- stack + let t3 = y3 & y6; + let t4 = t3 ^ t2; + // y20 <- stack + let t17 = t4 ^ y20; + let t21 = t17 ^ t14; + let t26 = t21 & t23; + let t27 = t24 ^ t26; + let t31 = t22 ^ t26; + let t25 = t21 ^ t22; + // y4 -> stack + let t28 = t25 & t27; + let t29 = t28 ^ t22; + let z14 = t29 & y2; + let z5 = t29 & y7; + let t30 = t23 ^ t24; + let t32 = t31 & t30; + let t33 = t32 ^ t24; + let t35 = t27 ^ t33; + let t36 = t24 & t35; + let t38 = t27 ^ t36; + let t39 = t29 & t38; + let t40 = t25 ^ t39; + let t43 = t29 ^ t40; + // y16 <- stack + let z3 = t43 & y16; + let tc12 = z3 ^ z5; + // tc12 -> stack + // y13 <- stack + let z12 = t43 & y13; + let z13 = t40 & y5; + let z4 = t40 & y1; + let tc6 = z3 ^ z4; + let t34 = t23 ^ t33; + let t37 = t36 ^ t34; + let t41 = t40 ^ t37; + // y10 <- stack + let z8 = t41 & y10; + let z17 = t41 & y8; + let t44 = t33 ^ t37; + // y15 <- stack + let z0 = t44 & y15; + // z17 -> stack + // y12 <- stack + let z9 = t44 & y12; + let z10 = t37 & y3; + let z1 = t37 & y6; + let tc5 = z1 ^ z0; + let tc11 = tc6 ^ tc5; + // y4 <- stack + let z11 = t33 & y4; + let t42 = t29 ^ t33; + let t45 = t42 ^ t41; + // y17 <- stack + let z7 = t45 & y17; + let tc8 = z7 ^ tc6; + // y14 <- stack + let z16 = t45 & y14; + // y11 <- stack + let z6 = t42 & y11; + let tc16 = z6 ^ tc8; + // z14 -> stack + // y9 <- stack + let z15 = t42 & y9; + let tc20 = z15 ^ tc16; + let tc1 = z15 ^ z16; + let tc2 = z10 ^ tc1; + let tc21 = tc2 ^ z11; + let tc3 = z9 ^ tc2; + let s0 = tc3 ^ tc16; + let s3 = tc3 ^ tc11; + let s1 = s3 ^ tc16; + let tc13 = z13 ^ tc1; + // u7 <- stack + let z2 = t33 & u7; + let tc4 = z0 ^ z2; + let tc7 = z12 ^ tc4; + let tc9 = z8 ^ tc7; + let tc10 = tc8 ^ tc9; + // z14 <- stack + let tc17 = z14 ^ tc10; + let s5 = tc21 ^ tc17; + let tc26 = tc17 ^ tc20; + // z17 <- stack + let s2 = tc26 ^ z17; + // tc12 <- stack + let tc14 = tc4 ^ tc12; + let tc18 = tc13 ^ tc14; + let s6 = tc10 ^ tc18; + let s7 = z12 ^ tc18; + let s4 = tc14 ^ s3; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + 
state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// NOT operations that are omitted in S-box +#[inline] +fn sub_bytes_nots(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + state[0] ^= 0xffffffff; + state[1] ^= 0xffffffff; + state[5] ^= 0xffffffff; + state[6] ^= 0xffffffff; +} + +/// Computation of the MixColumns transformation in the fixsliced representation, with different +/// rotations used according to the round number mod 4. +/// +/// Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm. +macro_rules! define_mix_columns { + ( + $name:ident, + $name_inv:ident, + $first_rotate:path, + $second_rotate:path + ) => { + #[rustfmt::skip] + fn $name(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + state[0] = b0 ^ c7 ^ $second_rotate(c0); + state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1); + state[2] = b2 ^ c1 ^ $second_rotate(c2); + state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3); + state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4); + state[5] = b5 ^ c4 ^ $second_rotate(c5); + state[6] = b6 ^ c5 ^ $second_rotate(c6); + state[7] = b7 ^ c6 ^ $second_rotate(c7); + } + + #[rustfmt::skip] + fn $name_inv(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + let (d0, d1, d2, d3, d4, d5, d6, d7) = ( + a0 ^ c7, + a1 ^ c0 ^ c7, + a2 ^ c1, + a3 ^ c2 ^ c7, + a4 ^ c3 ^ c7, + a5 ^ c4, + a6 ^ c5, + a7 ^ c6, + ); + let (e0, e1, e2, e3, e4, e5, e6, e7) = ( + c0 ^ d6, + c1 ^ d6 ^ d7, + c2 ^ d0 ^ d7, + c3 ^ d1 ^ d6, + c4 ^ d2 ^ d6 ^ d7, + c5 ^ d3 ^ d7, + c6 ^ d4, + c7 ^ d5, + ); + state[0] = d0 ^ e0 ^ $second_rotate(e0); + state[1] = d1 ^ e1 ^ $second_rotate(e1); + state[2] = d2 ^ e2 ^ $second_rotate(e2); + state[3] = d3 ^ e3 ^ $second_rotate(e3); + state[4] = d4 ^ e4 ^ $second_rotate(e4); + state[5] = d5 ^ e5 ^ $second_rotate(e5); + state[6] = d6 ^ e6 ^ $second_rotate(e6); + state[7] = d7 ^ e7 ^ $second_rotate(e7); + } + } +} + +define_mix_columns!( + mix_columns_0, + inv_mix_columns_0, + rotate_rows_1, + rotate_rows_2 +); + +define_mix_columns!( + mix_columns_1, + inv_mix_columns_1, + rotate_rows_and_columns_1_1, + rotate_rows_and_columns_2_2 +); + +#[cfg(not(feature = "compact"))] +define_mix_columns!( + mix_columns_2, + inv_mix_columns_2, + rotate_rows_and_columns_1_2, + rotate_rows_2 +); + +#[cfg(not(feature = "compact"))] +define_mix_columns!( + mix_columns_3, + inv_mix_columns_3, + rotate_rows_and_columns_1_3, + rotate_rows_and_columns_2_2 +); + +#[inline] +fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) { + let t = (*a ^ ((*a) >> shift)) & mask; + *a ^= t ^ (t << shift); +} + +#[inline] +fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) { 
+ let t = (*a ^ ((*b) >> shift)) & mask; + *a ^= t; + *b ^= t << shift; +} + +/// Applies ShiftRows once on an AES state (or key). +#[cfg(any(not(feature = "compact"), feature = "hazmat"))] +#[inline] +fn shift_rows_1(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x0c0f0300); + delta_swap_1(x, 2, 0x33003300); + } +} + +/// Applies ShiftRows twice on an AES state (or key). +#[inline] +fn shift_rows_2(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x0f000f00); + } +} + +/// Applies ShiftRows three times on an AES state (or key). +#[inline] +fn shift_rows_3(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x030f0c00); + delta_swap_1(x, 2, 0x33003300); + } +} + +#[inline(always)] +fn inv_shift_rows_1(state: &mut [u32]) { + shift_rows_3(state); +} + +#[inline(always)] +fn inv_shift_rows_2(state: &mut [u32]) { + shift_rows_2(state); +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +fn inv_shift_rows_3(state: &mut [u32]) { + shift_rows_1(state); +} + +/// XOR the columns after the S-box during the key schedule round function. +/// +/// The `idx_xor` parameter refers to the index of the previous round key that is +/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256, +/// respectively). +/// +/// The `idx_ror` parameter refers to the rotation value, which varies between the +/// different key schedules. +fn xor_columns(rkeys: &mut [u32], offset: usize, idx_xor: usize, idx_ror: u32) { + for i in 0..8 { + let off_i = offset + i; + let rk = rkeys[off_i - idx_xor] ^ (0x03030303 & ror(rkeys[off_i], idx_ror)); + rkeys[off_i] = + rk ^ (0xfcfcfcfc & (rk << 2)) ^ (0xf0f0f0f0 & (rk << 4)) ^ (0xc0c0c0c0 & (rk << 6)); + } +} + +/// Bitslice two 128-bit input blocks input0, input1 into a 256-bit internal state. +fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) { + debug_assert_eq!(output.len(), 8); + debug_assert_eq!(input0.len(), 16); + debug_assert_eq!(input1.len(), 16); + + // Bitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at an + // 8-bit index. 
AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the + // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition): + // b0 c1 c0 r1 r0 p2 p1 p0 + // + // The desired bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b0 + + // Interleave the columns on input (note the order of input) + // b0 c1 c0 __ __ __ __ __ => c1 c0 b0 __ __ __ __ __ + let mut t0 = u32::from_le_bytes(input0[0x00..0x04].try_into().unwrap()); + let mut t2 = u32::from_le_bytes(input0[0x04..0x08].try_into().unwrap()); + let mut t4 = u32::from_le_bytes(input0[0x08..0x0c].try_into().unwrap()); + let mut t6 = u32::from_le_bytes(input0[0x0c..0x10].try_into().unwrap()); + let mut t1 = u32::from_le_bytes(input1[0x00..0x04].try_into().unwrap()); + let mut t3 = u32::from_le_bytes(input1[0x04..0x08].try_into().unwrap()); + let mut t5 = u32::from_le_bytes(input1[0x08..0x0c].try_into().unwrap()); + let mut t7 = u32::from_le_bytes(input1[0x0c..0x10].try_into().unwrap()); + + // Bit Index Swap 5 <-> 0: + // __ __ b0 __ __ __ __ p0 => __ __ p0 __ __ __ __ b0 + let m0 = 0x55555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 6 <-> 1: + // __ c0 __ __ __ __ p1 __ => __ p1 __ __ __ __ c0 __ + let m1 = 0x33333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 7 <-> 2: + // c1 __ __ __ __ p2 __ __ => p2 __ __ __ __ c1 __ __ + let m2 = 0x0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + // Final bitsliced bit index, as desired: + // p2 p1 p0 r1 r0 c1 c0 b0 + output[0] = t0; + output[1] = t1; + output[2] = t2; + output[3] = t3; + output[4] = t4; + output[5] = t5; + output[6] = t6; + output[7] = t7; +} + +/// Un-bitslice a 256-bit internal state into two 128-bit blocks of output. +fn inv_bitslice(input: &[u32], output: &mut [Block]) { + debug_assert_eq!(input.len(), 8); + debug_assert_eq!(output.len(), 2); + + // Unbitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at + // an 8-bit index. 
AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the + // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition): + // b0 c1 c0 r1 r0 p2 p1 p0 + // + // The initially bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b0 + + let mut t0 = input[0]; + let mut t1 = input[1]; + let mut t2 = input[2]; + let mut t3 = input[3]; + let mut t4 = input[4]; + let mut t5 = input[5]; + let mut t6 = input[6]; + let mut t7 = input[7]; + + // TODO: these bit index swaps are identical to those in 'packing' + + // Bit Index Swap 5 <-> 0: + // __ __ p0 __ __ __ __ b0 => __ __ b0 __ __ __ __ p0 + let m0 = 0x55555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 6 <-> 1: + // __ p1 __ __ __ __ c0 __ => __ c0 __ __ __ __ p1 __ + let m1 = 0x33333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 7 <-> 2: + // p2 __ __ __ __ c1 __ __ => c1 __ __ __ __ p2 __ __ + let m2 = 0x0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + // De-interleave the columns on output (note the order of output) + // c1 c0 b0 __ __ __ __ __ => b0 c1 c0 __ __ __ __ __ + output[0][0x00..0x04].copy_from_slice(&t0.to_le_bytes()); + output[0][0x04..0x08].copy_from_slice(&t2.to_le_bytes()); + output[0][0x08..0x0c].copy_from_slice(&t4.to_le_bytes()); + output[0][0x0c..0x10].copy_from_slice(&t6.to_le_bytes()); + output[1][0x00..0x04].copy_from_slice(&t1.to_le_bytes()); + output[1][0x04..0x08].copy_from_slice(&t3.to_le_bytes()); + output[1][0x08..0x0c].copy_from_slice(&t5.to_le_bytes()); + output[1][0x0c..0x10].copy_from_slice(&t7.to_le_bytes()); + + // Final AES bit index, as desired: + // b0 c1 c0 r1 r0 p2 p1 p0 +} + +/// Copy 32-bytes within the provided slice to an 8-byte offset +fn memshift32(buffer: &mut [u32], src_offset: usize) { + debug_assert_eq!(src_offset % 8, 0); + + let dst_offset = src_offset + 8; + debug_assert!(dst_offset + 8 <= buffer.len()); + + for i in (0..8).rev() { + buffer[dst_offset + i] = buffer[src_offset + i]; + } +} + +/// XOR the round key to the internal state. The round keys are expected to be +/// pre-computed and to be packed in the fixsliced representation. 
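// A minimal sketch (written as an in-module test, illustrative only) of what the
// delta swaps used throughout this file actually do: the bits selected by `mask`
// trade places with the bits selected by `mask << shift`, and every other bit is
// left untouched.
#[cfg(test)]
#[test]
fn delta_swap_exchanges_masked_bit_groups() {
    // shift = 4, mask = low nibble: the two low nibbles of the word swap,
    // so 0xa5 becomes 0x5a.
    let mut x: u32 = 0x0000_00a5;
    delta_swap_1(&mut x, 4, 0x0000_000f);
    assert_eq!(x, 0x0000_005a);

    // The two-word variant exchanges the `mask` bits of `a` with the
    // `mask << shift` bits of `b`: here the (zero) low nibble of `a` swaps
    // with the second nibble of `b`.
    let (mut a, mut b) = (0x0000_0000_u32, 0x0000_00f0_u32);
    delta_swap_2(&mut a, &mut b, 4, 0x0000_000f);
    assert_eq!((a, b), (0x0000_000f, 0x0000_0000));
}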
+#[inline] +fn add_round_key(state: &mut State, rkey: &[u32]) { + debug_assert_eq!(rkey.len(), 8); + for (a, b) in state.iter_mut().zip(rkey) { + *a ^= b; + } +} + +#[inline(always)] +fn add_round_constant_bit(state: &mut [u32], bit: usize) { + state[bit] ^= 0x0000c000; +} + +#[inline(always)] +fn ror(x: u32, y: u32) -> u32 { + x.rotate_right(y) +} + +#[inline(always)] +fn ror_distance(rows: u32, cols: u32) -> u32 { + (rows << 3) + (cols << 1) +} + +#[inline(always)] +fn rotate_rows_1(x: u32) -> u32 { + ror(x, ror_distance(1, 0)) +} + +#[inline(always)] +fn rotate_rows_2(x: u32) -> u32 { + ror(x, ror_distance(2, 0)) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_1(x: u32) -> u32 { + (ror(x, ror_distance(1, 1)) & 0x3f3f3f3f) | + (ror(x, ror_distance(0, 1)) & 0xc0c0c0c0) +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_2(x: u32) -> u32 { + (ror(x, ror_distance(1, 2)) & 0x0f0f0f0f) | + (ror(x, ror_distance(0, 2)) & 0xf0f0f0f0) +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_3(x: u32) -> u32 { + (ror(x, ror_distance(1, 3)) & 0x03030303) | + (ror(x, ror_distance(0, 3)) & 0xfcfcfcfc) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_2_2(x: u32) -> u32 { + (ror(x, ror_distance(2, 2)) & 0x0f0f0f0f) | + (ror(x, ror_distance(1, 2)) & 0xf0f0f0f0) +} + +/// Low-level "hazmat" AES functions. +/// +/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256` +/// implementations in this crate, but instead provides raw access to +/// the AES round function gated under the `hazmat` crate feature. +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat { + use super::{ + bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0, + shift_rows_1, sub_bytes, sub_bytes_nots, State, + }; + use crate::{Block, ParBlocks}; + + /// XOR the `src` block into the `dst` block in-place. + fn xor_in_place(dst: &mut Block, src: &Block) { + for (a, b) in dst.iter_mut().zip(src.as_slice()) { + *a ^= *b; + } + } + + /// Perform a bitslice operation, loading a single block. + fn bitslice_block(block: &Block) -> State { + let mut state = State::default(); + bitslice(&mut state, block, block); + state + } + + /// Perform an inverse bitslice operation, extracting a single block. + fn inv_bitslice_block(block: &mut Block, state: &State) { + let mut out = [Block::default(); 2]; + inv_bitslice(state, &mut out); + block.copy_from_slice(&out[0]); + } + + /// AES cipher (encrypt) round function. + #[inline] + pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) { + let mut state = bitslice_block(block); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + xor_in_place(block, round_key); + } + + /// AES cipher (encrypt) round function: parallel version. + #[inline] + pub(crate) fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for (chunk, keys) in blocks.chunks_exact_mut(2).zip(round_keys.chunks_exact(2)) { + let mut state = State::default(); + bitslice(&mut state, &chunk[0], &chunk[1]); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + inv_bitslice(&state, chunk); + + for i in 0..2 { + xor_in_place(&mut chunk[i], &keys[i]); + } + } + } + + /// AES cipher (encrypt) round function. 
+ #[inline] + pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { + let mut state = bitslice_block(block); + sub_bytes_nots(&mut state); + inv_sub_bytes(&mut state); + inv_shift_rows_1(&mut state); + inv_mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + xor_in_place(block, round_key); + } + + /// AES cipher (encrypt) round function: parallel version. + #[inline] + pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for (chunk, keys) in blocks.chunks_exact_mut(2).zip(round_keys.chunks_exact(2)) { + let mut state = State::default(); + bitslice(&mut state, &chunk[0], &chunk[1]); + sub_bytes_nots(&mut state); + inv_sub_bytes(&mut state); + inv_shift_rows_1(&mut state); + inv_mix_columns_0(&mut state); + inv_bitslice(&state, chunk); + + for i in 0..2 { + xor_in_place(&mut chunk[i], &keys[i]); + } + } + } + + /// AES mix columns function. + #[inline] + pub(crate) fn mix_columns(block: &mut Block) { + let mut state = bitslice_block(block); + mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + } + + /// AES inverse mix columns function. + #[inline] + pub(crate) fn inv_mix_columns(block: &mut Block) { + let mut state = bitslice_block(block); + inv_mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + } +} diff --git a/rust/vendor/aes/src/soft/fixslice64.rs b/rust/vendor/aes/src/soft/fixslice64.rs new file mode 100644 index 0000000..18315b7 --- /dev/null +++ b/rust/vendor/aes/src/soft/fixslice64.rs @@ -0,0 +1,1540 @@ +//! Fixsliced implementations of AES-128, AES-192 and AES-256 (64-bit) +//! adapted from the C implementation. +//! +//! All implementations are fully bitsliced and do not rely on any +//! Look-Up Table (LUT). +//! +//! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details. +//! +//! # Author (original C code) +//! +//! Alexandre Adomnicai, Nanyang Technological University, Singapore +//! <alexandre.adomnicai@ntu.edu.sg> +//! +//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission. + +#![allow(clippy::unreadable_literal)] + +use crate::Block; +use cipher::{ + consts::{U16, U24, U32}, + generic_array::GenericArray, +}; + +/// AES block batch size for this implementation +pub(crate) const FIXSLICE_BLOCKS: usize = 4; + +/// AES-128 round keys +pub(crate) type FixsliceKeys128 = [u64; 88]; + +/// AES-192 round keys +pub(crate) type FixsliceKeys192 = [u64; 104]; + +/// AES-256 round keys +pub(crate) type FixsliceKeys256 = [u64; 120]; + +/// 512-bit internal state +pub(crate) type State = [u64; 8]; + +/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation. 
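// Sizing note (illustrative, in-module test): each round key occupies 8 bitsliced
// 64-bit words, and the schedules below store the initial round key plus one key
// per round, which is where the array lengths above come from.
#[cfg(test)]
#[test]
fn round_key_array_lengths_match_round_counts() {
    assert_eq!(88, 8 * (10 + 1)); // FixsliceKeys128: AES-128, 10 rounds
    assert_eq!(104, 8 * (12 + 1)); // FixsliceKeys192: AES-192, 12 rounds
    assert_eq!(120, 8 * (14 + 1)); // FixsliceKeys256: AES-256, 14 rounds
}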
+pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys128 { + let mut rkeys = [0u64; 88]; + + bitslice(&mut rkeys[..8], key, key, key, key); + + let mut rk_off = 0; + for rcon in 0..10 { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + if rcon < 8 { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + } else { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4); + } + + xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3)); + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..88).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (8..72).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[72..80]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..11 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation. +pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys192 { + let mut rkeys = [0u64; 104]; + let mut tmp = [0u64; 8]; + + bitslice( + &mut rkeys[..8], + &key[..16], + &key[..16], + &key[..16], + &key[..16], + ); + bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]); + + let mut rcon = 0; + let mut rk_off = 8; + + loop { + for i in 0..8 { + rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8)) + | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); + } + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = rkeys[rk_off + i]; + ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1)); + ti ^= 0xf000f000f000f000 & (ti << 4); + tmp[i] = ti; + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + for i in 0..8 { + let ui = tmp[i]; + let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) + | (0xff00ff00ff00ff00 & (ui << 8)); + ti ^= 0x000f000f000f000f & (ui >> 12); + tmp[i] = ti + ^ (0xfff0fff0fff0fff0 & (ti << 4)) + ^ (0xff00ff00ff00ff00 & (ti << 8)) + ^ (0xf000f000f000f000 & (ti << 12)); + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) + | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); + ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3)); + rkeys[rk_off + i] = ti + ^ (0xfff0fff0fff0fff0 & (ti << 4)) + ^ (0xff00ff00ff00ff00 & (ti << 8)) + ^ (0xf000f000f000f000 & (ti << 12)); + } + rk_off += 8; + + if rcon >= 8 { + break; + } + + for i in 0..8 { + let ui = rkeys[(rk_off - 8) + i]; + let mut ti = rkeys[(rk_off - 16) + i]; + ti ^= 0x0f000f000f000f00 & (ui >> 4); + ti ^= 0xf000f000f000f000 & (ti << 4); + tmp[i] = ti; + } + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..104).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 
8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (0..96).step_by(32) { + inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]); + inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]); + } + } + + // Account for NOTs removed from sub_bytes + for i in 1..13 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation. +pub(crate) fn aes256_key_schedule(key: &GenericArray<u8, U32>) -> FixsliceKeys256 { + let mut rkeys = [0u64; 120]; + + bitslice( + &mut rkeys[..8], + &key[..16], + &key[..16], + &key[..16], + &key[..16], + ); + bitslice( + &mut rkeys[8..16], + &key[16..], + &key[16..], + &key[16..], + &key[16..], + ); + + let mut rk_off = 8; + + let mut rcon = 0; + loop { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3)); + rcon += 1; + + if rcon == 7 { + break; + } + + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3)); + } + + // Adjust to match fixslicing format + #[cfg(feature = "compact")] + { + for i in (8..120).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(feature = "compact"))] + { + for i in (8..104).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[104..112]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..15 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[80..]); + inv_sub_bytes(&mut state); + + #[cfg(not(feature = "compact"))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 72; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
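// Usage sketch (illustrative, in-module test): the fixsliced primitives in this
// file always process FIXSLICE_BLOCKS = 4 blocks at a time. The key schedule
// above feeds the encryption routine defined just below and the decryption
// routine above, and the two round-trip.
#[cfg(test)]
#[test]
fn aes128_fixsliced_four_block_round_trip() {
    let key: GenericArray<u8, U16> = [0u8; 16].into();
    let rkeys = aes128_key_schedule(&key);

    let mut blocks = [Block::default(); FIXSLICE_BLOCKS];
    let plaintext = blocks;

    aes128_encrypt(&rkeys, &mut blocks);
    assert_ne!(blocks, plaintext);

    aes128_decrypt(&rkeys, &mut blocks);
    assert_eq!(blocks, plaintext);
}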
+pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + + if rk_off == 80 { + break; + } + + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(feature = "compact"))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[80..]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[96..]); + inv_sub_bytes(&mut state); + + let mut rk_off = 88; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + if rk_off == 96 { + break; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[96..]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[112..]); + inv_sub_bytes(&mut state); + + #[cfg(not(feature = "compact"))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 104; + loop { + #[cfg(feature = "compact")] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(feature = "compact"))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state, blocks); +} + +/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) { + debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS); + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(feature = "compact")] + { + shift_rows_2(&mut state); + } + + if rk_off == 112 { + break; + } + + #[cfg(not(feature = "compact"))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(feature = "compact"))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[112..]); + + inv_bitslice(&state, blocks); +} + +/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so that it is a true +/// inverse of 'sub_bytes'. +fn inv_sub_bytes(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let t23 = u0 ^ u3; + let t8 = u1 ^ t23; + let m2 = t23 & t8; + let t4 = u4 ^ t8; + let t22 = u1 ^ u3; + let t2 = u0 ^ u1; + let t1 = u3 ^ u4; + // t23 -> stack + let t9 = u7 ^ t1; + // t8 -> stack + let m7 = t22 & t9; + // t9 -> stack + let t24 = u4 ^ u7; + // m7 -> stack + let t10 = t2 ^ t24; + // u4 -> stack + let m14 = t2 & t10; + let r5 = u6 ^ u7; + // m2 -> stack + let t3 = t1 ^ r5; + // t2 -> stack + let t13 = t2 ^ r5; + let t19 = t22 ^ r5; + // t3 -> stack + let t17 = u2 ^ t19; + // t4 -> stack + let t25 = u2 ^ t1; + let r13 = u1 ^ u6; + // t25 -> stack + let t20 = t24 ^ r13; + // t17 -> stack + let m9 = t20 & t17; + // t20 -> stack + let r17 = u2 ^ u5; + // t22 -> stack + let t6 = t22 ^ r17; + // t13 -> stack + let m1 = t13 & t6; + let y5 = u0 ^ r17; + let m4 = t19 & y5; + let m5 = m4 ^ m1; + let m17 = m5 ^ t24; + let r18 = u5 ^ u6; + let t27 = t1 ^ r18; + let t15 = t10 ^ t27; + // t6 -> stack + let m11 = t1 & t15; + let m15 = m14 ^ m11; + let m21 = m17 ^ m15; + // t1 -> stack + // t4 <- stack + let m12 = t4 & t27; + let m13 = m12 ^ m11; + let t14 = t10 ^ r18; + let m3 = t14 ^ m1; + // m2 <- stack + let m16 = m3 ^ m2; + let m20 = m16 ^ m13; + // u4 <- stack + let r19 = u2 ^ u4; + let t16 = r13 ^ r19; + // t3 <- stack + let t26 = t3 ^ t16; + let m6 = t3 & t16; + let m8 = t26 ^ m6; + // t10 -> stack + // m7 <- stack + let m18 = m8 ^ m7; + let m22 = m18 ^ m13; + let m25 = m22 & m20; + let m26 = m21 ^ m25; + let m10 = m9 ^ m6; + let m19 = m10 ^ m15; + // t25 <- stack + let m23 = m19 ^ t25; + let m28 = m23 ^ m25; + let m24 = m22 ^ m23; + let m30 = m26 & m24; + let m39 = m23 ^ m30; + let m48 = m39 & y5; + let m57 = m39 & t19; + // m48 -> stack + let m36 = m24 ^ m25; + let m31 = m20 & m23; + let m27 = m20 ^ m21; + let m32 = m27 & m31; + let m29 = m28 & m27; + let m37 = m21 ^ m29; + // m39 -> stack + let 
m42 = m37 ^ m39; + let m52 = m42 & t15; + // t27 -> stack + // t1 <- stack + let m61 = m42 & t1; + let p0 = m52 ^ m61; + let p16 = m57 ^ m61; + // m57 -> stack + // t20 <- stack + let m60 = m37 & t20; + // p16 -> stack + // t17 <- stack + let m51 = m37 & t17; + let m33 = m27 ^ m25; + let m38 = m32 ^ m33; + let m43 = m37 ^ m38; + let m49 = m43 & t16; + let p6 = m49 ^ m60; + let p13 = m49 ^ m51; + let m58 = m43 & t3; + // t9 <- stack + let m50 = m38 & t9; + // t22 <- stack + let m59 = m38 & t22; + // p6 -> stack + let p1 = m58 ^ m59; + let p7 = p0 ^ p1; + let m34 = m21 & m22; + let m35 = m24 & m34; + let m40 = m35 ^ m36; + let m41 = m38 ^ m40; + let m45 = m42 ^ m41; + // t27 <- stack + let m53 = m45 & t27; + let p8 = m50 ^ m53; + let p23 = p7 ^ p8; + // t4 <- stack + let m62 = m45 & t4; + let p14 = m49 ^ m62; + let s6 = p14 ^ p23; + // t10 <- stack + let m54 = m41 & t10; + let p2 = m54 ^ m62; + let p22 = p2 ^ p7; + let s0 = p13 ^ p22; + let p17 = m58 ^ p2; + let p15 = m54 ^ m59; + // t2 <- stack + let m63 = m41 & t2; + // m39 <- stack + let m44 = m39 ^ m40; + // p17 -> stack + // t6 <- stack + let m46 = m44 & t6; + let p5 = m46 ^ m51; + // p23 -> stack + let p18 = m63 ^ p5; + let p24 = p5 ^ p7; + // m48 <- stack + let p12 = m46 ^ m48; + let s3 = p12 ^ p22; + // t13 <- stack + let m55 = m44 & t13; + let p9 = m55 ^ m63; + // p16 <- stack + let s7 = p9 ^ p16; + // t8 <- stack + let m47 = m40 & t8; + let p3 = m47 ^ m50; + let p19 = p2 ^ p3; + let s5 = p19 ^ p24; + let p11 = p0 ^ p3; + let p26 = p9 ^ p11; + // t23 <- stack + let m56 = m40 & t23; + let p4 = m48 ^ m56; + // p6 <- stack + let p20 = p4 ^ p6; + let p29 = p15 ^ p20; + let s1 = p26 ^ p29; + // m57 <- stack + let p10 = m57 ^ p4; + let p27 = p10 ^ p18; + // p23 <- stack + let s4 = p23 ^ p27; + let p25 = p6 ^ p10; + let p28 = p11 ^ p25; + // p17 <- stack + let s2 = p17 ^ p28; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik. +/// +/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt> +/// +/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule. 
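// Composition sketch (illustrative, in-module test): the complete forward S-box
// is `sub_bytes` (defined just below) followed by `sub_bytes_nots`, the four NOTs
// that were moved into the key schedule. `inv_sub_bytes` above already accounts
// for those NOTs and inverts `sub_bytes` directly, and `sub_bytes_nots` is its
// own inverse, so the inverse of the complete S-box is `sub_bytes_nots` followed
// by `inv_sub_bytes`, which is how the hazmat `equiv_inv_cipher_round` below
// applies it.
#[cfg(test)]
#[test]
fn bitsliced_sbox_round_trips() {
    let original: [u64; 8] = [0x0123_4567_89ab_cdef; 8];
    let mut state = original;

    sub_bytes(&mut state);      // shared S-box circuit
    sub_bytes_nots(&mut state); // complete forward S-box

    sub_bytes_nots(&mut state); // undo the NOTs...
    inv_sub_bytes(&mut state);  // ...then invert the shared circuit

    assert_eq!(state, original);
}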
+fn sub_bytes(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let y14 = u3 ^ u5; + let y13 = u0 ^ u6; + let y12 = y13 ^ y14; + let t1 = u4 ^ y12; + let y15 = t1 ^ u5; + let t2 = y12 & y15; + let y6 = y15 ^ u7; + let y20 = t1 ^ u1; + // y12 -> stack + let y9 = u0 ^ u3; + // y20 -> stack + let y11 = y20 ^ y9; + // y9 -> stack + let t12 = y9 & y11; + // y6 -> stack + let y7 = u7 ^ y11; + let y8 = u0 ^ u5; + let t0 = u1 ^ u2; + let y10 = y15 ^ t0; + // y15 -> stack + let y17 = y10 ^ y11; + // y14 -> stack + let t13 = y14 & y17; + let t14 = t13 ^ t12; + // y17 -> stack + let y19 = y10 ^ y8; + // y10 -> stack + let t15 = y8 & y10; + let t16 = t15 ^ t12; + let y16 = t0 ^ y11; + // y11 -> stack + let y21 = y13 ^ y16; + // y13 -> stack + let t7 = y13 & y16; + // y16 -> stack + let y18 = u0 ^ y16; + let y1 = t0 ^ u7; + let y4 = y1 ^ u3; + // u7 -> stack + let t5 = y4 & u7; + let t6 = t5 ^ t2; + let t18 = t6 ^ t16; + let t22 = t18 ^ y19; + let y2 = y1 ^ u0; + let t10 = y2 & y7; + let t11 = t10 ^ t7; + let t20 = t11 ^ t16; + let t24 = t20 ^ y18; + let y5 = y1 ^ u6; + let t8 = y5 & y1; + let t9 = t8 ^ t7; + let t19 = t9 ^ t14; + let t23 = t19 ^ y21; + let y3 = y5 ^ y8; + // y6 <- stack + let t3 = y3 & y6; + let t4 = t3 ^ t2; + // y20 <- stack + let t17 = t4 ^ y20; + let t21 = t17 ^ t14; + let t26 = t21 & t23; + let t27 = t24 ^ t26; + let t31 = t22 ^ t26; + let t25 = t21 ^ t22; + // y4 -> stack + let t28 = t25 & t27; + let t29 = t28 ^ t22; + let z14 = t29 & y2; + let z5 = t29 & y7; + let t30 = t23 ^ t24; + let t32 = t31 & t30; + let t33 = t32 ^ t24; + let t35 = t27 ^ t33; + let t36 = t24 & t35; + let t38 = t27 ^ t36; + let t39 = t29 & t38; + let t40 = t25 ^ t39; + let t43 = t29 ^ t40; + // y16 <- stack + let z3 = t43 & y16; + let tc12 = z3 ^ z5; + // tc12 -> stack + // y13 <- stack + let z12 = t43 & y13; + let z13 = t40 & y5; + let z4 = t40 & y1; + let tc6 = z3 ^ z4; + let t34 = t23 ^ t33; + let t37 = t36 ^ t34; + let t41 = t40 ^ t37; + // y10 <- stack + let z8 = t41 & y10; + let z17 = t41 & y8; + let t44 = t33 ^ t37; + // y15 <- stack + let z0 = t44 & y15; + // z17 -> stack + // y12 <- stack + let z9 = t44 & y12; + let z10 = t37 & y3; + let z1 = t37 & y6; + let tc5 = z1 ^ z0; + let tc11 = tc6 ^ tc5; + // y4 <- stack + let z11 = t33 & y4; + let t42 = t29 ^ t33; + let t45 = t42 ^ t41; + // y17 <- stack + let z7 = t45 & y17; + let tc8 = z7 ^ tc6; + // y14 <- stack + let z16 = t45 & y14; + // y11 <- stack + let z6 = t42 & y11; + let tc16 = z6 ^ tc8; + // z14 -> stack + // y9 <- stack + let z15 = t42 & y9; + let tc20 = z15 ^ tc16; + let tc1 = z15 ^ z16; + let tc2 = z10 ^ tc1; + let tc21 = tc2 ^ z11; + let tc3 = z9 ^ tc2; + let s0 = tc3 ^ tc16; + let s3 = tc3 ^ tc11; + let s1 = s3 ^ tc16; + let tc13 = z13 ^ tc1; + // u7 <- stack + let z2 = t33 & u7; + let tc4 = z0 ^ z2; + let tc7 = z12 ^ tc4; + let tc9 = z8 ^ tc7; + let tc10 = tc8 ^ tc9; + // z14 <- stack + let tc17 = z14 ^ tc10; + let s5 = tc21 ^ tc17; + let tc26 = tc17 ^ tc20; + // z17 <- stack + let s2 = tc26 ^ z17; + // tc12 <- stack + let tc14 = tc4 ^ tc12; + let tc18 = tc13 ^ tc14; + let s6 = tc10 ^ tc18; + let s7 = z12 ^ tc18; + let s4 = tc14 ^ s3; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + 
state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// NOT operations that are omitted in S-box +#[inline] +fn sub_bytes_nots(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + state[0] ^= 0xffffffffffffffff; + state[1] ^= 0xffffffffffffffff; + state[5] ^= 0xffffffffffffffff; + state[6] ^= 0xffffffffffffffff; +} + +/// Computation of the MixColumns transformation in the fixsliced representation, with different +/// rotations used according to the round number mod 4. +/// +/// Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm. +macro_rules! define_mix_columns { + ( + $name:ident, + $name_inv:ident, + $first_rotate:path, + $second_rotate:path + ) => { + #[rustfmt::skip] + fn $name(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + state[0] = b0 ^ c7 ^ $second_rotate(c0); + state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1); + state[2] = b2 ^ c1 ^ $second_rotate(c2); + state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3); + state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4); + state[5] = b5 ^ c4 ^ $second_rotate(c5); + state[6] = b6 ^ c5 ^ $second_rotate(c6); + state[7] = b7 ^ c6 ^ $second_rotate(c7); + } + + #[rustfmt::skip] + fn $name_inv(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + let (d0, d1, d2, d3, d4, d5, d6, d7) = ( + a0 ^ c7, + a1 ^ c0 ^ c7, + a2 ^ c1, + a3 ^ c2 ^ c7, + a4 ^ c3 ^ c7, + a5 ^ c4, + a6 ^ c5, + a7 ^ c6, + ); + let (e0, e1, e2, e3, e4, e5, e6, e7) = ( + c0 ^ d6, + c1 ^ d6 ^ d7, + c2 ^ d0 ^ d7, + c3 ^ d1 ^ d6, + c4 ^ d2 ^ d6 ^ d7, + c5 ^ d3 ^ d7, + c6 ^ d4, + c7 ^ d5, + ); + state[0] = d0 ^ e0 ^ $second_rotate(e0); + state[1] = d1 ^ e1 ^ $second_rotate(e1); + state[2] = d2 ^ e2 ^ $second_rotate(e2); + state[3] = d3 ^ e3 ^ $second_rotate(e3); + state[4] = d4 ^ e4 ^ $second_rotate(e4); + state[5] = d5 ^ e5 ^ $second_rotate(e5); + state[6] = d6 ^ e6 ^ $second_rotate(e6); + state[7] = d7 ^ e7 ^ $second_rotate(e7); + } + } +} + +define_mix_columns!( + mix_columns_0, + inv_mix_columns_0, + rotate_rows_1, + rotate_rows_2 +); + +define_mix_columns!( + mix_columns_1, + inv_mix_columns_1, + rotate_rows_and_columns_1_1, + rotate_rows_and_columns_2_2 +); + +#[cfg(not(feature = "compact"))] +define_mix_columns!( + mix_columns_2, + inv_mix_columns_2, + rotate_rows_and_columns_1_2, + rotate_rows_2 +); + +#[cfg(not(feature = "compact"))] +define_mix_columns!( + mix_columns_3, + inv_mix_columns_3, + rotate_rows_and_columns_1_3, + rotate_rows_and_columns_2_2 +); + +#[inline] +fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) { + let t = (*a ^ ((*a) >> shift)) & mask; + *a ^= t ^ (t << shift); +} + +#[inline] +fn delta_swap_2(a: &mut u64, b: 
&mut u64, shift: u32, mask: u64) { + let t = (*a ^ ((*b) >> shift)) & mask; + *a ^= t; + *b ^= t << shift; +} + +/// Applies ShiftRows once on an AES state (or key). +#[cfg(any(not(feature = "compact"), feature = "hazmat"))] +#[inline] +fn shift_rows_1(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x00f000ff000f0000); + delta_swap_1(x, 4, 0x0f0f00000f0f0000); + } +} + +/// Applies ShiftRows twice on an AES state (or key). +#[inline] +fn shift_rows_2(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x00ff000000ff0000); + } +} + +/// Applies ShiftRows three times on an AES state (or key). +#[inline] +fn shift_rows_3(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x000f00ff00f00000); + delta_swap_1(x, 4, 0x0f0f00000f0f0000); + } +} + +#[inline(always)] +fn inv_shift_rows_1(state: &mut [u64]) { + shift_rows_3(state); +} + +#[inline(always)] +fn inv_shift_rows_2(state: &mut [u64]) { + shift_rows_2(state); +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +fn inv_shift_rows_3(state: &mut [u64]) { + shift_rows_1(state); +} + +/// XOR the columns after the S-box during the key schedule round function. +/// +/// The `idx_xor` parameter refers to the index of the previous round key that is +/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256, +/// respectively). +/// +/// The `idx_ror` parameter refers to the rotation value, which varies between the +/// different key schedules. +fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) { + for i in 0..8 { + let off_i = offset + i; + let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror)); + rkeys[off_i] = rk + ^ (0xfff0fff0fff0fff0 & (rk << 4)) + ^ (0xff00ff00ff00ff00 & (rk << 8)) + ^ (0xf000f000f000f000 & (rk << 12)); + } +} + +/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state. +fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) { + debug_assert_eq!(output.len(), 8); + debug_assert_eq!(input0.len(), 16); + debug_assert_eq!(input1.len(), 16); + debug_assert_eq!(input2.len(), 16); + debug_assert_eq!(input3.len(), 16); + + // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a + // 9-bit index. 
AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the + // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition): + // b1 b0 c1 c0 r1 r0 p2 p1 p0 + // + // The desired bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + + #[rustfmt::skip] + fn read_reordered(input: &[u8]) -> u64 { + (u64::from(input[0x0]) ) | + (u64::from(input[0x1]) << 0x10) | + (u64::from(input[0x2]) << 0x20) | + (u64::from(input[0x3]) << 0x30) | + (u64::from(input[0x8]) << 0x08) | + (u64::from(input[0x9]) << 0x18) | + (u64::from(input[0xa]) << 0x28) | + (u64::from(input[0xb]) << 0x38) + } + + // Reorder each block's bytes on input + // __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __ + // Reorder by relabeling (note the order of input) + // b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __ + let mut t0 = read_reordered(&input0[0x00..0x0c]); + let mut t4 = read_reordered(&input0[0x04..0x10]); + let mut t1 = read_reordered(&input1[0x00..0x0c]); + let mut t5 = read_reordered(&input1[0x04..0x10]); + let mut t2 = read_reordered(&input2[0x00..0x0c]); + let mut t6 = read_reordered(&input2[0x04..0x10]); + let mut t3 = read_reordered(&input3[0x00..0x0c]); + let mut t7 = read_reordered(&input3[0x04..0x10]); + + // Bit Index Swap 6 <-> 0: + // __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0 + let m0 = 0x5555555555555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 7 <-> 1: + // __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __ + let m1 = 0x3333333333333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 8 <-> 2: + // c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __ + let m2 = 0x0f0f0f0f0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + // Final bitsliced bit index, as desired: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + output[0] = t0; + output[1] = t1; + output[2] = t2; + output[3] = t3; + output[4] = t4; + output[5] = t5; + output[6] = t6; + output[7] = t7; +} + +/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output. +fn inv_bitslice(input: &[u64], output: &mut [Block]) { + debug_assert_eq!(input.len(), 8); + debug_assert_eq!(output.len(), 4); + + // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at + // a 9-bit index. 
AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the + // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition): + // b1 b0 c1 c0 r1 r0 p2 p1 p0 + // + // The initially bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + + let mut t0 = input[0]; + let mut t1 = input[1]; + let mut t2 = input[2]; + let mut t3 = input[3]; + let mut t4 = input[4]; + let mut t5 = input[5]; + let mut t6 = input[6]; + let mut t7 = input[7]; + + // TODO: these bit index swaps are identical to those in 'packing' + + // Bit Index Swap 6 <-> 0: + // __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0 + let m0 = 0x5555555555555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 7 <-> 1: + // __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __ + let m1 = 0x3333333333333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 8 <-> 2: + // p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __ + let m2 = 0x0f0f0f0f0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + #[rustfmt::skip] + fn write_reordered(columns: u64, output: &mut [u8]) { + output[0x0] = (columns ) as u8; + output[0x1] = (columns >> 0x10) as u8; + output[0x2] = (columns >> 0x20) as u8; + output[0x3] = (columns >> 0x30) as u8; + output[0x8] = (columns >> 0x08) as u8; + output[0x9] = (columns >> 0x18) as u8; + output[0xa] = (columns >> 0x28) as u8; + output[0xb] = (columns >> 0x38) as u8; + } + + // Reorder by relabeling (note the order of output) + // c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __ + // Reorder each block's bytes on output + // __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __ + write_reordered(t0, &mut output[0][0x00..0x0c]); + write_reordered(t4, &mut output[0][0x04..0x10]); + write_reordered(t1, &mut output[1][0x00..0x0c]); + write_reordered(t5, &mut output[1][0x04..0x10]); + write_reordered(t2, &mut output[2][0x00..0x0c]); + write_reordered(t6, &mut output[2][0x04..0x10]); + write_reordered(t3, &mut output[3][0x00..0x0c]); + write_reordered(t7, &mut output[3][0x04..0x10]); + + // Final AES bit index, as desired: + // b1 b0 c1 c0 r1 r0 p2 p1 p0 +} + +/// Copy 32-bytes within the provided slice to an 8-byte offset +fn memshift32(buffer: &mut [u64], src_offset: usize) { + debug_assert_eq!(src_offset % 8, 0); + + let dst_offset = src_offset + 8; + debug_assert!(dst_offset + 8 <= buffer.len()); + + for i in (0..8).rev() { + buffer[dst_offset + i] = buffer[src_offset + i]; + } +} + +/// XOR the round key to the internal state. The round keys are expected to be +/// pre-computed and to be packed in the fixsliced representation. 
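// Packing sketch (illustrative, in-module test): `bitslice` and `inv_bitslice`
// above are exact inverses, so packing four blocks into the 512-bit state and
// unpacking it again is lossless. The block values here are arbitrary.
#[cfg(test)]
#[test]
fn bitslice_inv_bitslice_round_trip() {
    let blocks: [Block; 4] = [
        Block::from([0x00u8; 16]),
        Block::from([0x55u8; 16]),
        Block::from([0xaau8; 16]),
        Block::from([0xffu8; 16]),
    ];

    let mut state = State::default();
    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    let mut unpacked = [Block::default(); 4];
    inv_bitslice(&state, &mut unpacked);

    assert_eq!(unpacked, blocks);
}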
+#[inline] +fn add_round_key(state: &mut State, rkey: &[u64]) { + debug_assert_eq!(rkey.len(), 8); + for (a, b) in state.iter_mut().zip(rkey) { + *a ^= b; + } +} + +#[inline(always)] +fn add_round_constant_bit(state: &mut [u64], bit: usize) { + state[bit] ^= 0x00000000f0000000; +} + +#[inline(always)] +fn ror(x: u64, y: u32) -> u64 { + x.rotate_right(y) +} + +#[inline(always)] +fn ror_distance(rows: u32, cols: u32) -> u32 { + (rows << 4) + (cols << 2) +} + +#[inline(always)] +fn rotate_rows_1(x: u64) -> u64 { + ror(x, ror_distance(1, 0)) +} + +#[inline(always)] +fn rotate_rows_2(x: u64) -> u64 { + ror(x, ror_distance(2, 0)) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_1(x: u64) -> u64 { + (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) | + (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000) +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_2(x: u64) -> u64 { + (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) | + (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00) +} + +#[cfg(not(feature = "compact"))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_3(x: u64) -> u64 { + (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) | + (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_2_2(x: u64) -> u64 { + (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) | + (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00) +} + +/// Low-level "hazmat" AES functions. +/// +/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256` +/// implementations in this crate, but instead provides raw access to +/// the AES round function gated under the `hazmat` crate feature. +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat { + use super::{ + bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0, + shift_rows_1, sub_bytes, sub_bytes_nots, State, + }; + use crate::{Block, ParBlocks}; + + /// XOR the `src` block into the `dst` block in-place. + fn xor_in_place(dst: &mut Block, src: &Block) { + for (a, b) in dst.iter_mut().zip(src.as_slice()) { + *a ^= *b; + } + } + + /// Perform a bitslice operation, loading a single block. + fn bitslice_block(block: &Block) -> State { + let mut state = State::default(); + bitslice(&mut state, block, block, block, block); + state + } + + /// Perform an inverse bitslice operation, extracting a single block. + fn inv_bitslice_block(block: &mut Block, state: &State) { + let mut out = [Block::default(); 4]; + inv_bitslice(state, &mut out); + block.copy_from_slice(&out[0]); + } + + /// AES cipher (encrypt) round function. + #[inline] + pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) { + let mut state = bitslice_block(block); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + xor_in_place(block, round_key); + } + + /// AES cipher (encrypt) round function: parallel version. 
+ #[inline] + pub(crate) fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) { + let mut state = State::default(); + bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + inv_bitslice(&state, chunk); + + for i in 0..4 { + xor_in_place(&mut chunk[i], &keys[i]); + } + } + } + + /// AES cipher (encrypt) round function. + #[inline] + pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { + let mut state = State::default(); + bitslice(&mut state, &block, &block, &block, &block); + sub_bytes_nots(&mut state); + inv_sub_bytes(&mut state); + inv_shift_rows_1(&mut state); + inv_mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + xor_in_place(block, round_key); + } + + /// AES cipher (encrypt) round function: parallel version. + #[inline] + pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) { + for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) { + let mut state = State::default(); + bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]); + sub_bytes_nots(&mut state); + inv_sub_bytes(&mut state); + inv_shift_rows_1(&mut state); + inv_mix_columns_0(&mut state); + inv_bitslice(&state, chunk); + + for i in 0..4 { + xor_in_place(&mut chunk[i], &keys[i]); + } + } + } + + /// AES mix columns function. + #[inline] + pub(crate) fn mix_columns(block: &mut Block) { + let mut state = bitslice_block(block); + mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + } + + /// AES inverse mix columns function. + #[inline] + pub(crate) fn inv_mix_columns(block: &mut Block) { + let mut state = bitslice_block(block); + inv_mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + } +} |
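Callers normally reach this fixsliced software backend through the crate's block-cipher API rather than the `pub(crate)` helpers above. The sketch below shows that entry point; it assumes the usual RustCrypto `cipher` v0.3-era traits (`NewBlockCipher`, `BlockEncrypt`, `BlockDecrypt`) and re-exports, so treat the exact import paths as assumptions about this vendored version rather than something stated in the diff.

use aes::cipher::{generic_array::GenericArray, BlockDecrypt, BlockEncrypt, NewBlockCipher};
use aes::{Aes128, Block};

fn main() {
    let key = GenericArray::from([0u8; 16]);
    let cipher = Aes128::new(&key);

    let mut block = Block::from([0u8; 16]);
    let plaintext = block;

    // On targets without a hardware AES code path, these calls are served by
    // the bitsliced/fixsliced implementation above.
    cipher.encrypt_block(&mut block);
    cipher.decrypt_block(&mut block);
    assert_eq!(block, plaintext);
}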