diff options
Diffstat (limited to 'src/spdk/intel-ipsec-mb/include/snow3g_common.h')
-rw-r--r-- | src/spdk/intel-ipsec-mb/include/snow3g_common.h | 2840 |
1 files changed, 2840 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/include/snow3g_common.h b/src/spdk/intel-ipsec-mb/include/snow3g_common.h new file mode 100644 index 000000000..d7c7e63c1 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/snow3g_common.h @@ -0,0 +1,2840 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/*----------------------------------------------------------------------- + * + * An implementation of SNOW 3G, the core algorithm for the + * 3GPP Confidentiality and Integrity algorithms. + * + *-----------------------------------------------------------------------*/ + +#ifndef SNOW3G_COMMON_H +#define SNOW3G_COMMON_H + +#include <stdio.h> +#include <string.h> +#include <stdint.h> + +#include "intel-ipsec-mb.h" +#include "include/snow3g.h" +#include "include/snow3g_internal.h" +#include "clear_regs_mem.h" + +#define CLEAR_MEM clear_mem +#define CLEAR_VAR clear_var + +/* ------------------------------------------------------------------- + * LFSR array shift by 1 position, 4 packets at a time + * ------------------------------------------------------------------ */ + +#ifdef AVX2 +/* LFSR array shift */ +static inline void ShiftLFSR_8(snow3gKeyState8_t *pCtx) +{ + pCtx->iLFSR_X = (pCtx->iLFSR_X + 1) & 15; +} +#endif /* AVX2 */ + +/* LFSR array shift */ +static inline void ShiftLFSR_4(snow3gKeyState4_t *pCtx) +{ + pCtx->iLFSR_X = (pCtx->iLFSR_X + 1) % 16; +} + +/*--------------------------------------------------------- + * @description + * Gf2 modular multiplication/reduction + * + *---------------------------------------------------------*/ +static inline uint64_t multiply_and_reduce64(uint64_t a, uint64_t b) +{ + uint64_t msk; + uint64_t res = 0; + uint64_t i = 64; + + while (i--) { + msk = ((int64_t)res >> 63) & 0x1b; + res <<= 1; + res ^= msk; + msk = ((int64_t)b >> 63) & a; + b <<= 1; + res ^= msk; + } + return res; +} + +#ifdef AVX2 +/* ------------------------------------------------------------------- + * ClockLFSR sub-function as defined in snow3g standard + * S = LFSR[2] + * ^ table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] & 0xff] + * ------------------------------------------------------------------ */ +static void C0_C11_8(__m256i *S, const __m256i *L0, const __m256i *L11) +{ + __m256i mask, Sx, B11, B0, offset; + + offset = _mm256_set1_epi32(3); + mask = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008, 0xF0F0F00C, + 0xF0F0F000, 0xF0F0F004, 0xF0F0F008, + 0xF0F0F00C); + B11 = _mm256_shuffle_epi8(*L11, mask); + *S = _mm256_i32gather_epi32(snow3g_table_A_div, B11, 4); + + mask = _mm256_add_epi32(mask, offset); + B0 = _mm256_shuffle_epi8(*L0, mask); + Sx = _mm256_i32gather_epi32(snow3g_table_A_mul, B0, 4); + *S = _mm256_xor_si256(*S, Sx); +} +#endif /* AVX2 */ + +/* ------------------------------------------------------------------- + * ClockLFSR sub-function as defined in snow3g standard + * S = LFSR[2] + * ^ table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] & 0xff] + * ------------------------------------------------------------------ */ +static inline void C0_C11_4(uint32_t *S, const __m128i *L0, const __m128i *L11) +{ + unsigned B11[4], B0[4]; + + B11[0] = _mm_extract_epi8(*L11, 0); + B11[1] = _mm_extract_epi8(*L11, 4); + B11[2] = _mm_extract_epi8(*L11, 8); + B11[3] = _mm_extract_epi8(*L11, 12); + + S[0] = snow3g_table_A_div[B11[0]]; + S[1] = snow3g_table_A_div[B11[1]]; + S[2] = snow3g_table_A_div[B11[2]]; + S[3] = snow3g_table_A_div[B11[3]]; + + B0[0] = _mm_extract_epi8(*L0, 3); + B0[1] = _mm_extract_epi8(*L0, 7); + B0[2] = _mm_extract_epi8(*L0, 11); + B0[3] = _mm_extract_epi8(*L0, 15); + + S[0] ^= snow3g_table_A_mul[B0[0]]; + S[1] ^= snow3g_table_A_mul[B0[1]]; + S[2] ^= snow3g_table_A_mul[B0[2]]; + S[3] ^= snow3g_table_A_mul[B0[3]]; +} + +#ifdef AVX2 +/* ------------------------------------------------------------------- + * ClockLFSR function as defined in snow3g standard + * S = table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] >> 24] + * ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8 + * ------------------------------------------------------------------ */ +static inline void ClockLFSR_8(snow3gKeyState8_t *pCtx) +{ + __m256i X2; + __m256i S, T, U; + + U = pCtx->LFSR_X[pCtx->iLFSR_X]; + S = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) % 16]; + + C0_C11_8(&X2, &U, &S); + + T = _mm256_slli_epi32(U, 8); + S = _mm256_srli_epi32(S, 8); + U = _mm256_xor_si256(T, pCtx->LFSR_X[(pCtx->iLFSR_X + 2) % 16]); + + ShiftLFSR_8(pCtx); + + S = _mm256_xor_si256(S, U); + S = _mm256_xor_si256(S, X2); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = S; +} +#endif /* AVX2 */ + +/* ------------------------------------------------------------------- + * ClockLFSR function as defined in snow3g standard + * S = table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] >> 24] + * ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8 + * ------------------------------------------------------------------ */ +static inline void ClockLFSR_4(snow3gKeyState4_t *pCtx) +{ + uint32_t X2[4]; + __m128i S, T, U; + + U = pCtx->LFSR_X[pCtx->iLFSR_X]; + S = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) % 16]; + C0_C11_4(X2, &U, &S); + + T = _mm_slli_epi32(U, 8); + S = _mm_srli_epi32(S, 8); + U = _mm_xor_si128(T, pCtx->LFSR_X[(pCtx->iLFSR_X + 2) % 16]); + ShiftLFSR_4(pCtx); + + /* (SSE4) */ + T = _mm_insert_epi32(T, X2[0], 0); + T = _mm_insert_epi32(T, X2[1], 1); + T = _mm_insert_epi32(T, X2[2], 2); + T = _mm_insert_epi32(T, X2[3], 3); + S = _mm_xor_si128(S, U); + S = _mm_xor_si128(S, T); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = S; +} + +#ifdef AVX2 +/* ------------------------------------------------------------------- + * ClockFSM function as defined in snow3g standard + * 8 packets at a time + * ------------------------------------------------------------------ */ +static inline void ClockFSM_8(snow3gKeyState8_t *pCtx, __m256i *data) +{ + __m256i F, R, S2T0, S2T1, S2T2, S2T3, S1T0, S1T1, S1T2, S1T3; + __m256i w3, w2, w1, w0, offset, mask; + + F = _mm256_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15)%16], + pCtx->FSM_X[0]); + R = _mm256_xor_si256(pCtx->LFSR_X[(pCtx->iLFSR_X + 5)%16], + pCtx->FSM_X[2]); + *data = _mm256_xor_si256(F, pCtx->FSM_X[1]); + R = _mm256_add_epi32(R, pCtx->FSM_X[1]); + offset = _mm256_set1_epi32(0x1); + + F = pCtx->FSM_X[1]; + w3 = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008, + 0xF0F0F00C, 0xF0F0F000, 0xF0F0F004, + 0xF0F0F008, 0xF0F0F00C); + mask = _mm256_shuffle_epi8(F,w3); + S2T0 = _mm256_i32gather_epi32(S2_T0,mask,4); + + w2 = _mm256_add_epi32(w3,offset); + mask = _mm256_shuffle_epi8(F,w2); + S2T1 = _mm256_i32gather_epi32(S2_T1,mask,4); + + w1 = _mm256_add_epi32(w2,offset); + mask = _mm256_shuffle_epi8(pCtx->FSM_X[1],w1); + S2T2 = _mm256_i32gather_epi32(S2_T2,mask,4); + + w0 = _mm256_add_epi32(w1,offset); + mask = _mm256_shuffle_epi8(F,w0); + S2T3 = _mm256_i32gather_epi32(S2_T3,mask,4); + + + F = pCtx->FSM_X[0]; + w3 = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008, + 0xF0F0F00C, 0xF0F0F010, 0xF0F0F014, + 0xF0F0F018, 0xF0F0F01C); + mask = _mm256_shuffle_epi8(F,w3); + S1T0 = _mm256_i32gather_epi32(S1_T0,mask,4); + + w2 = _mm256_add_epi32(w3,offset); + mask = _mm256_shuffle_epi8(F,w2); + S1T1 = _mm256_i32gather_epi32(S1_T1,mask,4); + + w1 = _mm256_add_epi32(w2,offset); + mask = _mm256_shuffle_epi8(F,w1); + S1T2 = _mm256_i32gather_epi32(S1_T2,mask,4); + + w0 = _mm256_add_epi32(w1,offset); + mask = _mm256_shuffle_epi8(F,w0); + S1T3 = _mm256_i32gather_epi32(S1_T3,mask,4); + + S2T0 = _mm256_xor_si256(S2T0, S2T1); + S2T2 = _mm256_xor_si256(S2T2, S2T3); + S2T0 = _mm256_xor_si256(S2T0, S2T2); + + S1T0 = _mm256_xor_si256(S1T0, S1T1); + S1T2 = _mm256_xor_si256(S1T2, S1T3); + S1T0 = _mm256_xor_si256(S1T0, S1T2); + + + pCtx->FSM_X[2] = S2T0; + pCtx->FSM_X[1] = S1T0; + pCtx->FSM_X[2] = S2T0; + pCtx->FSM_X[0] = R; +} + +#endif /* AVX2 */ + +/* ------------------------------------------------------------------- + * ClockFSM function as defined in snow3g standard + * 4 packets at a time + * ------------------------------------------------------------------ */ +static inline void ClockFSM_4(snow3gKeyState4_t *pCtx, __m128i *data) +{ + __m128i F, R; +#ifdef _WIN32 +#pragma warning(push) +#pragma warning(disable:4556) +#endif +#if defined (NO_AESNI) || defined (SAFE_LOOKUP) + uint32_t L = 0; +#endif + uint32_t K = 0; + + F = _mm_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], + pCtx->FSM_X[0]); + R = _mm_xor_si128(pCtx->LFSR_X[(pCtx->iLFSR_X + 5) % 16], + pCtx->FSM_X[2]); + *data = _mm_xor_si128(F, pCtx->FSM_X[1]); + R = _mm_add_epi32(R, pCtx->FSM_X[1]); +#if defined (NO_AESNI) || defined (SAFE_LOOKUP) + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 0); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 1); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 2); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 3); +#else + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 0); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 1); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 2); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 3); +#endif /* NO_AESNI */ + pCtx->FSM_X[0] = R; + +#ifdef _WIN32 +#pragma warning(pop) +#endif +} + +/** +******************************************************************************* +* @description +* This function generates 4 bytes of keystream 1 buffer at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_1_4(snow3gKeyState1_t *pCtx, + uint32_t *pKeyStream) +{ + uint32_t F; + + ClockFSM_1(pCtx, &F); + *pKeyStream = F ^ pCtx->LFSR_S[0]; + ClockLFSR_1(pCtx); +} + +/** +******************************************************************************* +* @description +* This function generates 8 bytes of keystream 1 buffer at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_1_8(snow3gKeyState1_t *pCtx, + uint64_t *pKeyStream) +{ + uint64_t F; + uint32_t FSM4; + uint32_t V0, V1; + uint32_t F0, F1; + uint32_t R0, R1; + uint32_t L0, L1, L11, L12; + + /* Merged clock FSM + clock LFSR + clock FSM + clockLFSR + * in order to avoid redundancies in function processing + * and less instruction immediate dependencies + */ + L0 = pCtx->LFSR_S[0]; + V0 = pCtx->LFSR_S[2]; + L1 = pCtx->LFSR_S[1]; + V1 = pCtx->LFSR_S[3]; + R1 = pCtx->FSM_R1; + L11 = pCtx->LFSR_S[11]; + L12 = pCtx->LFSR_S[12]; + V0 ^= snow3g_table_A_mul[L0 >> 24]; + V1 ^= snow3g_table_A_mul[L1 >> 24]; + V0 ^= snow3g_table_A_div[L11 & 0xff]; + V1 ^= snow3g_table_A_div[L12 & 0xff]; + V0 ^= L0 << 8; + V1 ^= L1 << 8; + V0 ^= L11 >> 8; + V1 ^= L12 >> 8; + F0 = pCtx->LFSR_S[15] + R1; + F0 ^= L0; + F0 ^= pCtx->FSM_R2; + R0 = pCtx->FSM_R3 ^ pCtx->LFSR_S[5]; + R0 += pCtx->FSM_R2; + S1_S2_S3_1(pCtx->FSM_R3, pCtx->FSM_R2, R1, FSM4, R0); + R1 = pCtx->FSM_R3 ^ pCtx->LFSR_S[6]; + F1 = V0 + R0; + F1 ^= L1; + F1 ^= pCtx->FSM_R2; + R1 += pCtx->FSM_R2; + pCtx->FSM_R3 = Snow3g_S2(pCtx->FSM_R2); + pCtx->FSM_R2 = FSM4; + pCtx->FSM_R1 = R1; + + /* Shift LFSR twice */ + ShiftTwiceLFSR_1(pCtx); + + /* keystream mode LFSR update */ + pCtx->LFSR_S[14] = V0; + pCtx->LFSR_S[15] = V1; + + F = F0; + F <<= 32; + F |= (uint64_t)F1; + + *pKeyStream = F; +} + +#ifdef AVX2 +/** +******************************************************************************* +* @description +* This function generates 8 bytes of keystream 8 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_8_8(snow3gKeyState8_t *pCtx, + __m256i *pKeyStreamLo, + __m256i *pKeyStreamHi) +{ + __m256i H, L; + + /* first set of 4 bytes */ + ClockFSM_8(pCtx, &L); + L = _mm256_xor_si256(L, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_8(pCtx); + + /* second set of 4 bytes */ + ClockFSM_8(pCtx, &H); + H = _mm256_xor_si256(H, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_8(pCtx); + + /* merge the 2 sets */ + *pKeyStreamLo = _mm256_unpacklo_epi32(H, L); + *pKeyStreamHi = _mm256_unpackhi_epi32(H, L); +} + +/** +******************************************************************************* +* @description +* This function generates 4 bytes of keystream 8 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_8_4(snow3gKeyState8_t *pCtx, + __m256i *pKeyStream) +{ + __m256i F; + + ClockFSM_8(pCtx, &F); + *pKeyStream = _mm256_xor_si256(F, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_8(pCtx); +} + +/** +***************************************************************************** +* @description +* This function generates 32 bytes of keystream 8 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Array of generated keystreams +* +******************************************************************************/ +static inline void snow3g_keystream_8_32(snow3gKeyState8_t *pCtx, + __m256i *pKeyStream) +{ + + __m256i temp[8]; + + /** produces the next 4 bytes for each buffer */ + int i; + + /** Byte reversal on each KS */ + __m256i mask1 = {0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL, + 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL}; + /** Reversal, shifted 4 bytes right */ + __m256i mask2 = {0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL, + 0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL}; + /** Reversal, shifted 8 bytes right */ + __m256i mask3 = {0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL, + 0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL}; + /** Reversal, shifted 12 bytes right */ + __m256i mask4 = {0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL, + 0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL}; + + snow3g_keystream_8_4(pCtx, &temp[0]); + snow3g_keystream_8_4(pCtx, &temp[1]); + snow3g_keystream_8_4(pCtx, &temp[2]); + snow3g_keystream_8_4(pCtx, &temp[3]); + snow3g_keystream_8_4(pCtx, &temp[4]); + snow3g_keystream_8_4(pCtx, &temp[5]); + snow3g_keystream_8_4(pCtx, &temp[6]); + snow3g_keystream_8_4(pCtx, &temp[7]); + + temp[0] = _mm256_shuffle_epi8(temp[0], mask1); + temp[1] = _mm256_shuffle_epi8(temp[1], mask2); + temp[2] = _mm256_shuffle_epi8(temp[2], mask3); + temp[3] = _mm256_shuffle_epi8(temp[3], mask4); + temp[4] = _mm256_shuffle_epi8(temp[4], mask1); + temp[5] = _mm256_shuffle_epi8(temp[5], mask2); + temp[6] = _mm256_shuffle_epi8(temp[6], mask3); + temp[7] = _mm256_shuffle_epi8(temp[7], mask4); + + __m256i blended[8]; + /* blends KS together: 128bit slice consists + of 4 32-bit words for one packet */ + blended[0] = _mm256_blend_epi32(temp[0], temp[1], 0xaa); + blended[1] = _mm256_blend_epi32(temp[0], temp[1], 0x55); + blended[2] = _mm256_blend_epi32(temp[2], temp[3], 0xaa); + blended[3] = _mm256_blend_epi32(temp[2], temp[3], 0x55); + blended[4] = _mm256_blend_epi32(temp[4], temp[5], 0xaa); + blended[5] = _mm256_blend_epi32(temp[4], temp[5], 0x55); + blended[6] = _mm256_blend_epi32(temp[6], temp[7], 0xaa); + blended[7] = _mm256_blend_epi32(temp[6], temp[7], 0x55); + + temp[0] = _mm256_blend_epi32(blended[0], blended[2], 0xcc); + temp[1] = _mm256_blend_epi32(blended[1], blended[3], 0x99); + temp[2] = _mm256_blend_epi32(blended[0], blended[2], 0x33); + temp[3] = _mm256_blend_epi32(blended[1], blended[3], 0x66); + temp[4] = _mm256_blend_epi32(blended[4], blended[6], 0xcc); + temp[5] = _mm256_blend_epi32(blended[5], blended[7], 0x99); + temp[6] = _mm256_blend_epi32(blended[4], blended[6], 0x33); + temp[7] = _mm256_blend_epi32(blended[5], blended[7], 0x66); + + /** sorts 32 bit words back into order */ + blended[0] = temp[0]; + blended[1] = _mm256_shuffle_epi32(temp[1], 0x39); + blended[2] = _mm256_shuffle_epi32(temp[2], 0x4e); + blended[3] = _mm256_shuffle_epi32(temp[3], 0x93); + blended[4] = temp[4]; + blended[5] = _mm256_shuffle_epi32(temp[5], 0x39); + blended[6] = _mm256_shuffle_epi32(temp[6], 0x4e); + blended[7] = _mm256_shuffle_epi32(temp[7], 0x93); + + for (i = 0; i < 4; i++) { + pKeyStream[i] = _mm256_permute2x128_si256(blended[i], + blended[i + 4], 0x20); + pKeyStream[i + 4] = _mm256_permute2x128_si256( + blended[i], blended[i + 4], 0x31); + } +} + +#endif /* AVX2 */ + +/** +******************************************************************************* +* @description +* This function generates 4 bytes of keystream 4 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_4_4(snow3gKeyState4_t *pCtx, + __m128i *pKeyStream) +{ + __m128i F; + + ClockFSM_4(pCtx, &F); + *pKeyStream = _mm_xor_si128(F, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_4(pCtx); +} + +/** +******************************************************************************* +* @description +* This function generates 8 bytes of keystream 4 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStreamLo Pointer to lower end of generated keystream +* @param[in/out] pKeyStreamHi Pointer to higer end of generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_4_8(snow3gKeyState4_t *pCtx, + __m128i *pKeyStreamLo, + __m128i *pKeyStreamHi) +{ + __m128i H, L; + + /* first set of 4 bytes */ + ClockFSM_4(pCtx, &L); + L = _mm_xor_si128(L, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_4(pCtx); + + /* second set of 4 bytes */ + ClockFSM_4(pCtx, &H); + H = _mm_xor_si128(H, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_4(pCtx); + + /* merge the 2 sets */ + *pKeyStreamLo = _mm_unpacklo_epi32(H, L); + *pKeyStreamHi = _mm_unpackhi_epi32(H, L); +} + +/** +******************************************************************************* +* @description +* This function initializes the key schedule for 4 buffers for snow3g f8/f9. +* +* @param [in] pCtx Context where the scheduled keys are stored +* @param [in] pKeySched Key schedule +* @param [in] pIV1 IV for buffer 1 +* @param [in] pIV2 IV for buffer 2 +* @param [in] pIV3 IV for buffer 3 +* @param [in] pIV4 IV for buffer 4 +* +*******************************************************************************/ +static inline void +snow3gStateInitialize_4(snow3gKeyState4_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV1, const void *pIV2, + const void *pIV3, const void *pIV4) +{ + uint32_t K, L; + int i; + __m128i R, S, T, U; + __m128i V0, V1, T0, T1; + + /* Initialize the LFSR table from constants, Keys, and IV */ + + /* Load complete 128b IV into register (SSE2)*/ + uint64_t sm[2] = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL}; + __m128i *swapMask = (__m128i *) sm; + + R = _mm_loadu_si128((const __m128i *)pIV1); + S = _mm_loadu_si128((const __m128i *)pIV2); + T = _mm_loadu_si128((const __m128i *)pIV3); + U = _mm_loadu_si128((const __m128i *)pIV4); + + /* initialize the array block (SSE4) */ + for (i = 0; i < 4; i++) { + K = pKeySched->k[i]; + L = ~K; + V0 = _mm_set1_epi32(K); + V1 = _mm_set1_epi32(L); + pCtx->LFSR_X[i + 4] = V0; + pCtx->LFSR_X[i + 12] = V0; + pCtx->LFSR_X[i + 0] = V1; + pCtx->LFSR_X[i + 8] = V1; + } + /* Update the schedule structure with IVs */ + /* Store the 4 IVs in LFSR by a column/row matrix swap + * after endianness correction */ + + /* endianness swap (SSSE3) */ + R = _mm_shuffle_epi8(R, *swapMask); + S = _mm_shuffle_epi8(S, *swapMask); + T = _mm_shuffle_epi8(T, *swapMask); + U = _mm_shuffle_epi8(U, *swapMask); + + /* row/column dword inversion (SSE2) */ + T0 = _mm_unpacklo_epi32(R, S); + R = _mm_unpackhi_epi32(R, S); + T1 = _mm_unpacklo_epi32(T, U); + T = _mm_unpackhi_epi32(T, U); + + /* row/column qword inversion (SSE2) */ + U = _mm_unpackhi_epi64(R, T); + T = _mm_unpacklo_epi64(R, T); + S = _mm_unpackhi_epi64(T0, T1); + R = _mm_unpacklo_epi64(T0, T1); + + /*IV ^ LFSR (SSE2) */ + pCtx->LFSR_X[15] = _mm_xor_si128(pCtx->LFSR_X[15], U); + pCtx->LFSR_X[12] = _mm_xor_si128(pCtx->LFSR_X[12], T); + pCtx->LFSR_X[10] = _mm_xor_si128(pCtx->LFSR_X[10], S); + pCtx->LFSR_X[9] = _mm_xor_si128(pCtx->LFSR_X[9], R); + pCtx->iLFSR_X = 0; + /* FSM initialization (SSE2) */ + S = _mm_setzero_si128(); + for (i = 0; i < 3; i++) + pCtx->FSM_X[i] = S; + + /* Initialisation rounds */ + for (i = 0; i < 32; i++) { + ClockFSM_4(pCtx, &S); + ClockLFSR_4(pCtx); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm_xor_si128( + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], S); + } +} + +#ifdef AVX2 +/** +******************************************************************************* +* @description +* This function intializes the key schedule for 8 buffers with +* individual keys, for snow3g f8/f9. +* +* @param [in] pCtx Context where scheduled keys are stored +* @param [in] pKeySched Key schedule +* @param [in] pIV1 IV for buffer 1 +* @param [in] pIV2 IV for buffer 2 +* @param [in] pIV3 IV for buffer 3 +* @param [in] pIV4 IV for buffer 4 +* @param [in] pIV5 IV for buffer 5 +* @param [in] pIV6 IV for buffer 6 +* @param [in] pIV7 IV for buffer 7 +* @param [in] pIV8 IV for buffer 8 +* +*******************************************************************************/ +static inline void +snow3gStateInitialize_8_multiKey(snow3gKeyState8_t *pCtx, + const snow3g_key_schedule_t * const KeySched[], + const void * const pIV[]) +{ + DECLARE_ALIGNED(uint32_t k[8], 32); + DECLARE_ALIGNED(uint32_t l[8], 32); + __m256i *K = (__m256i *)k; + __m256i *L = (__m256i *)l; + + int i, j; + __m256i mR, mS, mT, mU, T0, T1; + + /* Initialize the LFSR table from constants, Keys, and IV */ + + /* Load complete 256b IV into register (SSE2)*/ + __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL, + 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL}; + mR = _mm256_loadu2_m128i((const __m128i *)pIV[4], + (const __m128i *)pIV[0]); + mS = _mm256_loadu2_m128i((const __m128i *)pIV[5], + (const __m128i *)pIV[1]); + mT = _mm256_loadu2_m128i((const __m128i *)pIV[6], + (const __m128i *)pIV[2]); + mU = _mm256_loadu2_m128i((const __m128i *)pIV[7], + (const __m128i *)pIV[3]); + + /* initialize the array block (SSE4) */ + for (i = 0; i < 4; i++) { + for (j = 0; j < 8; j++) { + k[j] = KeySched[j]->k[i]; + l[j] = ~k[j]; + } + + pCtx->LFSR_X[i + 4] = *K; + pCtx->LFSR_X[i + 12] = *K; + pCtx->LFSR_X[i + 0] = *L; + pCtx->LFSR_X[i + 8] = *L; + } + + /* Update the schedule structure with IVs */ + /* Store the 4 IVs in LFSR by a column/row matrix swap + * after endianness correction */ + + /* endianness swap (SSSE3) */ + mR = _mm256_shuffle_epi8(mR, swapMask); + mS = _mm256_shuffle_epi8(mS, swapMask); + mT = _mm256_shuffle_epi8(mT, swapMask); + mU = _mm256_shuffle_epi8(mU, swapMask); + + /* row/column dword inversion (SSE2) */ + T0 = _mm256_unpacklo_epi32(mR, mS); + mR = _mm256_unpackhi_epi32(mR, mS); + T1 = _mm256_unpacklo_epi32(mT, mU); + mT = _mm256_unpackhi_epi32(mT, mU); + + /* row/column qword inversion (SSE2) */ + mU = _mm256_unpackhi_epi64(mR, mT); + mT = _mm256_unpacklo_epi64(mR, mT); + mS = _mm256_unpackhi_epi64(T0, T1); + mR = _mm256_unpacklo_epi64(T0, T1); + + /*IV ^ LFSR (SSE2) */ + pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU); + pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT); + pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS); + pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR); + pCtx->iLFSR_X = 0; + /* FSM initialization (SSE2) */ + mS = _mm256_setzero_si256(); + for (i = 0; i < 3; i++) + pCtx->FSM_X[i] = mS; + + /* Initialisation rounds */ + for (i = 0; i < 32; i++) { + ClockFSM_8(pCtx, &mS); + ClockLFSR_8(pCtx); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256( + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS); + } +} + +/** +******************************************************************************* +* @description +* This function initializes the key schedule for 8 buffers for snow3g f8/f9. +* +* @param [in] pCtx Context where the scheduled keys are stored +* @param [in] pKeySched Key schedule +* @param [in] pIV1 IV for buffer 1 +* @param [in] pIV2 IV for buffer 2 +* @param [in] pIV3 IV for buffer 3 +* @param [in] pIV4 IV for buffer 4 +* @param [in] pIV5 IV for buffer 5 +* @param [in] pIV6 IV for buffer 6 +* @param [in] pIV7 IV for buffer 7 +* @param [in] pIV8 IV for buffer 8 +* +*******************************************************************************/ +static inline void +snow3gStateInitialize_8(snow3gKeyState8_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV1, const void *pIV2, + const void *pIV3, const void *pIV4, + const void *pIV5, const void *pIV6, + const void *pIV7, const void *pIV8) +{ + uint32_t K, L; + int i; + __m256i mR, mS, mT, mU, V0, V1, T0, T1; + + /* Initialize the LFSR table from constants, Keys, and IV */ + + /* Load complete 256b IV into register (SSE2)*/ + __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL, + 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL}; + mR = _mm256_loadu2_m128i((const __m128i *)pIV5, (const __m128i *)pIV1); + mS = _mm256_loadu2_m128i((const __m128i *)pIV6, (const __m128i *)pIV2); + mT = _mm256_loadu2_m128i((const __m128i *)pIV7, (const __m128i *)pIV3); + mU = _mm256_loadu2_m128i((const __m128i *)pIV8, (const __m128i *)pIV4); + + /* initialize the array block (SSE4) */ + for (i = 0; i < 4; i++) { + K = pKeySched->k[i]; + L = ~K; + V0 = _mm256_set1_epi32(K); + V1 = _mm256_set1_epi32(L); + pCtx->LFSR_X[i + 4] = V0; + pCtx->LFSR_X[i + 12] = V0; + pCtx->LFSR_X[i + 0] = V1; + pCtx->LFSR_X[i + 8] = V1; + } + + /* Update the schedule structure with IVs */ + /* Store the 4 IVs in LFSR by a column/row matrix swap + * after endianness correction */ + + /* endianness swap (SSSE3) */ + mR = _mm256_shuffle_epi8(mR, swapMask); + mS = _mm256_shuffle_epi8(mS, swapMask); + mT = _mm256_shuffle_epi8(mT, swapMask); + mU = _mm256_shuffle_epi8(mU, swapMask); + + /* row/column dword inversion (SSE2) */ + T0 = _mm256_unpacklo_epi32(mR, mS); + mR = _mm256_unpackhi_epi32(mR, mS); + T1 = _mm256_unpacklo_epi32(mT, mU); + mT = _mm256_unpackhi_epi32(mT, mU); + + /* row/column qword inversion (SSE2) */ + mU = _mm256_unpackhi_epi64(mR, mT); + mT = _mm256_unpacklo_epi64(mR, mT); + mS = _mm256_unpackhi_epi64(T0, T1); + mR = _mm256_unpacklo_epi64(T0, T1); + + /*IV ^ LFSR (SSE2) */ + pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU); + pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT); + pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS); + pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR); + pCtx->iLFSR_X = 0; + /* FSM initialization (SSE2) */ + mS = _mm256_setzero_si256(); + for (i = 0; i < 3; i++) + pCtx->FSM_X[i] = mS; + + /* Initialisation rounds */ + for (i = 0; i < 32; i++) { + ClockFSM_8(pCtx, &mS); + ClockLFSR_8(pCtx); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256( + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS); + } +} +#endif /* AVX2 */ + +static inline void +preserve_bits(uint64_t *KS, + const uint8_t *pcBufferOut, const uint8_t *pcBufferIn, + SafeBuf *safeOutBuf, SafeBuf *safeInBuf, + const uint8_t bit_len, const uint8_t byte_len) +{ + const uint64_t mask = UINT64_MAX << (SNOW3G_BLOCK_SIZE * 8 - bit_len); + + /* Clear the last bits of the keystream and the input + * (input only in out-of-place case) */ + *KS &= mask; + if (pcBufferIn != pcBufferOut) { + const uint64_t swapMask = BSWAP64(mask); + + safeInBuf->b64 &= swapMask; + + /* + * Merge the last bits from the output, to be preserved, + * in the keystream, to be XOR'd with the input + * (which last bits are 0, maintaining the output bits) + */ + memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len); + *KS |= BSWAP64(safeOutBuf->b64 & ~swapMask); + } +} + +/** +******************************************************************************* +* @description +* This function is the core snow3g bit algorithm +* for the 3GPP confidentiality algorithm +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in] pBufferIn Input buffer +* @param[out] pBufferOut Output buffer +* @param[in] cipherLengthInBits length in bits of the data to be encrypted +* @param[in] bitOffset offset in input buffer, where data are valid +* +*******************************************************************************/ +static inline void f8_snow3g_bit(snow3gKeyState1_t *pCtx, + const void *pIn, + void *pOut, + const uint32_t lengthInBits, + const uint32_t offsetInBits) +{ + const uint8_t *pBufferIn = pIn; + uint8_t *pBufferOut = pOut; + uint32_t cipherLengthInBits = lengthInBits; + uint64_t shiftrem = 0; + uint64_t KS8, KS8bit; /* 8 bytes of keystream */ + const uint8_t *pcBufferIn = pBufferIn + (offsetInBits / 8); + uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8); + /* Offset into the first byte (0 - 7 bits) */ + uint32_t remainOffset = offsetInBits % 8; + uint32_t byteLength = (cipherLengthInBits + 7) / 8; + SafeBuf safeInBuf = {0}; + SafeBuf safeOutBuf = {0}; + + /* Now run the block cipher */ + + /* Start with potential partial block (due to offset and length) */ + snow3g_keystream_1_8(pCtx, &KS8); + KS8bit = KS8 >> remainOffset; + /* Only one block to encrypt */ + if (cipherLengthInBits < (64 - remainOffset)) { + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength); + /* + * If operation is Out-of-place and there is offset + * to be applied, "remainOffset" bits from the output buffer + * need to be preserved (only applicable to first byte, + * since remainOffset is up to 7 bits) + */ + if ((pIn != pOut) && remainOffset) { + const uint8_t mask8 = (uint8_t) + (1 << (8 - remainOffset)) - 1; + + safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) | + (pcBufferOut[0] & ~mask8); + } + /* If last byte is a partial byte, the last bits of the output + * need to be preserved */ + const uint8_t bitlen_with_off = remainOffset + + cipherLengthInBits; + + if ((bitlen_with_off & 0x7) != 0) + preserve_bits(&KS8bit, pcBufferOut, pcBufferIn, + &safeOutBuf, &safeInBuf, + bitlen_with_off, byteLength); + + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + return; + } + /* + * If operation is Out-of-place and there is offset + * to be applied, "remainOffset" bits from the output buffer + * need to be preserved (only applicable to first byte, + * since remainOffset is up to 7 bits) + */ + if ((pIn != pOut) && remainOffset) { + const uint8_t mask8 = (uint8_t)(1 << (8 - remainOffset)) - 1; + + memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8); + safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) | + (pcBufferOut[0] & ~mask8); + xor_keystrm_rev(pcBufferOut, safeInBuf.b8, KS8bit); + pcBufferIn += SNOW3G_BLOCK_SIZE; + } else { + /* At least 64 bits to produce (including offset) */ + pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, KS8bit); + } + + if (remainOffset != 0) + shiftrem = KS8 << (64 - remainOffset); + cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8 - remainOffset; + pcBufferOut += SNOW3G_BLOCK_SIZE; + + while (cipherLengthInBits) { + /* produce the next block of keystream */ + snow3g_keystream_1_8(pCtx, &KS8); + KS8bit = (KS8 >> remainOffset) | shiftrem; + if (remainOffset != 0) + shiftrem = KS8 << (64 - remainOffset); + if (cipherLengthInBits >= SNOW3G_BLOCK_SIZE * 8) { + pcBufferIn = xor_keystrm_rev(pcBufferOut, + pcBufferIn, KS8bit); + cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8; + pcBufferOut += SNOW3G_BLOCK_SIZE; + /* loop variant */ + } else { + /* end of the loop, handle the last bytes */ + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, + byteLength); + + /* If last byte is a partial byte, the last bits + * of the output need to be preserved */ + if ((cipherLengthInBits & 0x7) != 0) + preserve_bits(&KS8bit, pcBufferOut, pcBufferIn, + &safeOutBuf, &safeInBuf, + cipherLengthInBits, byteLength); + + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + cipherLengthInBits = 0; + } + } +#ifdef SAFE_DATA + CLEAR_VAR(&KS8, sizeof(KS8)); + CLEAR_VAR(&KS8bit, sizeof(KS8bit)); + CLEAR_MEM(&safeInBuf, sizeof(safeInBuf)); + CLEAR_MEM(&safeOutBuf, sizeof(safeOutBuf)); +#endif +} + +/** +******************************************************************************* +* @description +* This function is the core snow3g algorithm for +* the 3GPP confidentiality and integrity algorithm. +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in] pBufferIn Input buffer +* @param[out] pBufferOut Output buffer +* @param[in] lengthInBytes length in bytes of the data to be encrypted +* +*******************************************************************************/ +static inline void f8_snow3g(snow3gKeyState1_t *pCtx, + const void *pIn, + void *pOut, + const uint32_t lengthInBytes) +{ + uint32_t qwords = lengthInBytes / SNOW3G_8_BYTES; /* number of qwords */ + uint32_t words = lengthInBytes & 4; /* remaining word if not 0 */ + uint32_t bytes = lengthInBytes & 3; /* remaining bytes */ + uint32_t KS4; /* 4 bytes of keystream */ + uint64_t KS8; /* 8 bytes of keystream */ + const uint8_t *pBufferIn = pIn; + uint8_t *pBufferOut = pOut; + + /* process 64 bits at a time */ + while (qwords--) { + /* generate keystream 8 bytes at a time */ + snow3g_keystream_1_8(pCtx, &KS8); + + /* xor keystream 8 bytes at a time */ + pBufferIn = xor_keystrm_rev(pBufferOut, pBufferIn, KS8); + pBufferOut += SNOW3G_8_BYTES; + } + + /* check for remaining 0 to 7 bytes */ + if (0 != words) { + if (bytes) { + /* 5 to 7 last bytes, process 8 bytes */ + uint8_t buftemp[8]; + uint8_t safeBuff[8]; + + memset(safeBuff, 0, SNOW3G_8_BYTES); + snow3g_keystream_1_8(pCtx, &KS8); + memcpy_keystrm(safeBuff, pBufferIn, 4 + bytes); + xor_keystrm_rev(buftemp, safeBuff, KS8); + memcpy_keystrm(pBufferOut, buftemp, 4 + bytes); +#ifdef SAFE_DATA + CLEAR_MEM(&safeBuff, sizeof(safeBuff)); + CLEAR_MEM(&buftemp, sizeof(buftemp)); +#endif + } else { + /* exactly 4 last bytes */ + snow3g_keystream_1_4(pCtx, &KS4); + xor_keystream_reverse_32(pBufferOut, pBufferIn, KS4); + } + } else if (0 != bytes) { + /* 1 to 3 last bytes */ + uint8_t buftemp[4]; + uint8_t safeBuff[4]; + + memset(safeBuff, 0, SNOW3G_4_BYTES); + snow3g_keystream_1_4(pCtx, &KS4); + memcpy_keystream_32(safeBuff, pBufferIn, bytes); + xor_keystream_reverse_32(buftemp, safeBuff, KS4); + memcpy_keystream_32(pBufferOut, buftemp, bytes); +#ifdef SAFE_DATA + CLEAR_MEM(&safeBuff, sizeof(safeBuff)); + CLEAR_MEM(&buftemp, sizeof(buftemp)); +#endif + } + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_VAR(&KS8, sizeof(KS8)); +#endif +} + +#ifdef AVX2 +/** +******************************************************************************* +* @description +* This function converts the state from a 4 buffer state structure to 1 +* buffer state structure. +* +* @param[in] pSrcState Pointer to the source state +* @param[in] pDstState Pointer to the destination state +* @param[in] NumBuffers Number of buffers +* +*******************************************************************************/ +static inline void snow3gStateConvert_8(snow3gKeyState8_t *pSrcState, + snow3gKeyState1_t *pDstState, + uint32_t NumBuffers) +{ + uint32_t T = 0, iLFSR_X = pSrcState->iLFSR_X; + __m256i *LFSR_X = pSrcState->LFSR_X; + int i; + + for (i = 0; i < 16; i++) { + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 0); + break; + case 1: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 1); + break; + case 2: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 2); + break; + case 3: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 3); + break; + case 4: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 4); + break; + case 5: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 5); + break; + case 6: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 6); + break; + case 7: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 7); + break; + } + pDstState->LFSR_S[i] = T; + } + i = 0; + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3); + break; + case 4: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4); + break; + case 5: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5); + break; + case 6: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 6); + break; + case 7: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7); + break; + } + pDstState->FSM_R1 = T; + + i = 1; + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3); + break; + case 4: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4); + break; + case 5: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5); + break; + case 6: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 6); + break; + case 7: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7); + break; + } + pDstState->FSM_R2 = T; + + i = 2; + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3); + break; + case 4: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4); + break; + case 5: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5); + break; + case 6: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 6); + break; + case 7: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7); + break; + } + pDstState->FSM_R3 = T; +} +#endif /* AVX2 */ + +/** +******************************************************************************* +* @description +* This function converts the state from a 4 buffer state structure to 1 +* buffer state structure. +* +* @param[in] pSrcState Pointer to the source state +* @param[in] pDstState Pointer to the destination state +* @param[in] NumBuffers Number of buffers +* +*******************************************************************************/ +static inline void snow3gStateConvert_4(snow3gKeyState4_t *pSrcState, + snow3gKeyState1_t *pDstState, + uint32_t NumBuffers) +{ + uint32_t i; + uint32_t T = 0, iLFSR_X = pSrcState->iLFSR_X; + __m128i *LFSR_X = pSrcState->LFSR_X; + + for (i = 0; i < 16; i++) { + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 0); + break; + case 1: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 1); + break; + case 2: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 2); + break; + case 3: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 3); + break; + } + pDstState->LFSR_S[i] = T; + } + + i = 0; + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 3); + break; + } + pDstState->FSM_R1 = T; + + i = 1; + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 3); + break; + } + pDstState->FSM_R2 = T; + + i = 2; + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 3); + break; + } + pDstState->FSM_R3 = T; +} + +/*--------------------------------------------------------- + * f8() + * Initializations and Context size definitions + *---------------------------------------------------------*/ +size_t SNOW3G_KEY_SCHED_SIZE(void) { return sizeof(snow3g_key_schedule_t); } + +int SNOW3G_INIT_KEY_SCHED(const void *pKey, snow3g_key_schedule_t *pCtx) +{ +#ifdef SAFE_PARAM + if ((pKey == NULL) || (pCtx == NULL)) + return -1; +#endif + + const uint32_t *pKey32 = pKey; + + pCtx->k[3] = BSWAP32(pKey32[0]); + pCtx->k[2] = BSWAP32(pKey32[1]); + pCtx->k[1] = BSWAP32(pKey32[2]); + pCtx->k[0] = BSWAP32(pKey32[3]); + + return 0; +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 1 buffer: + * Single buffer enc/dec with IV and precomputed key schedule + *---------------------------------------------------------*/ +void SNOW3G_F8_1_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV == NULL) || + (pBufferIn == NULL) || (pBufferOut == NULL) || + (lengthInBytes == 0) || (lengthInBytes > SNOW3G_MAX_BYTELEN)) + return; +#endif + snow3gKeyState1_t ctx; + uint32_t KS4; /* 4 bytes of keystream */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx, pHandle, pIV); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx, &KS4); + + f8_snow3g(&ctx, pBufferIn, pBufferOut, lengthInBytes); + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 bit 1 buffer: + * Single buffer enc/dec with IV and precomputed key schedule + *---------------------------------------------------------*/ +void SNOW3G_F8_1_BUFFER_BIT(const snow3g_key_schedule_t *pHandle, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBits, + const uint32_t offsetInBits) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV == NULL) || + (pBufferIn == NULL) || (pBufferOut == NULL) || + (lengthInBits == 0)) + return; +#endif + + snow3gKeyState1_t ctx; + uint32_t KS4; /* 4 bytes of keystream */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx, pHandle, pIV); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx, &KS4); + + f8_snow3g_bit(&ctx, pBufferIn, pBufferOut, lengthInBits, offsetInBits); + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 2 buffer: + * Two buffers enc/dec with the same key schedule. + * The 3 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. + *---------------------------------------------------------*/ +void SNOW3G_F8_2_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pBufIn1, + void *pBufOut1, + const uint32_t lenInBytes1, + const void *pBufIn2, + void *pBufOut2, + const uint32_t lenInBytes2) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV1 == NULL) || (pIV2 == NULL) || + (pBufIn1 == NULL) || (pBufOut1 == NULL) || + (pBufIn2 == NULL) || (pBufOut2 == NULL) || + (lenInBytes1 == 0) || (lenInBytes1 > SNOW3G_MAX_BYTELEN) || + (lenInBytes2 == 0) || (lenInBytes2 > SNOW3G_MAX_BYTELEN)) + return; +#endif + + snow3gKeyState1_t ctx1, ctx2; + uint32_t KS4; /* 4 bytes of keystream */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx1, pHandle, pIV1); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx1, &KS4); + + /* data processing for packet 1 */ + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx2, pHandle, pIV2); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx2, &KS4); + + /* data processing for packet 2 */ + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_MEM(&ctx1, sizeof(ctx1)); + CLEAR_MEM(&ctx2, sizeof(ctx2)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 4 buffer: + * Four packets enc/dec with the same key schedule. + * The 4 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. + *---------------------------------------------------------*/ +void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || + (pIV1 == NULL) || (pIV2 == NULL) || + (pIV3 == NULL) || (pIV4 == NULL) || + (pBufferIn1 == NULL) || (pBufferOut1 == NULL) || + (pBufferIn2 == NULL) || (pBufferOut2 == NULL) || + (pBufferIn3 == NULL) || (pBufferOut3 == NULL) || + (pBufferIn4 == NULL) || (pBufferOut4 == NULL) || + (lengthInBytes1 == 0) || (lengthInBytes1 > SNOW3G_MAX_BYTELEN) || + (lengthInBytes2 == 0) || (lengthInBytes2 > SNOW3G_MAX_BYTELEN) || + (lengthInBytes3 == 0) || (lengthInBytes3 > SNOW3G_MAX_BYTELEN) || + (lengthInBytes4 == 0) || (lengthInBytes4 > SNOW3G_MAX_BYTELEN)) + return; +#endif + + snow3gKeyState4_t ctx; + __m128i H, L; /* 4 bytes of keystream */ + uint32_t lenInBytes1 = lengthInBytes1; + uint32_t lenInBytes2 = lengthInBytes2; + uint32_t lenInBytes3 = lengthInBytes3; + uint32_t lenInBytes4 = lengthInBytes4; + uint32_t bytes1 = + (lenInBytes1 < lenInBytes2 ? lenInBytes1 + : lenInBytes2); /* number of bytes */ + uint32_t bytes2 = + (lenInBytes3 < lenInBytes4 ? lenInBytes3 + : lenInBytes4); /* number of bytes */ + /* min num of bytes */ + uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2; + uint32_t qwords = bytes / SNOW3G_8_BYTES; + uint8_t *pBufOut1 = pBufferOut1; + uint8_t *pBufOut2 = pBufferOut2; + uint8_t *pBufOut3 = pBufferOut3; + uint8_t *pBufOut4 = pBufferOut4; + const uint8_t *pBufIn1 = pBufferIn1; + const uint8_t *pBufIn2 = pBufferIn2; + const uint8_t *pBufIn3 = pBufferIn3; + const uint8_t *pBufIn4 = pBufferIn4; + + bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_4(&ctx, pHandle, pIV1, pIV2, pIV3, pIV4); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_4_4(&ctx, &L); + + lenInBytes1 -= bytes; + lenInBytes2 -= bytes; + lenInBytes3 -= bytes; + lenInBytes4 -= bytes; + + /* generates 4 bytes at a time on all streams */ + while (qwords--) { + snow3g_keystream_4_8(&ctx, &L, &H); + pBufIn1 = xor_keystrm_rev(pBufOut1, pBufIn1, + _mm_extract_epi64(L, 0)); + pBufIn2 = xor_keystrm_rev(pBufOut2, pBufIn2, + _mm_extract_epi64(L, 1)); + pBufIn3 = xor_keystrm_rev(pBufOut3, pBufIn3, + _mm_extract_epi64(H, 0)); + pBufIn4 = xor_keystrm_rev(pBufOut4, pBufIn4, + _mm_extract_epi64(H, 1)); + + pBufOut1 += SNOW3G_8_BYTES; + pBufOut2 += SNOW3G_8_BYTES; + pBufOut3 += SNOW3G_8_BYTES; + pBufOut4 += SNOW3G_8_BYTES; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (lenInBytes1) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_4(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + } + + if (lenInBytes2) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_4(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + } + + if (lenInBytes3) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_4(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3); + } + + if (lenInBytes4) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_4(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4); + } + +#ifdef SAFE_DATA + H = _mm_setzero_si128(); + L = _mm_setzero_si128(); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + +} + +#ifdef AVX2 +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 8 multi: + * Processes 8 packets 8 bytes at a time. + * Uses individual key schedule for each buffer. + *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_8_multi(uint32_t bytes, + const snow3g_key_schedule_t * const pKey[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], const uint32_t *lengthInBytes) +{ + uint32_t qwords = bytes / SNOW3G_8_BYTES; + __m256i H, L; /* 8 bytes of keystream */ + snow3gKeyState8_t ctx; + int i; + const uint8_t *tBufferIn[8]; + uint8_t *tBufferOut[8]; + uint32_t tLenInBytes[8]; + + bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */ + + for (i = 0; i < 8; i++) { + tBufferIn[i] = pBufferIn[i]; + tBufferOut[i] = pBufferOut[i]; + tLenInBytes[i] = lengthInBytes[i]; + } + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8_multiKey(&ctx, pKey, IV); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_8_4(&ctx, &L); + + for (i = 0; i < 8; i++) + tLenInBytes[i] -= bytes; + + /* generates 8 sets at a time on all streams */ + for (i = qwords; i != 0; i--) { + int j; + + snow3g_keystream_8_8(&ctx, &L, &H); + + tBufferIn[0] = xor_keystrm_rev(tBufferOut[0], tBufferIn[0], + _mm256_extract_epi64(L, 0)); + tBufferIn[1] = xor_keystrm_rev(tBufferOut[1], tBufferIn[1], + _mm256_extract_epi64(L, 1)); + tBufferIn[2] = xor_keystrm_rev(tBufferOut[2], tBufferIn[2], + _mm256_extract_epi64(H, 0)); + tBufferIn[3] = xor_keystrm_rev(tBufferOut[3], tBufferIn[3], + _mm256_extract_epi64(H, 1)); + tBufferIn[4] = xor_keystrm_rev(tBufferOut[4], tBufferIn[4], + _mm256_extract_epi64(L, 2)); + tBufferIn[5] = xor_keystrm_rev(tBufferOut[5], tBufferIn[5], + _mm256_extract_epi64(L, 3)); + tBufferIn[6] = xor_keystrm_rev(tBufferOut[6], tBufferIn[6], + _mm256_extract_epi64(H, 2)); + tBufferIn[7] = xor_keystrm_rev(tBufferOut[7], tBufferIn[7], + _mm256_extract_epi64(H, 3)); + + for (j = 0; j < 8; j++) + tBufferOut[j] += SNOW3G_8_BYTES; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (tLenInBytes[0]) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, tBufferIn[0], tBufferOut[0], tLenInBytes[0]); + } + if (tLenInBytes[1]) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, tBufferIn[1], tBufferOut[1], tLenInBytes[1]); + } + if (tLenInBytes[2]) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, tBufferIn[2], tBufferOut[2], tLenInBytes[2]); + } + if (tLenInBytes[3]) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, tBufferIn[3], tBufferOut[3], tLenInBytes[3]); + } + if (tLenInBytes[4]) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, tBufferIn[4], tBufferOut[4], tLenInBytes[4]); + } + if (tLenInBytes[5]) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, tBufferIn[5], tBufferOut[5], tLenInBytes[5]); + } + if (tLenInBytes[6]) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + f8_snow3g(&ctx7, tBufferIn[6], tBufferOut[6], tLenInBytes[6]); + } + if (tLenInBytes[7]) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, tBufferIn[7], tBufferOut[7], tLenInBytes[7]); + } + +#ifdef SAFE_DATA + H = _mm256_setzero_si256(); + L = _mm256_setzero_si256(); + CLEAR_MEM(&ctx, sizeof(ctx)); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 32 multi: + * Processes 8 packets 32 bytes at a time. + * Uses individual key schedule for each buffer. + *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_32_multi(uint32_t bytes, + const snow3g_key_schedule_t * const pKey[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], const uint32_t *lengthInBytes) +{ + + snow3gKeyState8_t ctx; + uint32_t i; + + const uint8_t *tBufferIn[8]; + uint8_t *tBufferOut[8]; + uint32_t tLenInBytes[8]; + + for (i = 0; i < 8; i++) { + tBufferIn[i] = pBufferIn[i]; + tBufferOut[i] = pBufferOut[i]; + tLenInBytes[i] = lengthInBytes[i]; + } + + uint32_t blocks = bytes / 32; + + bytes = blocks * 32; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8_multiKey(&ctx, pKey, IV); + + /* Clock FSM and LFSR once, ignore the keystream */ + __m256i ks[8]; + + snow3g_keystream_8_4(&ctx, ks); + + for (i = 0; i < 8; i++) + tLenInBytes[i] -= bytes; + + __m256i in[8]; + + /* generates 8 sets at a time on all streams */ + for (i = 0; i < blocks; i++) { + int j; + + in[0] = _mm256_loadu_si256((const __m256i *)tBufferIn[0]); + in[1] = _mm256_loadu_si256((const __m256i *)tBufferIn[1]); + in[2] = _mm256_loadu_si256((const __m256i *)tBufferIn[2]); + in[3] = _mm256_loadu_si256((const __m256i *)tBufferIn[3]); + in[4] = _mm256_loadu_si256((const __m256i *)tBufferIn[4]); + in[5] = _mm256_loadu_si256((const __m256i *)tBufferIn[5]); + in[6] = _mm256_loadu_si256((const __m256i *)tBufferIn[6]); + in[7] = _mm256_loadu_si256((const __m256i *)tBufferIn[7]); + + snow3g_keystream_8_32(&ctx, ks); + + _mm256_storeu_si256((__m256i *)tBufferOut[0], + _mm256_xor_si256(in[0], ks[0])); + _mm256_storeu_si256((__m256i *)tBufferOut[1], + _mm256_xor_si256(in[1], ks[1])); + _mm256_storeu_si256((__m256i *)tBufferOut[2], + _mm256_xor_si256(in[2], ks[2])); + _mm256_storeu_si256((__m256i *)tBufferOut[3], + _mm256_xor_si256(in[3], ks[3])); + _mm256_storeu_si256((__m256i *)tBufferOut[4], + _mm256_xor_si256(in[4], ks[4])); + _mm256_storeu_si256((__m256i *)tBufferOut[5], + _mm256_xor_si256(in[5], ks[5])); + _mm256_storeu_si256((__m256i *)tBufferOut[6], + _mm256_xor_si256(in[6], ks[6])); + _mm256_storeu_si256((__m256i *)tBufferOut[7], + _mm256_xor_si256(in[7], ks[7])); + + for (j = 0; j < 8; j++) { + tBufferIn[i] += 32; + tBufferOut[i] += 32; + } + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (tLenInBytes[0]) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, tBufferIn[0], tBufferOut[0], tLenInBytes[0]); + } + if (tLenInBytes[1]) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, tBufferIn[1], tBufferOut[1], tLenInBytes[1]); + } + if (tLenInBytes[2]) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, tBufferIn[2], tBufferOut[2], tLenInBytes[2]); + } + if (tLenInBytes[3]) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, tBufferIn[3], tBufferOut[3], tLenInBytes[3]); + } + if (tLenInBytes[4]) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, tBufferIn[4], tBufferOut[4], tLenInBytes[4]); + } + if (tLenInBytes[5]) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, tBufferIn[5], tBufferOut[5], tLenInBytes[5]); + } + if (tLenInBytes[6]) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + f8_snow3g(&ctx7, tBufferIn[6], tBufferOut[6], tLenInBytes[6]); + } + if (tLenInBytes[7]) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, tBufferIn[7], tBufferOut[7], tLenInBytes[7]); + } + +#ifdef SAFE_DATA + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_MEM(&ks, sizeof(ks)); + CLEAR_MEM(&in, sizeof(in)); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 8 multi: + * Processes 8 packets 8 bytes at a time. + * Uses same key schedule for each buffer. + *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_8(uint32_t bytes, + const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, void *pBufferOut8, + const uint32_t lengthInBytes8) +{ + + uint32_t qwords = bytes / SNOW3G_8_BYTES; + __m256i H, L; /* 8 bytes of keystream */ + snow3gKeyState8_t ctx; + int i; + uint32_t lenInBytes1 = lengthInBytes1; + uint32_t lenInBytes2 = lengthInBytes2; + uint32_t lenInBytes3 = lengthInBytes3; + uint32_t lenInBytes4 = lengthInBytes4; + uint32_t lenInBytes5 = lengthInBytes5; + uint32_t lenInBytes6 = lengthInBytes6; + uint32_t lenInBytes7 = lengthInBytes7; + uint32_t lenInBytes8 = lengthInBytes8; + uint8_t *pBufOut1 = pBufferOut1; + uint8_t *pBufOut2 = pBufferOut2; + uint8_t *pBufOut3 = pBufferOut3; + uint8_t *pBufOut4 = pBufferOut4; + uint8_t *pBufOut5 = pBufferOut5; + uint8_t *pBufOut6 = pBufferOut6; + uint8_t *pBufOut7 = pBufferOut7; + uint8_t *pBufOut8 = pBufferOut8; + const uint8_t *pBufIn1 = pBufferIn1; + const uint8_t *pBufIn2 = pBufferIn2; + const uint8_t *pBufIn3 = pBufferIn3; + const uint8_t *pBufIn4 = pBufferIn4; + const uint8_t *pBufIn5 = pBufferIn5; + const uint8_t *pBufIn6 = pBufferIn6; + const uint8_t *pBufIn7 = pBufferIn7; + const uint8_t *pBufIn8 = pBufferIn8; + + bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8(&ctx, pHandle, pIV1, pIV2, pIV3, + pIV4, pIV5, pIV6, pIV7, pIV8); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_8_4(&ctx, &L); + + lenInBytes1 -= bytes; + lenInBytes2 -= bytes; + lenInBytes3 -= bytes; + lenInBytes4 -= bytes; + lenInBytes5 -= bytes; + lenInBytes6 -= bytes; + lenInBytes7 -= bytes; + lenInBytes8 -= bytes; + + /* generates 8 sets at a time on all streams */ + for (i = qwords; i != 0; i--) { + snow3g_keystream_8_8(&ctx, &L, &H); + + pBufIn1 = xor_keystrm_rev(pBufOut1, pBufIn1, + _mm256_extract_epi64(L, 0)); + pBufIn2 = xor_keystrm_rev(pBufOut2, pBufIn2, + _mm256_extract_epi64(L, 1)); + pBufIn3 = xor_keystrm_rev(pBufOut3, pBufIn3, + _mm256_extract_epi64(H, 0)); + pBufIn4 = xor_keystrm_rev(pBufOut4, pBufIn4, + _mm256_extract_epi64(H, 1)); + pBufIn5 = xor_keystrm_rev(pBufOut5, pBufIn5, + _mm256_extract_epi64(L, 2)); + pBufIn6 = xor_keystrm_rev(pBufOut6, pBufIn6, + _mm256_extract_epi64(L, 3)); + pBufIn7 = xor_keystrm_rev(pBufOut7, pBufIn7, + _mm256_extract_epi64(H, 2)); + pBufIn8 = xor_keystrm_rev(pBufOut8, pBufIn8, + _mm256_extract_epi64(H, 3)); + + pBufOut1 += SNOW3G_8_BYTES; + pBufOut2 += SNOW3G_8_BYTES; + pBufOut3 += SNOW3G_8_BYTES; + pBufOut4 += SNOW3G_8_BYTES; + pBufOut5 += SNOW3G_8_BYTES; + pBufOut6 += SNOW3G_8_BYTES; + pBufOut7 += SNOW3G_8_BYTES; + pBufOut8 += SNOW3G_8_BYTES; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (lenInBytes1) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + } + + if (lenInBytes2) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + } + + if (lenInBytes3) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3); + } + + if (lenInBytes4) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4); + } + + if (lenInBytes5) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, pBufIn5, pBufOut5, lenInBytes5); + } + + if (lenInBytes6) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, pBufIn6, pBufOut6, lenInBytes6); + } + + if (lenInBytes7) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + f8_snow3g(&ctx7, pBufIn7, pBufOut7, lenInBytes7); + } + + if (lenInBytes8) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, pBufIn8, pBufOut8, lenInBytes8); + } + +#ifdef SAFE_DATA + H = _mm256_setzero_si256(); + L = _mm256_setzero_si256(); + CLEAR_MEM(&ctx, sizeof(ctx)); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 32 multi: + * Processes 8 packets 32 bytes at a time. + * Uses same key schedule for each buffer. + *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_32(uint32_t bytes, + const snow3g_key_schedule_t *pKey, + const void *pIV1, const void *pIV2, + const void *pIV3, const void *pIV4, + const void *pIV5, const void *pIV6, + const void *pIV7, const void *pIV8, + const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, void *pBufferOut8, + const uint32_t lengthInBytes8) +{ + snow3gKeyState8_t ctx; + uint32_t i; + uint32_t lenInBytes1 = lengthInBytes1; + uint32_t lenInBytes2 = lengthInBytes2; + uint32_t lenInBytes3 = lengthInBytes3; + uint32_t lenInBytes4 = lengthInBytes4; + uint32_t lenInBytes5 = lengthInBytes5; + uint32_t lenInBytes6 = lengthInBytes6; + uint32_t lenInBytes7 = lengthInBytes7; + uint32_t lenInBytes8 = lengthInBytes8; + uint8_t *pBufOut1 = pBufferOut1; + uint8_t *pBufOut2 = pBufferOut2; + uint8_t *pBufOut3 = pBufferOut3; + uint8_t *pBufOut4 = pBufferOut4; + uint8_t *pBufOut5 = pBufferOut5; + uint8_t *pBufOut6 = pBufferOut6; + uint8_t *pBufOut7 = pBufferOut7; + uint8_t *pBufOut8 = pBufferOut8; + const uint8_t *pBufIn1 = pBufferIn1; + const uint8_t *pBufIn2 = pBufferIn2; + const uint8_t *pBufIn3 = pBufferIn3; + const uint8_t *pBufIn4 = pBufferIn4; + const uint8_t *pBufIn5 = pBufferIn5; + const uint8_t *pBufIn6 = pBufferIn6; + const uint8_t *pBufIn7 = pBufferIn7; + const uint8_t *pBufIn8 = pBufferIn8; + + uint32_t blocks = bytes / 32; + + bytes = blocks * 32; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8(&ctx, pKey, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6, + pIV7, pIV8); + + /* Clock FSM and LFSR once, ignore the keystream */ + __m256i ks[8]; + + snow3g_keystream_8_4(&ctx, ks); + + lenInBytes1 -= bytes; + lenInBytes2 -= bytes; + lenInBytes3 -= bytes; + lenInBytes4 -= bytes; + lenInBytes5 -= bytes; + lenInBytes6 -= bytes; + lenInBytes7 -= bytes; + lenInBytes8 -= bytes; + + __m256i in[8]; + + /* generates 8 sets at a time on all streams */ + for (i = 0; i < blocks; i++) { + + in[0] = _mm256_loadu_si256((const __m256i *)pBufIn1); + in[1] = _mm256_loadu_si256((const __m256i *)pBufIn2); + in[2] = _mm256_loadu_si256((const __m256i *)pBufIn3); + in[3] = _mm256_loadu_si256((const __m256i *)pBufIn4); + in[4] = _mm256_loadu_si256((const __m256i *)pBufIn5); + in[5] = _mm256_loadu_si256((const __m256i *)pBufIn6); + in[6] = _mm256_loadu_si256((const __m256i *)pBufIn7); + in[7] = _mm256_loadu_si256((const __m256i *)pBufIn8); + + snow3g_keystream_8_32(&ctx, ks); + + _mm256_storeu_si256((__m256i *)pBufOut1, + _mm256_xor_si256(in[0], ks[0])); + _mm256_storeu_si256((__m256i *)pBufOut2, + _mm256_xor_si256(in[1], ks[1])); + _mm256_storeu_si256((__m256i *)pBufOut3, + _mm256_xor_si256(in[2], ks[2])); + _mm256_storeu_si256((__m256i *)pBufOut4, + _mm256_xor_si256(in[3], ks[3])); + _mm256_storeu_si256((__m256i *)pBufOut5, + _mm256_xor_si256(in[4], ks[4])); + _mm256_storeu_si256((__m256i *)pBufOut6, + _mm256_xor_si256(in[5], ks[5])); + _mm256_storeu_si256((__m256i *)pBufOut7, + _mm256_xor_si256(in[6], ks[6])); + _mm256_storeu_si256((__m256i *)pBufOut8, + _mm256_xor_si256(in[7], ks[7])); + + pBufIn1 += 32; + pBufIn2 += 32; + pBufIn3 += 32; + pBufIn4 += 32; + pBufIn5 += 32; + pBufIn6 += 32; + pBufIn7 += 32; + pBufIn8 += 32; + + pBufOut1 += 32; + pBufOut2 += 32; + pBufOut3 += 32; + pBufOut4 += 32; + pBufOut5 += 32; + pBufOut6 += 32; + pBufOut7 += 32; + pBufOut8 += 32; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (lenInBytes1) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + } + + if (lenInBytes2) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + } + + if (lenInBytes3) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3); + } + + if (lenInBytes4) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4); + } + + if (lenInBytes5) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, pBufIn5, pBufOut5, lenInBytes5); + } + + if (lenInBytes6) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, pBufIn6, pBufOut6, lenInBytes6); + } + + if (lenInBytes7) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + f8_snow3g(&ctx7, pBufIn7, pBufOut7, lenInBytes7); + } + + if (lenInBytes8) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, pBufIn8, pBufOut8, lenInBytes8); + } + +#ifdef SAFE_DATA + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_MEM(&ks, sizeof(ks)); + CLEAR_MEM(&in, sizeof(in)); +#endif /* SAFE_DATA */ +} +#endif /* AVX2 */ + +/*--------------------------------------------------------- + * @description + * Snow3G F8 8 buffer, multi-key: + * Eight packets enc/dec with eight respective key schedules. + * The 8 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. + *---------------------------------------------------------*/ +void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[], + const void * const IV[], + const void * const BufferIn[], + void *BufferOut[], + const uint32_t lengthInBytes[]) +{ + int i; + +#ifdef SAFE_PARAM + if ((pKey == NULL) || (IV == NULL) || (BufferIn == NULL) || + (BufferOut == NULL) || (lengthInBytes == NULL)) + return; + + for (i = 0; i < 8; i++) + if ((pKey[i] == NULL) || (IV[i] == NULL) || + (BufferIn[i] == NULL) || (BufferOut[i] == NULL) || + (lengthInBytes[i] == 0) || + (lengthInBytes[i] > SNOW3G_MAX_BYTELEN)) + return; +#endif + +#ifndef AVX2 + /* basic C workaround for lack of non AVX2 implementation */ + for (i = 0; i < 8; i++) + SNOW3G_F8_1_BUFFER(pKey[i], IV[i], BufferIn[i], BufferOut[i], + lengthInBytes[i]); +#else + uint32_t bytes = lengthInBytes[0]; + + /* find min byte lenght */ + for (i = 1; i < 8; i++) + if (lengthInBytes[i] < bytes) + bytes = lengthInBytes[i]; + + if (bytes % 32) { + snow3g_8_buffer_ks_8_multi(bytes, pKey, IV, BufferIn, BufferOut, + lengthInBytes); + } else { + snow3g_8_buffer_ks_32_multi(bytes, pKey, IV, BufferIn, + BufferOut, lengthInBytes); + } +#ifdef SAFE_DATA + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#endif /* AVX2 */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 8 buffer: + * Eight packets enc/dec with the same key schedule. + * The 8 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. + * Uses AVX instructions. + *---------------------------------------------------------*/ +void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufIn1, + void *pBufOut1, + const uint32_t lenInBytes1, + const void *pBufIn2, + void *pBufOut2, + const uint32_t lenInBytes2, + const void *pBufIn3, + void *pBufOut3, + const uint32_t lenInBytes3, + const void *pBufIn4, + void *pBufOut4, + const uint32_t lenInBytes4, + const void *pBufIn5, + void *pBufOut5, + const uint32_t lenInBytes5, + const void *pBufIn6, + void *pBufOut6, + const uint32_t lenInBytes6, + const void *pBufIn7, + void *pBufOut7, + const uint32_t lenInBytes7, + const void *pBufIn8, + void *pBufOut8, + const uint32_t lenInBytes8) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || + (pIV1 == NULL) || (pIV2 == NULL) || + (pIV3 == NULL) || (pIV4 == NULL) || + (pIV5 == NULL) || (pIV6 == NULL) || + (pIV7 == NULL) || (pIV8 == NULL) || + (pBufIn1 == NULL) || (pBufOut1 == NULL) || + (pBufIn2 == NULL) || (pBufOut2 == NULL) || + (pBufIn3 == NULL) || (pBufOut3 == NULL) || + (pBufIn4 == NULL) || (pBufOut4 == NULL) || + (pBufIn5 == NULL) || (pBufOut5 == NULL) || + (pBufIn6 == NULL) || (pBufOut6 == NULL) || + (pBufIn7 == NULL) || (pBufOut7 == NULL) || + (pBufIn8 == NULL) || (pBufOut8 == NULL) || + (lenInBytes1 == 0) || (lenInBytes1 > SNOW3G_MAX_BYTELEN) || + (lenInBytes2 == 0) || (lenInBytes2 > SNOW3G_MAX_BYTELEN) || + (lenInBytes3 == 0) || (lenInBytes3 > SNOW3G_MAX_BYTELEN) || + (lenInBytes4 == 0) || (lenInBytes4 > SNOW3G_MAX_BYTELEN) || + (lenInBytes5 == 0) || (lenInBytes5 > SNOW3G_MAX_BYTELEN) || + (lenInBytes6 == 0) || (lenInBytes6 > SNOW3G_MAX_BYTELEN) || + (lenInBytes7 == 0) || (lenInBytes7 > SNOW3G_MAX_BYTELEN) || + (lenInBytes8 == 0) || (lenInBytes8 > SNOW3G_MAX_BYTELEN)) + return; +#endif + +#ifdef AVX2 + uint32_t bytes1 = + (lenInBytes1 < lenInBytes2 ? lenInBytes1 + : lenInBytes2); /* number of bytes */ + uint32_t bytes2 = + (lenInBytes3 < lenInBytes4 ? lenInBytes3 + : lenInBytes4); /* number of bytes */ + uint32_t bytes3 = + (lenInBytes5 < lenInBytes6 ? lenInBytes5 + : lenInBytes6); /* number of bytes */ + uint32_t bytes4 = + (lenInBytes7 < lenInBytes8 ? lenInBytes7 + : lenInBytes8); /* number of bytes */ + uint32_t bytesq1 = + (bytes1 < bytes2) ? bytes1 : bytes2; /* min number of bytes */ + uint32_t bytesq2 = (bytes3 < bytes4) ? bytes3 : bytes4; + uint32_t bytes = (bytesq1 < bytesq2) ? bytesq1 : bytesq2; + + if (bytes % 32) { + snow3g_8_buffer_ks_8( + bytes, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6, + pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2, + pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3, + pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5, + lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7, + pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8); + } else { + snow3g_8_buffer_ks_32( + bytes, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6, + pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2, + pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3, + pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5, + lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7, + pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8); + } +#ifdef SAFE_DATA + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#else /* ~AVX2 */ + SNOW3G_F8_2_BUFFER(pHandle, pIV1, pIV2, pBufIn1, pBufOut1, lenInBytes1, + pBufIn2, pBufOut2, lenInBytes2); + + SNOW3G_F8_2_BUFFER(pHandle, pIV3, pIV4, pBufIn3, pBufOut3, lenInBytes3, + pBufIn4, pBufOut4, lenInBytes4); + + SNOW3G_F8_2_BUFFER(pHandle, pIV5, pIV6, pBufIn5, pBufOut5, lenInBytes5, + pBufIn6, pBufOut6, lenInBytes6); + + SNOW3G_F8_2_BUFFER(pHandle, pIV7, pIV8, pBufIn7, pBufOut7, lenInBytes7, + pBufIn8, pBufOut8, lenInBytes8); +#endif /* AVX */ +} + +/****************************************************************************** + * @description + * Snow3G F8 multi packet: + * Performs F8 enc/dec on [n] packets. The operation is performed in-place. + * The input IV's are passed in Little Endian format. + * The KeySchedule is in Little Endian format. + ******************************************************************************/ +void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufLenInBytes[], + const uint32_t packetCount) +{ +#ifdef SAFE_PARAM + uint32_t i; + + if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) || + (pBufferOut == NULL) || (bufLenInBytes == NULL)) + return; + + for (i = 0; i < packetCount; i++) + if ((IV[i] == NULL) || (pBufferIn[i] == NULL) || + (pBufferOut[i] == NULL) || (bufLenInBytes[i] == 0) || + (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN)) + return; +#endif + if (packetCount > 16) { + pBufferOut[0] = NULL; + printf("packetCount too high (%d)\n", packetCount); + return; + } + + uint32_t packet_index, inner_index, pktCnt = packetCount; + int sortNeeded = 0, tempLen = 0; + uint8_t *srctempbuff; + uint8_t *dsttempbuff; + uint8_t *ivtempbuff; + uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_16] = {0}; + + memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); + memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *)); + memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); + memcpy((void *)pIV, IV, packetCount * sizeof(void *)); + + packet_index = packetCount; + + while (packet_index--) { + + /* check if all packets are sorted by decreasing length */ + if (packet_index > 0 && lensBuf[packet_index - 1] < + lensBuf[packet_index]) { + /* this packet array is not correctly sorted */ + sortNeeded = 1; + } + } + + if (sortNeeded) { + + /* sort packets in decreasing buffer size from [0] to + [n]th packet, ** where buffer[0] will contain longest + buffer and buffer[n] will contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths */ + packet_index = packetCount; + while (packet_index--) { + + inner_index = packet_index; + while (inner_index--) { + + if (lensBuf[packet_index] > + lensBuf[inner_index]) { + + /* swap buffers to arrange in + descending order from [0]. */ + srctempbuff = pSrcBuf[packet_index]; + dsttempbuff = pDstBuf[packet_index]; + ivtempbuff = pIV[packet_index]; + tempLen = lensBuf[packet_index]; + + pSrcBuf[packet_index] = + pSrcBuf[inner_index]; + pDstBuf[packet_index] = + pDstBuf[inner_index]; + pIV[packet_index] = pIV[inner_index]; + lensBuf[packet_index] = + lensBuf[inner_index]; + + pSrcBuf[inner_index] = srctempbuff; + pDstBuf[inner_index] = dsttempbuff; + pIV[inner_index] = ivtempbuff; + lensBuf[inner_index] = tempLen; + } + } /* for inner packet index (inner bubble-sort) */ + } /* for outer packet index (outer bubble-sort) */ + } /* if sortNeeded */ + + packet_index = 0; + /* process 8 buffers at-a-time */ +#ifdef AVX2 + while (pktCnt >= 8) { + pktCnt -= 8; + SNOW3G_F8_8_BUFFER(pCtx, pIV[packet_index], + pIV[packet_index + 1], + pIV[packet_index + 2], + pIV[packet_index + 3], + pIV[packet_index + 4], + pIV[packet_index + 5], + pIV[packet_index + 6], + pIV[packet_index + 7], + pSrcBuf[packet_index], + pDstBuf[packet_index], + lensBuf[packet_index], + pSrcBuf[packet_index + 1], + pDstBuf[packet_index + 1], + lensBuf[packet_index + 1], + pSrcBuf[packet_index + 2], + pDstBuf[packet_index + 2], + lensBuf[packet_index + 2], + pSrcBuf[packet_index + 3], + pDstBuf[packet_index + 3], + lensBuf[packet_index + 3], + pSrcBuf[packet_index + 4], + pDstBuf[packet_index + 4], + lensBuf[packet_index + 4], + pSrcBuf[packet_index + 5], + pDstBuf[packet_index + 5], + lensBuf[packet_index + 5], + pSrcBuf[packet_index + 6], + pDstBuf[packet_index + 6], + lensBuf[packet_index + 6], + pSrcBuf[packet_index + 7], + pDstBuf[packet_index + 7], + lensBuf[packet_index + 7]); + packet_index += 8; + } +#endif + /* process 4 buffers at-a-time */ + while (pktCnt >= 4) { + pktCnt -= 4; + SNOW3G_F8_4_BUFFER(pCtx, pIV[packet_index + 0], + pIV[packet_index + 1], + pIV[packet_index + 2], + pIV[packet_index + 3], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0], + pSrcBuf[packet_index + 1], + pDstBuf[packet_index + 1], + lensBuf[packet_index + 1], + pSrcBuf[packet_index + 2], + pDstBuf[packet_index + 2], + lensBuf[packet_index + 2], + pSrcBuf[packet_index + 3], + pDstBuf[packet_index + 3], + lensBuf[packet_index + 3]); + packet_index += 4; + } + + /* process 2 packets at-a-time */ + while (pktCnt >= 2) { + pktCnt -= 2; + SNOW3G_F8_2_BUFFER(pCtx, pIV[packet_index + 0], + pIV[packet_index + 1], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0], + pSrcBuf[packet_index + 1], + pDstBuf[packet_index + 1], + lensBuf[packet_index + 1]); + packet_index += 2; + } + + /* remaining packets are processed 1 at a time */ + while (pktCnt--) { + SNOW3G_F8_1_BUFFER(pCtx, pIV[packet_index + 0], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0]); + packet_index++; + } +} + +void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufLenInBytes[], + const uint32_t packetCount) +{ +#ifdef SAFE_PARAM + uint32_t i; + + if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) || + (pBufferOut == NULL) || (bufLenInBytes == NULL)) + return; + + for (i = 0; i < packetCount; i++) + if ((pCtx[i] == NULL) || (IV[i] == NULL) || + (pBufferIn[i] == NULL) || (pBufferOut[i] == NULL) || + (bufLenInBytes[i] == 0) || + (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN)) + return; +#endif + if (packetCount > 16) { + pBufferOut[0] = NULL; + printf("packetCount too high (%d)\n", packetCount); + return; + } + + uint32_t packet_index, inner_index, pktCnt = packetCount; + int sortNeeded = 0, tempLen = 0; + uint8_t *srctempbuff; + uint8_t *dsttempbuff; + uint8_t *ivtempbuff; + snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_16] = {0}; + snow3g_key_schedule_t *tempCtx; + + memcpy((void *)pCtxBuf, pCtx, packetCount * sizeof(void *)); + memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); + memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *)); + memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); + memcpy((void *)pIV, IV, packetCount * sizeof(void *)); + + packet_index = packetCount; + + while (packet_index--) { + + /* check if all packets are sorted by decreasing length */ + if (packet_index > 0 && lensBuf[packet_index - 1] < + lensBuf[packet_index]) { + /* this packet array is not correctly sorted */ + sortNeeded = 1; + } + } + + if (sortNeeded) { + /* sort packets in decreasing buffer size from [0] to [n]th + packet, where buffer[0] will contain longest buffer and + buffer[n] will contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths */ + packet_index = packetCount; + while (packet_index--) { + inner_index = packet_index; + while (inner_index--) { + if (lensBuf[packet_index] > + lensBuf[inner_index]) { + /* swap buffers to arrange in + descending order from [0]. */ + srctempbuff = pSrcBuf[packet_index]; + dsttempbuff = pDstBuf[packet_index]; + ivtempbuff = pIV[packet_index]; + tempLen = lensBuf[packet_index]; + tempCtx = pCtxBuf[packet_index]; + + pSrcBuf[packet_index] = + pSrcBuf[inner_index]; + pDstBuf[packet_index] = + pDstBuf[inner_index]; + pIV[packet_index] = pIV[inner_index]; + lensBuf[packet_index] = + lensBuf[inner_index]; + pCtxBuf[packet_index] = + pCtxBuf[inner_index]; + + pSrcBuf[inner_index] = srctempbuff; + pDstBuf[inner_index] = dsttempbuff; + pIV[inner_index] = ivtempbuff; + lensBuf[inner_index] = tempLen; + pCtxBuf[inner_index] = tempCtx; + } + } /* for inner packet index (inner bubble-sort) */ + } /* for outer packet index (outer bubble-sort) */ + } /* if sortNeeded */ + + packet_index = 0; + /* process 8 buffers at-a-time */ +#ifdef AVX2 + while (pktCnt >= 8) { + pktCnt -= 8; + SNOW3G_F8_8_BUFFER_MULTIKEY( + (const snow3g_key_schedule_t * const *) + &pCtxBuf[packet_index], + (const void * const *)&pIV[packet_index], + (const void * const *)&pSrcBuf[packet_index], + (void **)&pDstBuf[packet_index], + &lensBuf[packet_index]); + packet_index += 8; + } +#endif + /* TODO process 4 buffers at-a-time */ + /* TODO process 2 packets at-a-time */ + /* remaining packets are processed 1 at a time */ + while (pktCnt--) { + SNOW3G_F8_1_BUFFER(pCtxBuf[packet_index + 0], + pIV[packet_index + 0], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0]); + packet_index++; + } +} + +/*--------------------------------------------------------- + * @description + * Snow3G F9 1 buffer + * Single buffer digest with IV and precomputed key schedule + *---------------------------------------------------------*/ +void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV == NULL) || + (pBufferIn == NULL) || (pDigest == NULL) || + (lengthInBits == 0) || (lengthInBits > SNOW3G_MAX_BITLEN)) + return; +#endif + snow3gKeyState1_t ctx; + uint32_t z[5]; + uint64_t lengthInQwords, E, V, P; + uint64_t i, rem_bits; + const uint64_t *inputBuffer; + + inputBuffer = (const uint64_t *)pBufferIn; + + /* Initialize the snow3g key schedule */ + snow3gStateInitialize_1(&ctx, pHandle, pIV); + + /*Generate 5 keystream words*/ + snow3g_f9_keystream_words(&ctx, &z[0]); + + P = ((uint64_t)z[0] << 32) | ((uint64_t)z[1]); + + lengthInQwords = lengthInBits / 64; + + E = 0; + /* all blocks except the last one */ + for (i = 0; i < lengthInQwords; i++) { + V = BSWAP64(inputBuffer[i]); + E = multiply_and_reduce64(E ^ V, P); + } + + /* last bits of last block if any left */ + rem_bits = lengthInBits % 64; + if (rem_bits) { + /* last bytes, do not go past end of buffer */ + memcpy(&V, &inputBuffer[i], (rem_bits + 7) / 8); + V = BSWAP64(V); + V &= (((uint64_t)-1) << (64 - rem_bits)); /* mask extra bits */ + E = multiply_and_reduce64(E ^ V, P); + } + + /* Multiply by Q */ + E = multiply_and_reduce64(E ^ lengthInBits, + (((uint64_t)z[2] << 32) | ((uint64_t)z[3]))); + + /* Final MAC */ + *(uint32_t *)pDigest = + (uint32_t)BSWAP64(E ^ ((uint64_t)z[4] << 32)); +#ifdef SAFE_DATA + CLEAR_VAR(&E, sizeof(E)); + CLEAR_VAR(&V, sizeof(V)); + CLEAR_VAR(&P, sizeof(P)); + CLEAR_MEM(&z, sizeof(z)); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} + +#endif /* SNOW3G_COMMON_H */ |