summaryrefslogtreecommitdiffstats
path: root/src/spdk/intel-ipsec-mb/include/snow3g_common.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/spdk/intel-ipsec-mb/include/snow3g_common.h')
-rw-r--r--src/spdk/intel-ipsec-mb/include/snow3g_common.h2840
1 files changed, 2840 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/include/snow3g_common.h b/src/spdk/intel-ipsec-mb/include/snow3g_common.h
new file mode 100644
index 000000000..d7c7e63c1
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/snow3g_common.h
@@ -0,0 +1,2840 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/*-----------------------------------------------------------------------
+ *
+ * An implementation of SNOW 3G, the core algorithm for the
+ * 3GPP Confidentiality and Integrity algorithms.
+ *
+ *-----------------------------------------------------------------------*/
+
+#ifndef SNOW3G_COMMON_H
+#define SNOW3G_COMMON_H
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "intel-ipsec-mb.h"
+#include "include/snow3g.h"
+#include "include/snow3g_internal.h"
+#include "clear_regs_mem.h"
+
+#define CLEAR_MEM clear_mem
+#define CLEAR_VAR clear_var
+
+/* -------------------------------------------------------------------
+ * LFSR array shift by 1 position, 4 packets at a time
+ * ------------------------------------------------------------------ */
+
+#ifdef AVX2
+/* Advance the LFSR base index by one stage (8-buffer state) */
+static inline void ShiftLFSR_8(snow3gKeyState8_t *pCtx)
+{
+        /* LFSR_X is addressed as a 16-entry ring: step, then wrap */
+        pCtx->iLFSR_X += 1;
+        pCtx->iLFSR_X &= 15;
+}
+#endif /* AVX2 */
+
+/* Advance the LFSR base index by one stage (4-buffer state) */
+static inline void ShiftLFSR_4(snow3gKeyState4_t *pCtx)
+{
+        /* LFSR_X is addressed as a 16-entry ring: step, then wrap */
+        pCtx->iLFSR_X++;
+        pCtx->iLFSR_X %= 16;
+}
+
+/*---------------------------------------------------------
+ * @description
+ * Gf2 modular multiplication/reduction
+ *
+ * Multiplies a by b as polynomials over GF(2), reducing the
+ * result modulo x^64 + x^4 + x^3 + x + 1 (the low part of the
+ * reduction polynomial is the constant 0x1b below).
+ * b is consumed from its most significant bit downwards.
+ *
+ * @param[in] a first operand
+ * @param[in] b second operand
+ * @return the 64-bit reduced product
+ *
+ *---------------------------------------------------------*/
+static inline uint64_t multiply_and_reduce64(uint64_t a, uint64_t b)
+{
+        uint64_t res = 0;
+        unsigned i;
+
+        for (i = 0; i < 64; i++) {
+                /*
+                 * Build all-ones / all-zeroes masks from the top bits with
+                 * unsigned arithmetic only: right-shifting a negative
+                 * signed value is implementation-defined.
+                 */
+                const uint64_t reduce = (uint64_t)0 - (res >> 63);
+                const uint64_t add_a = (uint64_t)0 - (b >> 63);
+
+                /* res = res * x, folding the bit shifted out back in */
+                res = (res << 1) ^ (reduce & 0x1b);
+                /* add a when the current (top) bit of b is set */
+                res ^= add_a & a;
+                b <<= 1;
+        }
+        return res;
+}
+
+#ifdef AVX2
+/* -------------------------------------------------------------------
+ * ClockLFSR sub-function as defined in snow3g standard
+ * (table look-up part only, 8 buffers at a time)
+ * S = table_Alpha_div[LFSR[11] & 0xff]
+ *   ^ table_Alpha_mul[LFSR[0] >> 24]
+ *
+ * S   [out] look-up result, one 32-bit word per buffer
+ * L0  [in]  LFSR stage 0 of the 8 buffers
+ * L11 [in]  LFSR stage 11 of the 8 buffers
+ *
+ * The remaining feedback terms (LFSR[2], LFSR[0] << 8, LFSR[11] >> 8)
+ * are not added here.
+ * ------------------------------------------------------------------ */
+static inline void C0_C11_8(__m256i *S, const __m256i *L0, const __m256i *L11)
+{
+        /*
+         * Byte selector for _mm256_shuffle_epi8: in every 32-bit element
+         * byte 0 of the source element is kept and the three other bytes
+         * are zeroed (selector bytes with bit 7 set produce 0), which
+         * yields "element & 0xff" as a table index.
+         */
+        const __m256i lowByteSel =
+                _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004,
+                                  0xF0F0F008, 0xF0F0F00C,
+                                  0xF0F0F000, 0xF0F0F004,
+                                  0xF0F0F008, 0xF0F0F00C);
+        /* Same selector moved to byte 3, yielding "element >> 24" */
+        const __m256i topByteSel =
+                _mm256_add_epi32(lowByteSel, _mm256_set1_epi32(3));
+        const __m256i idxDiv = _mm256_shuffle_epi8(*L11, lowByteSel);
+        const __m256i idxMul = _mm256_shuffle_epi8(*L0, topByteSel);
+        const __m256i vDiv =
+                _mm256_i32gather_epi32(snow3g_table_A_div, idxDiv, 4);
+        const __m256i vMul =
+                _mm256_i32gather_epi32(snow3g_table_A_mul, idxMul, 4);
+
+        *S = _mm256_xor_si256(vDiv, vMul);
+}
+#endif /* AVX2 */
+
+/* -------------------------------------------------------------------
+ * ClockLFSR sub-function as defined in snow3g standard
+ * (table look-up part only, 4 buffers at a time)
+ * S[n] = table_Alpha_div[LFSR[11] & 0xff]
+ *      ^ table_Alpha_mul[LFSR[0] >> 24]
+ *
+ * S   [out] four look-up results, one per buffer
+ * L0  [in]  LFSR stage 0 of the 4 buffers
+ * L11 [in]  LFSR stage 11 of the 4 buffers
+ * ------------------------------------------------------------------ */
+static inline void C0_C11_4(uint32_t *S, const __m128i *L0, const __m128i *L11)
+{
+        /* lowest byte of each 32-bit LFSR[11] word indexes the div table */
+        S[0] = snow3g_table_A_div[(unsigned)_mm_extract_epi8(*L11, 0)];
+        S[1] = snow3g_table_A_div[(unsigned)_mm_extract_epi8(*L11, 4)];
+        S[2] = snow3g_table_A_div[(unsigned)_mm_extract_epi8(*L11, 8)];
+        S[3] = snow3g_table_A_div[(unsigned)_mm_extract_epi8(*L11, 12)];
+
+        /* highest byte of each 32-bit LFSR[0] word indexes the mul table */
+        S[0] ^= snow3g_table_A_mul[(unsigned)_mm_extract_epi8(*L0, 3)];
+        S[1] ^= snow3g_table_A_mul[(unsigned)_mm_extract_epi8(*L0, 7)];
+        S[2] ^= snow3g_table_A_mul[(unsigned)_mm_extract_epi8(*L0, 11)];
+        S[3] ^= snow3g_table_A_mul[(unsigned)_mm_extract_epi8(*L0, 15)];
+}
+
+#ifdef AVX2
+/* -------------------------------------------------------------------
+ * ClockLFSR function as defined in snow3g standard
+ * (8 buffers at a time)
+ * S = table_Alpha_div[LFSR[11] & 0xff]
+ *   ^ table_Alpha_mul[LFSR[0] >> 24]
+ *   ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8
+ *
+ * LFSR[n] is LFSR_X[(iLFSR_X + n) % 16]. The base index is advanced
+ * by one and S becomes the new stage 15.
+ * ------------------------------------------------------------------ */
+static inline void ClockLFSR_8(snow3gKeyState8_t *pCtx)
+{
+        __m256i *const lfsr = pCtx->LFSR_X;
+        const __m256i s0 = lfsr[pCtx->iLFSR_X];
+        const __m256i s11 = lfsr[(pCtx->iLFSR_X + 11) % 16];
+        const __m256i s2 = lfsr[(pCtx->iLFSR_X + 2) % 16];
+        __m256i lookup, feedback;
+
+        /* table look-up terms */
+        C0_C11_8(&lookup, &s0, &s11);
+
+        /* shifted terms and LFSR[2] */
+        feedback = _mm256_xor_si256(_mm256_slli_epi32(s0, 8), s2);
+        feedback = _mm256_xor_si256(_mm256_srli_epi32(s11, 8), feedback);
+        feedback = _mm256_xor_si256(feedback, lookup);
+
+        /* rotate the ring, then store the new last stage */
+        ShiftLFSR_8(pCtx);
+        lfsr[(pCtx->iLFSR_X + 15) % 16] = feedback;
+}
+#endif /* AVX2 */
+
+/* -------------------------------------------------------------------
+ * ClockLFSR function as defined in snow3g standard
+ * (4 buffers at a time)
+ * S = table_Alpha_div[LFSR[11] & 0xff]
+ *   ^ table_Alpha_mul[LFSR[0] >> 24]
+ *   ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8
+ *
+ * LFSR[n] is LFSR_X[(iLFSR_X + n) % 16]. The base index is advanced
+ * by one and S becomes the new stage 15.
+ * ------------------------------------------------------------------ */
+static inline void ClockLFSR_4(snow3gKeyState4_t *pCtx)
+{
+        uint32_t lookup[4];
+        __m128i *const lfsr = pCtx->LFSR_X;
+        const __m128i s0 = lfsr[pCtx->iLFSR_X];
+        const __m128i s11 = lfsr[(pCtx->iLFSR_X + 11) % 16];
+        const __m128i s2 = lfsr[(pCtx->iLFSR_X + 2) % 16];
+        __m128i feedback, packed;
+
+        /* table look-up terms, one scalar result per buffer */
+        C0_C11_4(lookup, &s0, &s11);
+
+        /* shifted terms and LFSR[2] */
+        feedback = _mm_xor_si128(_mm_slli_epi32(s0, 8), s2);
+        feedback = _mm_xor_si128(_mm_srli_epi32(s11, 8), feedback);
+
+        ShiftLFSR_4(pCtx);
+
+        /* gather the four scalar look-ups into one register (SSE4) */
+        packed = _mm_setzero_si128();
+        packed = _mm_insert_epi32(packed, lookup[0], 0);
+        packed = _mm_insert_epi32(packed, lookup[1], 1);
+        packed = _mm_insert_epi32(packed, lookup[2], 2);
+        packed = _mm_insert_epi32(packed, lookup[3], 3);
+
+        /* store the new last stage */
+        lfsr[(pCtx->iLFSR_X + 15) % 16] = _mm_xor_si128(feedback, packed);
+}
+
+#ifdef AVX2
+/* -------------------------------------------------------------------
+ * ClockFSM function as defined in snow3g standard
+ * 8 packets at a time
+ *
+ * Writes to *data, for each of the 8 buffers, the FSM output word
+ *   (LFSR[15] + FSM_X[0]) ^ FSM_X[1]
+ * and then updates the three FSM registers from their values on entry:
+ *   FSM_X[0] <- (LFSR[5] ^ FSM_X[2]) + FSM_X[1]
+ *   FSM_X[1] <- S1_T0..S1_T3 look-ups on bytes 0..3 of FSM_X[0]
+ *   FSM_X[2] <- S2_T0..S2_T3 look-ups on bytes 0..3 of FSM_X[1]
+ * LFSR[n] stands for LFSR_X[(iLFSR_X + n) % 16].
+ * ------------------------------------------------------------------ */
+static inline void ClockFSM_8(snow3gKeyState8_t *pCtx, __m256i *data)
+{
+        /*
+         * Byte selectors for _mm256_shuffle_epi8: in every 32-bit element
+         * one source byte is moved to the low byte and the other three
+         * bytes are zeroed (selector bytes with bit 7 set produce 0), so
+         * each element becomes a table index. Only the low 4 bits of a
+         * selector byte address a byte of its own 128-bit lane, hence
+         * the same four selectors serve both the S1 and S2 look-ups.
+         */
+        const __m256i one = _mm256_set1_epi32(0x1);
+        const __m256i sel0 = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004,
+                                               0xF0F0F008, 0xF0F0F00C,
+                                               0xF0F0F000, 0xF0F0F004,
+                                               0xF0F0F008, 0xF0F0F00C);
+        const __m256i sel1 = _mm256_add_epi32(sel0, one);
+        const __m256i sel2 = _mm256_add_epi32(sel1, one);
+        const __m256i sel3 = _mm256_add_epi32(sel2, one);
+        /* FSM registers as they are on entry */
+        const __m256i fsm0 = pCtx->FSM_X[0];
+        const __m256i fsm1 = pCtx->FSM_X[1];
+        const __m256i fsm2 = pCtx->FSM_X[2];
+        __m256i F, R, s1Out, s2Out, part;
+
+        /* FSM output word and next value of FSM_X[0] */
+        F = _mm256_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], fsm0);
+        *data = _mm256_xor_si256(F, fsm1);
+        R = _mm256_xor_si256(pCtx->LFSR_X[(pCtx->iLFSR_X + 5) % 16], fsm2);
+        R = _mm256_add_epi32(R, fsm1);
+
+        /* S2 look-ups on the four bytes of FSM_X[1] */
+        s2Out = _mm256_i32gather_epi32(S2_T0,
+                                       _mm256_shuffle_epi8(fsm1, sel0), 4);
+        part = _mm256_i32gather_epi32(S2_T1,
+                                      _mm256_shuffle_epi8(fsm1, sel1), 4);
+        s2Out = _mm256_xor_si256(s2Out, part);
+        part = _mm256_i32gather_epi32(S2_T2,
+                                      _mm256_shuffle_epi8(fsm1, sel2), 4);
+        s2Out = _mm256_xor_si256(s2Out, part);
+        part = _mm256_i32gather_epi32(S2_T3,
+                                      _mm256_shuffle_epi8(fsm1, sel3), 4);
+        s2Out = _mm256_xor_si256(s2Out, part);
+
+        /* S1 look-ups on the four bytes of FSM_X[0] */
+        s1Out = _mm256_i32gather_epi32(S1_T0,
+                                       _mm256_shuffle_epi8(fsm0, sel0), 4);
+        part = _mm256_i32gather_epi32(S1_T1,
+                                      _mm256_shuffle_epi8(fsm0, sel1), 4);
+        s1Out = _mm256_xor_si256(s1Out, part);
+        part = _mm256_i32gather_epi32(S1_T2,
+                                      _mm256_shuffle_epi8(fsm0, sel2), 4);
+        s1Out = _mm256_xor_si256(s1Out, part);
+        part = _mm256_i32gather_epi32(S1_T3,
+                                      _mm256_shuffle_epi8(fsm0, sel3), 4);
+        s1Out = _mm256_xor_si256(s1Out, part);
+
+        /* commit the new FSM state (each register stored exactly once) */
+        pCtx->FSM_X[2] = s2Out;
+        pCtx->FSM_X[1] = s1Out;
+        pCtx->FSM_X[0] = R;
+}
+
+#endif /* AVX2 */
+
+/* -------------------------------------------------------------------
+ * ClockFSM function as defined in snow3g standard
+ * 4 packets at a time
+ *
+ * Writes to *data, for each of the 4 buffers, the FSM output word
+ *   (LFSR[15] + FSM_X[0]) ^ FSM_X[1]
+ * where LFSR[n] stands for LFSR_X[(iLFSR_X + n) % 16], and then
+ * updates the FSM registers.
+ *
+ * FSM_X[0] is replaced by (LFSR[5] ^ FSM_X[2]) + FSM_X[1], computed
+ * from the register values on entry. FSM_X[2], FSM_X[1] and FSM_X[0]
+ * are handed to the S1_S2_4 macro once per 32-bit lane (0..3); the
+ * macro is defined outside this section - presumably it stores the
+ * S2/S1 substitution results into FSM_X[2] and FSM_X[1]; confirm
+ * against its definition.
+ *
+ * The NO_AESNI / SAFE_LOOKUP builds use a variant of the macro that
+ * takes one more scratch variable (L).
+ * ------------------------------------------------------------------ */
+static inline void ClockFSM_4(snow3gKeyState4_t *pCtx, __m128i *data)
+{
+ __m128i F, R;
+#ifdef _WIN32
+#pragma warning(push)
+#pragma warning(disable:4556)
+#endif
+#if defined (NO_AESNI) || defined (SAFE_LOOKUP)
+ uint32_t L = 0; /* extra scratch word for the S1_S2_4 macro */
+#endif
+ uint32_t K = 0; /* scratch word for the S1_S2_4 macro */
+
+ F = _mm_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16],
+ pCtx->FSM_X[0]);
+ R = _mm_xor_si128(pCtx->LFSR_X[(pCtx->iLFSR_X + 5) % 16],
+ pCtx->FSM_X[2]);
+ *data = _mm_xor_si128(F, pCtx->FSM_X[1]); /* FSM output word */
+ R = _mm_add_epi32(R, pCtx->FSM_X[1]); /* next FSM_X[0], stored last */
+#if defined (NO_AESNI) || defined (SAFE_LOOKUP)
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 0);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 1);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 2);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 3);
+#else
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 0);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 1);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 2);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 3);
+#endif /* NO_AESNI */
+ pCtx->FSM_X[0] = R; /* written only after the macro has read the old value */
+
+#ifdef _WIN32
+#pragma warning(pop)
+#endif
+}
+
+/**
+*******************************************************************************
+* @description
+* This function generates 4 bytes of keystream 1 buffer at a time:
+* the FSM is clocked, its output is XORed with LFSR stage 0, and the
+* LFSR is clocked afterwards.
+*
+* @param[in,out] pCtx Context holding the cipher state; advanced by one clock
+* @param[out] pKeyStream Pointer to the generated 32-bit keystream word
+*
+*******************************************************************************/
+static inline void snow3g_keystream_1_4(snow3gKeyState1_t *pCtx,
+                                        uint32_t *pKeyStream)
+{
+        uint32_t word;
+
+        ClockFSM_1(pCtx, &word);
+        /* LFSR stage 0 must be read before the LFSR is clocked */
+        word ^= pCtx->LFSR_S[0];
+        *pKeyStream = word;
+        ClockLFSR_1(pCtx);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function generates 8 bytes of keystream 1 buffer at a time
+*
+* Two consecutive cipher clocks are merged into one pass. The first
+* 32-bit keystream word ends up in the upper half of *pKeyStream and
+* the second word in the lower half.
+*
+* @param[in,out] pCtx Context holding the LFSR/FSM state; advanced by two clocks
+* @param[out] pKeyStream Pointer to the generated 64-bit keystream value
+*
+*******************************************************************************/
+static inline void snow3g_keystream_1_8(snow3gKeyState1_t *pCtx,
+ uint64_t *pKeyStream)
+{
+ uint64_t F;
+ uint32_t FSM4;
+ uint32_t V0, V1;
+ uint32_t F0, F1;
+ uint32_t R0, R1;
+ uint32_t L0, L1, L11, L12;
+
+ /* Merged clock FSM + clock LFSR + clock FSM + clockLFSR
+ * in order to avoid redundancies in function processing
+ * and less instruction immediate dependencies
+ *
+ * V0 / V1 are the LFSR feedback words of the first / second clock:
+ * the second clock sees every stage shifted by one, hence the use
+ * of stages 1, 3 and 12 instead of 0, 2 and 11.
+ */
+ L0 = pCtx->LFSR_S[0];
+ V0 = pCtx->LFSR_S[2];
+ L1 = pCtx->LFSR_S[1];
+ V1 = pCtx->LFSR_S[3];
+ R1 = pCtx->FSM_R1;
+ L11 = pCtx->LFSR_S[11];
+ L12 = pCtx->LFSR_S[12];
+ V0 ^= snow3g_table_A_mul[L0 >> 24];
+ V1 ^= snow3g_table_A_mul[L1 >> 24];
+ V0 ^= snow3g_table_A_div[L11 & 0xff];
+ V1 ^= snow3g_table_A_div[L12 & 0xff];
+ V0 ^= L0 << 8;
+ V1 ^= L1 << 8;
+ V0 ^= L11 >> 8;
+ V1 ^= L12 >> 8;
+ F0 = pCtx->LFSR_S[15] + R1; /* first keystream word: (s15 + R1) ^ s0 ^ R2 */
+ F0 ^= L0;
+ F0 ^= pCtx->FSM_R2;
+ R0 = pCtx->FSM_R3 ^ pCtx->LFSR_S[5]; /* FSM R1 value after the first clock */
+ R0 += pCtx->FSM_R2;
+ S1_S2_S3_1(pCtx->FSM_R3, pCtx->FSM_R2, R1, FSM4, R0); /* macro defined elsewhere; presumably performs the first-clock S-box updates and produces FSM4 - confirm */
+ R1 = pCtx->FSM_R3 ^ pCtx->LFSR_S[6]; /* stage 6 is stage 5 as seen by the second clock */
+ F1 = V0 + R0; /* V0 is stage 15 as seen by the second clock */
+ F1 ^= L1;
+ F1 ^= pCtx->FSM_R2;
+ R1 += pCtx->FSM_R2;
+ pCtx->FSM_R3 = Snow3g_S2(pCtx->FSM_R2);
+ pCtx->FSM_R2 = FSM4;
+ pCtx->FSM_R1 = R1;
+
+ /* Shift LFSR twice */
+ ShiftTwiceLFSR_1(pCtx);
+
+ /* keystream mode LFSR update */
+ pCtx->LFSR_S[14] = V0;
+ pCtx->LFSR_S[15] = V1;
+
+ F = F0; /* first word in the upper 32 bits, second word in the lower 32 bits */
+ F <<= 32;
+ F |= (uint64_t)F1;
+
+ *pKeyStream = F;
+}
+
+#ifdef AVX2
+/**
+*******************************************************************************
+* @description
+* This function generates 8 bytes of keystream 8 buffers at a time
+*
+* The cipher is clocked twice. The two 32-bit words of every buffer are
+* interleaved into one 64-bit element, second word in the low half and
+* first word in the high half.
+*
+* @param[in,out] pCtx Context holding the cipher state; advanced by two clocks
+* @param[out] pKeyStreamLo Result of the "unpack low" interleave
+* @param[out] pKeyStreamHi Result of the "unpack high" interleave
+*
+*******************************************************************************/
+static inline void snow3g_keystream_8_8(snow3gKeyState8_t *pCtx,
+                                        __m256i *pKeyStreamLo,
+                                        __m256i *pKeyStreamHi)
+{
+        __m256i word[2];
+        int n;
+
+        /* two clocks: word[0] is the first set of 4 bytes, word[1] the next */
+        for (n = 0; n < 2; n++) {
+                ClockFSM_8(pCtx, &word[n]);
+                word[n] = _mm256_xor_si256(word[n],
+                                           pCtx->LFSR_X[pCtx->iLFSR_X]);
+                ClockLFSR_8(pCtx);
+        }
+
+        /* merge the 2 sets */
+        *pKeyStreamLo = _mm256_unpacklo_epi32(word[1], word[0]);
+        *pKeyStreamHi = _mm256_unpackhi_epi32(word[1], word[0]);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function generates 4 bytes of keystream 8 buffers at a time:
+* the FSM is clocked, its output is XORed with the current LFSR base
+* stage, and the LFSR is clocked afterwards.
+*
+* @param[in,out] pCtx Context holding the cipher state; advanced by one clock
+* @param[out] pKeyStream Pointer to generated keystream (one word per buffer)
+*
+*******************************************************************************/
+static inline void snow3g_keystream_8_4(snow3gKeyState8_t *pCtx,
+                                        __m256i *pKeyStream)
+{
+        __m256i fsmOut;
+
+        ClockFSM_8(pCtx, &fsmOut);
+        /* the base stage must be read before the LFSR is clocked */
+        fsmOut = _mm256_xor_si256(fsmOut, pCtx->LFSR_X[pCtx->iLFSR_X]);
+        *pKeyStream = fsmOut;
+        ClockLFSR_8(pCtx);
+}
+
+/**
+*****************************************************************************
+* @description
+* This function generates 32 bytes of keystream 8 buffers at a time
+*
+* Eight calls to snow3g_keystream_8_4 produce eight 32-bit words per
+* buffer (temp[w] holds word w of all 8 buffers). The words are then
+* rearranged with byte shuffles, 32-bit blends, 32-bit shuffles and
+* 128-bit lane permutes. The apparent intent is that each output
+* register ends up holding the 32 keystream bytes of a single buffer -
+* confirm the exact byte order against the callers.
+*
+* @param[in,out] pCtx Context holding the cipher state; advanced by 8 clocks
+* @param[out] pKeyStream Array of generated keystreams; entries 0..7 are written
+*
+******************************************************************************/
+static inline void snow3g_keystream_8_32(snow3gKeyState8_t *pCtx,
+ __m256i *pKeyStream)
+{
+
+ __m256i temp[8];
+
+ /** index for the final 128-bit lane permute loop */
+ int i;
+
+ /** Byte reversal on each KS */
+ __m256i mask1 = {0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL,
+ 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL};
+ /** Reversal, shifted 4 bytes right */
+ __m256i mask2 = {0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL,
+ 0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL};
+ /** Reversal, shifted 8 bytes right */
+ __m256i mask3 = {0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL,
+ 0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL};
+ /** Reversal, shifted 12 bytes right */
+ __m256i mask4 = {0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL,
+ 0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL};
+
+ snow3g_keystream_8_4(pCtx, &temp[0]); /* produces the next 4 bytes for each buffer */
+ snow3g_keystream_8_4(pCtx, &temp[1]);
+ snow3g_keystream_8_4(pCtx, &temp[2]);
+ snow3g_keystream_8_4(pCtx, &temp[3]);
+ snow3g_keystream_8_4(pCtx, &temp[4]);
+ snow3g_keystream_8_4(pCtx, &temp[5]);
+ snow3g_keystream_8_4(pCtx, &temp[6]);
+ snow3g_keystream_8_4(pCtx, &temp[7]);
+
+ temp[0] = _mm256_shuffle_epi8(temp[0], mask1); /* words 0..3 and 4..7 use the same four masks */
+ temp[1] = _mm256_shuffle_epi8(temp[1], mask2);
+ temp[2] = _mm256_shuffle_epi8(temp[2], mask3);
+ temp[3] = _mm256_shuffle_epi8(temp[3], mask4);
+ temp[4] = _mm256_shuffle_epi8(temp[4], mask1);
+ temp[5] = _mm256_shuffle_epi8(temp[5], mask2);
+ temp[6] = _mm256_shuffle_epi8(temp[6], mask3);
+ temp[7] = _mm256_shuffle_epi8(temp[7], mask4);
+
+ __m256i blended[8];
+ /* blends KS together: 128bit slice consists
+ of 4 32-bit words for one packet */
+ blended[0] = _mm256_blend_epi32(temp[0], temp[1], 0xaa);
+ blended[1] = _mm256_blend_epi32(temp[0], temp[1], 0x55);
+ blended[2] = _mm256_blend_epi32(temp[2], temp[3], 0xaa);
+ blended[3] = _mm256_blend_epi32(temp[2], temp[3], 0x55);
+ blended[4] = _mm256_blend_epi32(temp[4], temp[5], 0xaa);
+ blended[5] = _mm256_blend_epi32(temp[4], temp[5], 0x55);
+ blended[6] = _mm256_blend_epi32(temp[6], temp[7], 0xaa);
+ blended[7] = _mm256_blend_epi32(temp[6], temp[7], 0x55);
+
+ temp[0] = _mm256_blend_epi32(blended[0], blended[2], 0xcc);
+ temp[1] = _mm256_blend_epi32(blended[1], blended[3], 0x99);
+ temp[2] = _mm256_blend_epi32(blended[0], blended[2], 0x33);
+ temp[3] = _mm256_blend_epi32(blended[1], blended[3], 0x66);
+ temp[4] = _mm256_blend_epi32(blended[4], blended[6], 0xcc);
+ temp[5] = _mm256_blend_epi32(blended[5], blended[7], 0x99);
+ temp[6] = _mm256_blend_epi32(blended[4], blended[6], 0x33);
+ temp[7] = _mm256_blend_epi32(blended[5], blended[7], 0x66);
+
+ /** sorts 32 bit words back into order */
+ blended[0] = temp[0];
+ blended[1] = _mm256_shuffle_epi32(temp[1], 0x39);
+ blended[2] = _mm256_shuffle_epi32(temp[2], 0x4e);
+ blended[3] = _mm256_shuffle_epi32(temp[3], 0x93);
+ blended[4] = temp[4];
+ blended[5] = _mm256_shuffle_epi32(temp[5], 0x39);
+ blended[6] = _mm256_shuffle_epi32(temp[6], 0x4e);
+ blended[7] = _mm256_shuffle_epi32(temp[7], 0x93);
+
+ for (i = 0; i < 4; i++) {
+ pKeyStream[i] = _mm256_permute2x128_si256(blended[i],
+ blended[i + 4], 0x20); /* low 128-bit lanes of both sources */
+ pKeyStream[i + 4] = _mm256_permute2x128_si256(
+ blended[i], blended[i + 4], 0x31); /* high 128-bit lanes of both sources */
+ }
+}
+
+#endif /* AVX2 */
+
+/**
+*******************************************************************************
+* @description
+* This function generates 4 bytes of keystream 4 buffers at a time:
+* the FSM is clocked, its output is XORed with the current LFSR base
+* stage, and the LFSR is clocked afterwards.
+*
+* @param[in,out] pCtx Context holding the cipher state; advanced by one clock
+* @param[out] pKeyStream Pointer to generated keystream (one word per buffer)
+*
+*******************************************************************************/
+static inline void snow3g_keystream_4_4(snow3gKeyState4_t *pCtx,
+                                        __m128i *pKeyStream)
+{
+        __m128i fsmOut;
+
+        ClockFSM_4(pCtx, &fsmOut);
+        /* the base stage must be read before the LFSR is clocked */
+        fsmOut = _mm_xor_si128(fsmOut, pCtx->LFSR_X[pCtx->iLFSR_X]);
+        *pKeyStream = fsmOut;
+        ClockLFSR_4(pCtx);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function generates 8 bytes of keystream 4 buffers at a time
+*
+* The cipher is clocked twice. The two 32-bit words of every buffer are
+* interleaved into one 64-bit element, second word in the low half and
+* first word in the high half.
+*
+* @param[in,out] pCtx Context holding the cipher state; advanced by two clocks
+* @param[out] pKeyStreamLo Pointer to lower end of generated keystream
+*                          (64-bit results of buffers 0 and 1)
+* @param[out] pKeyStreamHi Pointer to higher end of generated keystream
+*                          (64-bit results of buffers 2 and 3)
+*
+*******************************************************************************/
+static inline void snow3g_keystream_4_8(snow3gKeyState4_t *pCtx,
+                                        __m128i *pKeyStreamLo,
+                                        __m128i *pKeyStreamHi)
+{
+        __m128i word[2];
+        int n;
+
+        /* two clocks: word[0] is the first set of 4 bytes, word[1] the next */
+        for (n = 0; n < 2; n++) {
+                ClockFSM_4(pCtx, &word[n]);
+                word[n] = _mm_xor_si128(word[n],
+                                        pCtx->LFSR_X[pCtx->iLFSR_X]);
+                ClockLFSR_4(pCtx);
+        }
+
+        /* merge the 2 sets */
+        *pKeyStreamLo = _mm_unpacklo_epi32(word[1], word[0]);
+        *pKeyStreamHi = _mm_unpackhi_epi32(word[1], word[0]);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function initializes the key schedule for 4 buffers for snow3g f8/f9.
+*
+* All 4 buffers share one key schedule; each buffer has its own IV.
+* The LFSR is filled from the key words and their complements, the
+* byte-swapped IV words are XORed into stages 9, 10, 12 and 15, the FSM
+* is zeroed and 32 initialisation rounds are run, each one feeding the
+* FSM output back into the newest LFSR stage.
+*
+* @param [out] pCtx Context where the scheduled keys are stored
+* @param [in] pKeySched Key schedule (k[0..3] are read)
+* @param [in] pIV1 IV for buffer 1 (16 bytes read, unaligned load)
+* @param [in] pIV2 IV for buffer 2 (16 bytes read, unaligned load)
+* @param [in] pIV3 IV for buffer 3 (16 bytes read, unaligned load)
+* @param [in] pIV4 IV for buffer 4 (16 bytes read, unaligned load)
+*
+*******************************************************************************/
+static inline void
+snow3gStateInitialize_4(snow3gKeyState4_t *pCtx,
+                        const snow3g_key_schedule_t *pKeySched,
+                        const void *pIV1, const void *pIV2,
+                        const void *pIV3, const void *pIV4)
+{
+        /* shuffle control reversing the bytes of every 32-bit word */
+        const uint64_t sm[2] = {0x0405060700010203ULL,
+                                0x0c0d0e0f08090a0bULL};
+        /*
+         * Load the control with an unaligned load: sm is only guaranteed
+         * to be aligned for uint64_t, so it must not be dereferenced
+         * through an __m128i pointer.
+         */
+        const __m128i swapMask = _mm_loadu_si128((const __m128i *)sm);
+        __m128i iv0, iv1, iv2, iv3;
+        __m128i lo01, hi01, lo23, hi23;
+        __m128i w0, w1, w2, w3;
+        __m128i fsmOut;
+        int i;
+
+        /* Load complete 128b IV of each buffer (SSE2) */
+        iv0 = _mm_loadu_si128((const __m128i *)pIV1);
+        iv1 = _mm_loadu_si128((const __m128i *)pIV2);
+        iv2 = _mm_loadu_si128((const __m128i *)pIV3);
+        iv3 = _mm_loadu_si128((const __m128i *)pIV4);
+
+        /*
+         * Initialize the LFSR from the key, same value for all buffers:
+         * k[i] in stages 4..7 and 12..15, ~k[i] in stages 0..3 and 8..11
+         */
+        for (i = 0; i < 4; i++) {
+                const uint32_t K = pKeySched->k[i];
+                const uint32_t L = ~K;
+                const __m128i V0 = _mm_set1_epi32(K);
+                const __m128i V1 = _mm_set1_epi32(L);
+
+                pCtx->LFSR_X[i + 4] = V0;
+                pCtx->LFSR_X[i + 12] = V0;
+                pCtx->LFSR_X[i + 0] = V1;
+                pCtx->LFSR_X[i + 8] = V1;
+        }
+
+        /* endianness swap of every IV word (SSSE3) */
+        iv0 = _mm_shuffle_epi8(iv0, swapMask);
+        iv1 = _mm_shuffle_epi8(iv1, swapMask);
+        iv2 = _mm_shuffle_epi8(iv2, swapMask);
+        iv3 = _mm_shuffle_epi8(iv3, swapMask);
+
+        /*
+         * 4x4 transpose of 32-bit words (SSE2): afterwards wN holds
+         * word N of the four IVs, one per buffer lane
+         */
+        lo01 = _mm_unpacklo_epi32(iv0, iv1);
+        hi01 = _mm_unpackhi_epi32(iv0, iv1);
+        lo23 = _mm_unpacklo_epi32(iv2, iv3);
+        hi23 = _mm_unpackhi_epi32(iv2, iv3);
+        w0 = _mm_unpacklo_epi64(lo01, lo23);
+        w1 = _mm_unpackhi_epi64(lo01, lo23);
+        w2 = _mm_unpacklo_epi64(hi01, hi23);
+        w3 = _mm_unpackhi_epi64(hi01, hi23);
+
+        /* IV ^ LFSR (SSE2) */
+        pCtx->LFSR_X[15] = _mm_xor_si128(pCtx->LFSR_X[15], w3);
+        pCtx->LFSR_X[12] = _mm_xor_si128(pCtx->LFSR_X[12], w2);
+        pCtx->LFSR_X[10] = _mm_xor_si128(pCtx->LFSR_X[10], w1);
+        pCtx->LFSR_X[9] = _mm_xor_si128(pCtx->LFSR_X[9], w0);
+        pCtx->iLFSR_X = 0;
+
+        /* FSM initialization (SSE2) */
+        for (i = 0; i < 3; i++)
+                pCtx->FSM_X[i] = _mm_setzero_si128();
+
+        /* Initialisation rounds: FSM output is fed back into the LFSR */
+        for (i = 0; i < 32; i++) {
+                ClockFSM_4(pCtx, &fsmOut);
+                ClockLFSR_4(pCtx);
+                pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm_xor_si128(
+                        pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], fsmOut);
+        }
+}
+
+#ifdef AVX2
+/**
+*******************************************************************************
+* @description
+* This function initializes the key schedule for 8 buffers with
+* individual keys, for snow3g f8/f9.
+*
+* The LFSR is filled from the per-buffer key words and their
+* complements, the byte-swapped IV words are XORed into stages 9, 10,
+* 12 and 15, the FSM is zeroed and 32 initialisation rounds are run,
+* each one feeding the FSM output back into the newest LFSR stage.
+*
+* @param [out] pCtx Context where scheduled keys are stored
+* @param [in] KeySched Array of 8 key schedule pointers, one per buffer
+*                      (k[0..3] of each are read)
+* @param [in] pIV Array of 8 IV pointers, one per buffer
+*                 (16 bytes are read from each, unaligned load)
+*
+*******************************************************************************/
+static inline void
+snow3gStateInitialize_8_multiKey(snow3gKeyState8_t *pCtx,
+ const snow3g_key_schedule_t * const KeySched[],
+ const void * const pIV[])
+{
+ DECLARE_ALIGNED(uint32_t k[8], 32);
+ DECLARE_ALIGNED(uint32_t l[8], 32);
+ __m256i *K = (__m256i *)k;
+ __m256i *L = (__m256i *)l;
+
+ int i, j;
+ __m256i mR, mS, mT, mU, T0, T1;
+
+ /* Initialize the LFSR table from constants, Keys, and IV */
+
+ /* Load two complete 128b IVs into each 256b register:
+ * buffer n in the low 128-bit lane, buffer n + 4 in the high lane.
+ * swapMask reverses the bytes of every 32-bit word. */
+ __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
+ 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL};
+ mR = _mm256_loadu2_m128i((const __m128i *)pIV[4],
+ (const __m128i *)pIV[0]);
+ mS = _mm256_loadu2_m128i((const __m128i *)pIV[5],
+ (const __m128i *)pIV[1]);
+ mT = _mm256_loadu2_m128i((const __m128i *)pIV[6],
+ (const __m128i *)pIV[2]);
+ mU = _mm256_loadu2_m128i((const __m128i *)pIV[7],
+ (const __m128i *)pIV[3]);
+
+ /* initialize the array block: key word i of every buffer goes to
+ * stages i + 4 and i + 12, its complement to stages i and i + 8 */
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 8; j++) {
+ k[j] = KeySched[j]->k[i];
+ l[j] = ~k[j];
+ }
+
+ pCtx->LFSR_X[i + 4] = *K;
+ pCtx->LFSR_X[i + 12] = *K;
+ pCtx->LFSR_X[i + 0] = *L;
+ pCtx->LFSR_X[i + 8] = *L;
+ }
+
+ /* Update the schedule structure with IVs */
+ /* Store the 8 IVs in LFSR by a column/row matrix swap
+ * after endianness correction */
+
+ /* endianness swap of every 32-bit IV word */
+ mR = _mm256_shuffle_epi8(mR, swapMask);
+ mS = _mm256_shuffle_epi8(mS, swapMask);
+ mT = _mm256_shuffle_epi8(mT, swapMask);
+ mU = _mm256_shuffle_epi8(mU, swapMask);
+
+ /* row/column dword inversion (within each 128-bit lane) */
+ T0 = _mm256_unpacklo_epi32(mR, mS);
+ mR = _mm256_unpackhi_epi32(mR, mS);
+ T1 = _mm256_unpacklo_epi32(mT, mU);
+ mT = _mm256_unpackhi_epi32(mT, mU);
+
+ /* row/column qword inversion (within each 128-bit lane):
+ * afterwards mR, mS, mT, mU hold IV word 0, 1, 2, 3 of each buffer */
+ mU = _mm256_unpackhi_epi64(mR, mT);
+ mT = _mm256_unpacklo_epi64(mR, mT);
+ mS = _mm256_unpackhi_epi64(T0, T1);
+ mR = _mm256_unpacklo_epi64(T0, T1);
+
+ /*IV ^ LFSR */
+ pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU);
+ pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT);
+ pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS);
+ pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR);
+ pCtx->iLFSR_X = 0;
+ /* FSM initialization: all three registers start at zero */
+ mS = _mm256_setzero_si256();
+ for (i = 0; i < 3; i++)
+ pCtx->FSM_X[i] = mS;
+
+ /* Initialisation rounds: the FSM output is XORed into the newest
+ * LFSR stage after each clock */
+ for (i = 0; i < 32; i++) {
+ ClockFSM_8(pCtx, &mS);
+ ClockLFSR_8(pCtx);
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256(
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS);
+ }
+}
+
+/**
+*******************************************************************************
+* @description
+* This function initializes the key schedule for 8 buffers for snow3g f8/f9.
+*
+* All 8 buffers share one key schedule; each buffer has its own IV.
+* The LFSR is filled from the key words and their complements, the
+* byte-swapped IV words are XORed into stages 9, 10, 12 and 15, the FSM
+* is zeroed and 32 initialisation rounds are run, each one feeding the
+* FSM output back into the newest LFSR stage.
+*
+* @param [out] pCtx Context where the scheduled keys are stored
+* @param [in] pKeySched Key schedule (k[0..3] are read)
+* @param [in] pIV1 IV for buffer 1 (16 bytes read, unaligned load)
+* @param [in] pIV2 IV for buffer 2 (16 bytes read, unaligned load)
+* @param [in] pIV3 IV for buffer 3 (16 bytes read, unaligned load)
+* @param [in] pIV4 IV for buffer 4 (16 bytes read, unaligned load)
+* @param [in] pIV5 IV for buffer 5 (16 bytes read, unaligned load)
+* @param [in] pIV6 IV for buffer 6 (16 bytes read, unaligned load)
+* @param [in] pIV7 IV for buffer 7 (16 bytes read, unaligned load)
+* @param [in] pIV8 IV for buffer 8 (16 bytes read, unaligned load)
+*
+*******************************************************************************/
+static inline void
+snow3gStateInitialize_8(snow3gKeyState8_t *pCtx,
+ const snow3g_key_schedule_t *pKeySched,
+ const void *pIV1, const void *pIV2,
+ const void *pIV3, const void *pIV4,
+ const void *pIV5, const void *pIV6,
+ const void *pIV7, const void *pIV8)
+{
+ uint32_t K, L;
+ int i;
+ __m256i mR, mS, mT, mU, V0, V1, T0, T1;
+
+ /* Initialize the LFSR table from constants, Keys, and IV */
+
+ /* Load two complete 128b IVs into each 256b register:
+ * buffer n in the low 128-bit lane, buffer n + 4 in the high lane.
+ * swapMask reverses the bytes of every 32-bit word. */
+ __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
+ 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL};
+ mR = _mm256_loadu2_m128i((const __m128i *)pIV5, (const __m128i *)pIV1);
+ mS = _mm256_loadu2_m128i((const __m128i *)pIV6, (const __m128i *)pIV2);
+ mT = _mm256_loadu2_m128i((const __m128i *)pIV7, (const __m128i *)pIV3);
+ mU = _mm256_loadu2_m128i((const __m128i *)pIV8, (const __m128i *)pIV4);
+
+ /* initialize the array block: key word i is broadcast to stages
+ * i + 4 and i + 12, its complement to stages i and i + 8 */
+ for (i = 0; i < 4; i++) {
+ K = pKeySched->k[i];
+ L = ~K;
+ V0 = _mm256_set1_epi32(K);
+ V1 = _mm256_set1_epi32(L);
+ pCtx->LFSR_X[i + 4] = V0;
+ pCtx->LFSR_X[i + 12] = V0;
+ pCtx->LFSR_X[i + 0] = V1;
+ pCtx->LFSR_X[i + 8] = V1;
+ }
+
+ /* Update the schedule structure with IVs */
+ /* Store the 8 IVs in LFSR by a column/row matrix swap
+ * after endianness correction */
+
+ /* endianness swap of every 32-bit IV word */
+ mR = _mm256_shuffle_epi8(mR, swapMask);
+ mS = _mm256_shuffle_epi8(mS, swapMask);
+ mT = _mm256_shuffle_epi8(mT, swapMask);
+ mU = _mm256_shuffle_epi8(mU, swapMask);
+
+ /* row/column dword inversion (within each 128-bit lane) */
+ T0 = _mm256_unpacklo_epi32(mR, mS);
+ mR = _mm256_unpackhi_epi32(mR, mS);
+ T1 = _mm256_unpacklo_epi32(mT, mU);
+ mT = _mm256_unpackhi_epi32(mT, mU);
+
+ /* row/column qword inversion (within each 128-bit lane):
+ * afterwards mR, mS, mT, mU hold IV word 0, 1, 2, 3 of each buffer */
+ mU = _mm256_unpackhi_epi64(mR, mT);
+ mT = _mm256_unpacklo_epi64(mR, mT);
+ mS = _mm256_unpackhi_epi64(T0, T1);
+ mR = _mm256_unpacklo_epi64(T0, T1);
+
+ /*IV ^ LFSR */
+ pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU);
+ pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT);
+ pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS);
+ pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR);
+ pCtx->iLFSR_X = 0;
+ /* FSM initialization: all three registers start at zero */
+ mS = _mm256_setzero_si256();
+ for (i = 0; i < 3; i++)
+ pCtx->FSM_X[i] = mS;
+
+ /* Initialisation rounds: the FSM output is XORed into the newest
+ * LFSR stage after each clock */
+ for (i = 0; i < 32; i++) {
+ ClockFSM_8(pCtx, &mS);
+ ClockLFSR_8(pCtx);
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256(
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS);
+ }
+}
+#endif /* AVX2 */
+
+/**
+*******************************************************************************
+* @description
+* Helper for a trailing partial byte: keeps only the top bit_len bits of
+* the keystream word and, for out-of-place operation, arranges for the
+* remaining bits of the output buffer to survive the final XOR.
+*
+* The shift count (SNOW3G_BLOCK_SIZE * 8 - bit_len) must stay within
+* 0..63, so bit_len must not be 0.
+*
+* @param[in,out] KS Keystream word to be masked/merged
+* @param[in] pcBufferOut Output buffer (its current content is read)
+* @param[in] pcBufferIn Input buffer (only compared with pcBufferOut)
+* @param[out] safeOutBuf Scratch buffer receiving byte_len output bytes
+* @param[in,out] safeInBuf Copy of the input; its trailing bits are cleared
+* @param[in] bit_len Number of valid bits, counted from the top of KS
+* @param[in] byte_len Number of bytes copied from pcBufferOut
+*
+*******************************************************************************/
+static inline void
+preserve_bits(uint64_t *KS,
+              const uint8_t *pcBufferOut, const uint8_t *pcBufferIn,
+              SafeBuf *safeOutBuf, SafeBuf *safeInBuf,
+              const uint8_t bit_len, const uint8_t byte_len)
+{
+        const uint64_t keepMask =
+                UINT64_MAX << (SNOW3G_BLOCK_SIZE * 8 - bit_len);
+        uint64_t keepMaskSwapped;
+
+        /* drop the keystream bits beyond the valid length */
+        *KS &= keepMask;
+
+        /* in-place: nothing to preserve from a separate output buffer */
+        if (pcBufferIn == pcBufferOut)
+                return;
+
+        /* same mask in the byte order of the buffer copies */
+        keepMaskSwapped = BSWAP64(keepMask);
+
+        /* clear the trailing bits of the input copy */
+        safeInBuf->b64 &= keepMaskSwapped;
+
+        /*
+         * Put the trailing bits of the current output into the keystream:
+         * XORed with the cleared input bits they come out unchanged
+         */
+        memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len);
+        *KS |= BSWAP64(safeOutBuf->b64 & ~keepMaskSwapped);
+}
+
+/**
+*******************************************************************************
+* @description
+* Core SNOW3G bit-level routine for the 3GPP confidentiality algorithm.
+* XORs keystream into lengthInBits bits of data that start offsetInBits
+* bits into pIn and writes the result at the same offset in pOut.
+* Keystream is drawn 64 bits at a time and shifted right by the sub-byte
+* part of the offset; the bits shifted out are carried into the next block.
+* For out-of-place operation the output bits in front of the offset are
+* kept, and a partial last byte is handled through preserve_bits().
+*
+* @param[in] pCtx Context where the scheduled keys are stored; handed
+*                 to snow3g_keystream_1_8() for every 64-bit block
+* @param[in] pIn Input buffer
+* @param[out] pOut Output buffer
+* @param[in] lengthInBits length in bits of the data to be encrypted
+* @param[in] offsetInBits offset in bits, applied to both buffers, where
+*                         the data starts
+*
+*******************************************************************************/
+static inline void f8_snow3g_bit(snow3gKeyState1_t *pCtx,
+                                 const void *pIn,
+                                 void *pOut,
+                                 const uint32_t lengthInBits,
+                                 const uint32_t offsetInBits)
+{
+        const uint8_t *pBufferIn = pIn;
+        uint8_t *pBufferOut = pOut;
+        uint32_t cipherLengthInBits = lengthInBits;
+        uint64_t shiftrem = 0;
+        uint64_t KS8, KS8bit; /* 8 bytes of keystream */
+        const uint8_t *pcBufferIn = pBufferIn + (offsetInBits / 8);
+        uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8);
+        /* Offset into the first byte (0 - 7 bits) */
+        uint32_t remainOffset = offsetInBits % 8;
+        uint32_t byteLength = (cipherLengthInBits + 7) / 8;
+        SafeBuf safeInBuf = {0};
+        SafeBuf safeOutBuf = {0};
+
+        /* Now run the block cipher */
+
+        /* Start with potential partial block (due to offset and length) */
+        snow3g_keystream_1_8(pCtx, &KS8);
+        KS8bit = KS8 >> remainOffset;
+        /* Only one block to encrypt */
+        if (cipherLengthInBits < (64 - remainOffset)) {
+                /*
+                 * NOTE(review): byteLength is derived from the data length
+                 * only. When remainOffset pushes the data across one more
+                 * byte boundary (e.g. offset 7, length 2) that extra byte
+                 * does not seem to be copied in or out here - TODO confirm
+                 * against preserve_bits().
+                 */
+                memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength);
+                /*
+                 * If operation is Out-of-place and there is offset
+                 * to be applied, "remainOffset" bits from the output buffer
+                 * need to be preserved (only applicable to first byte,
+                 * since remainOffset is up to 7 bits)
+                 */
+                if ((pIn != pOut) && remainOffset) {
+                        const uint8_t mask8 = (uint8_t)
+                                (1 << (8 - remainOffset)) - 1;
+
+                        safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+                                (pcBufferOut[0] & ~mask8);
+                }
+                /* If last byte is a partial byte, the last bits of the output
+                 * need to be preserved */
+                const uint8_t bitlen_with_off = remainOffset +
+                        cipherLengthInBits;
+
+                if ((bitlen_with_off & 0x7) != 0)
+                        preserve_bits(&KS8bit, pcBufferOut, pcBufferIn,
+                                      &safeOutBuf, &safeInBuf,
+                                      bitlen_with_off, byteLength);
+
+                xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit);
+                memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength);
+                /* use the common exit so the SAFE_DATA wipe is not skipped */
+                goto done;
+        }
+        /*
+         * If operation is Out-of-place and there is offset
+         * to be applied, "remainOffset" bits from the output buffer
+         * need to be preserved (only applicable to first byte,
+         * since remainOffset is up to 7 bits)
+         */
+        if ((pIn != pOut) && remainOffset) {
+                const uint8_t mask8 = (uint8_t)(1 << (8 - remainOffset)) - 1;
+
+                memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8);
+                safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+                        (pcBufferOut[0] & ~mask8);
+                xor_keystrm_rev(pcBufferOut, safeInBuf.b8, KS8bit);
+                pcBufferIn += SNOW3G_BLOCK_SIZE;
+        } else {
+                /* At least 64 bits to produce (including offset) */
+                pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, KS8bit);
+        }
+
+        /* keystream bits shifted out above are carried into the next block */
+        if (remainOffset != 0)
+                shiftrem = KS8 << (64 - remainOffset);
+        cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8 - remainOffset;
+        pcBufferOut += SNOW3G_BLOCK_SIZE;
+
+        while (cipherLengthInBits) {
+                /* produce the next block of keystream */
+                snow3g_keystream_1_8(pCtx, &KS8);
+                KS8bit = (KS8 >> remainOffset) | shiftrem;
+                if (remainOffset != 0)
+                        shiftrem = KS8 << (64 - remainOffset);
+                if (cipherLengthInBits >= SNOW3G_BLOCK_SIZE * 8) {
+                        pcBufferIn = xor_keystrm_rev(pcBufferOut,
+                                                     pcBufferIn, KS8bit);
+                        cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8;
+                        pcBufferOut += SNOW3G_BLOCK_SIZE;
+                        /* loop variant */
+                } else {
+                        /* end of the loop, handle the last bytes */
+                        byteLength = (cipherLengthInBits + 7) / 8;
+                        memcpy_keystrm(safeInBuf.b8, pcBufferIn,
+                                       byteLength);
+
+                        /* If last byte is a partial byte, the last bits
+                         * of the output need to be preserved */
+                        if ((cipherLengthInBits & 0x7) != 0)
+                                preserve_bits(&KS8bit, pcBufferOut,
+                                              pcBufferIn,
+                                              &safeOutBuf, &safeInBuf,
+                                              cipherLengthInBits,
+                                              byteLength);
+
+                        xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit);
+                        memcpy_keystrm(pcBufferOut, safeOutBuf.b8,
+                                       byteLength);
+                        cipherLengthInBits = 0;
+                }
+        }
+
+done:
+#ifdef SAFE_DATA
+        CLEAR_VAR(&KS8, sizeof(KS8));
+        CLEAR_VAR(&KS8bit, sizeof(KS8bit));
+        CLEAR_VAR(&shiftrem, sizeof(shiftrem));
+        CLEAR_MEM(&safeInBuf, sizeof(safeInBuf));
+        CLEAR_MEM(&safeOutBuf, sizeof(safeOutBuf));
+#endif
+        return;
+}
+
+/**
+*******************************************************************************
+* @description
+* Core byte-oriented SNOW3G routine. XORs lengthInBytes bytes of keystream
+* into the input and writes the result to the output. Whole 8-byte blocks
+* are processed directly; a trailing 1 to 7 bytes are handled with one
+* extra 4-byte or 8-byte keystream draw through zero-padded staging
+* buffers.
+*
+* @param[in] pCtx Context where the scheduled keys are stored
+* @param[in] pIn Input buffer
+* @param[out] pOut Output buffer
+* @param[in] lengthInBytes length in bytes of the data to be encrypted
+*
+*******************************************************************************/
+static inline void f8_snow3g(snow3gKeyState1_t *pCtx,
+                             const void *pIn,
+                             void *pOut,
+                             const uint32_t lengthInBytes)
+{
+        /* bytes left once all whole 8-byte blocks are done (0 to 7) */
+        const uint32_t tail = lengthInBytes & 7;
+        const uint8_t *src = pIn;
+        uint8_t *dst = pOut;
+        uint32_t blocksLeft;
+        uint32_t ks32; /* 4 bytes of keystream */
+        uint64_t ks64; /* 8 bytes of keystream */
+
+        /* whole blocks: draw and apply 64 bits of keystream at a time */
+        for (blocksLeft = lengthInBytes / SNOW3G_8_BYTES; blocksLeft != 0;
+             blocksLeft--) {
+                snow3g_keystream_1_8(pCtx, &ks64);
+                src = xor_keystrm_rev(dst, src, ks64);
+                dst += SNOW3G_8_BYTES;
+        }
+
+        if (tail > 4) {
+                /* 5 to 7 bytes left: needs a full 8-byte keystream draw */
+                uint8_t padded[8] = {0};
+                uint8_t mixed[8];
+
+                snow3g_keystream_1_8(pCtx, &ks64);
+                memcpy_keystrm(padded, src, tail);
+                xor_keystrm_rev(mixed, padded, ks64);
+                memcpy_keystrm(dst, mixed, tail);
+#ifdef SAFE_DATA
+                CLEAR_MEM(&padded, sizeof(padded));
+                CLEAR_MEM(&mixed, sizeof(mixed));
+#endif
+        } else if (tail == 4) {
+                /* exactly one word left: no staging buffer required */
+                snow3g_keystream_1_4(pCtx, &ks32);
+                xor_keystream_reverse_32(dst, src, ks32);
+        } else if (tail != 0) {
+                /* 1 to 3 bytes left: a 4-byte keystream draw is enough */
+                uint8_t padded[4] = {0};
+                uint8_t mixed[4];
+
+                snow3g_keystream_1_4(pCtx, &ks32);
+                memcpy_keystream_32(padded, src, tail);
+                xor_keystream_reverse_32(mixed, padded, ks32);
+                memcpy_keystream_32(dst, mixed, tail);
+#ifdef SAFE_DATA
+                CLEAR_MEM(&padded, sizeof(padded));
+                CLEAR_MEM(&mixed, sizeof(mixed));
+#endif
+        }
+
+#ifdef SAFE_DATA
+        CLEAR_VAR(&ks32, sizeof(ks32));
+        CLEAR_VAR(&ks64, sizeof(ks64));
+#endif
+}
+
+#ifdef AVX2
+/**
+*******************************************************************************
+* @description
+* Returns 32-bit element number "lane" of a 256-bit vector.
+* _mm256_extract_epi32() is called with a literal index in every case,
+* so the run-time lane number is dispatched through a switch.
+*
+* @param[in] vec Vector to read from
+* @param[in] lane Element index, 0 to 7; any other value yields 0
+*
+*******************************************************************************/
+static inline uint32_t snow3g_extract_lane32_avx2(const __m256i vec,
+                                                  const uint32_t lane)
+{
+        switch (lane) {
+        case 0:
+                return (uint32_t) _mm256_extract_epi32(vec, 0);
+        case 1:
+                return (uint32_t) _mm256_extract_epi32(vec, 1);
+        case 2:
+                return (uint32_t) _mm256_extract_epi32(vec, 2);
+        case 3:
+                return (uint32_t) _mm256_extract_epi32(vec, 3);
+        case 4:
+                return (uint32_t) _mm256_extract_epi32(vec, 4);
+        case 5:
+                return (uint32_t) _mm256_extract_epi32(vec, 5);
+        case 6:
+                return (uint32_t) _mm256_extract_epi32(vec, 6);
+        case 7:
+                return (uint32_t) _mm256_extract_epi32(vec, 7);
+        default:
+                return 0;
+        }
+}
+
+/**
+*******************************************************************************
+* @description
+* This function converts the state from an 8 buffer state structure to 1
+* buffer state structure by copying one lane of every LFSR and FSM word.
+* The LFSR words are read starting at the source's rotating index
+* iLFSR_X, so the destination LFSR always starts at element 0.
+*
+* @param[in] pSrcState Pointer to the source state
+* @param[out] pDstState Pointer to the destination state
+* @param[in] NumBuffers Index (0 to 7) of the buffer whose state is
+*                       extracted; any other value produces an all-zero
+*                       destination state
+*
+*******************************************************************************/
+static inline void snow3gStateConvert_8(snow3gKeyState8_t *pSrcState,
+                                        snow3gKeyState1_t *pDstState,
+                                        uint32_t NumBuffers)
+{
+        const uint32_t iLFSR_X = pSrcState->iLFSR_X;
+        const __m256i *LFSR_X = pSrcState->LFSR_X;
+        uint32_t i;
+
+        for (i = 0; i < 16; i++)
+                pDstState->LFSR_S[i] =
+                        snow3g_extract_lane32_avx2(LFSR_X[(i + iLFSR_X) % 16],
+                                                   NumBuffers);
+
+        pDstState->FSM_R1 =
+                snow3g_extract_lane32_avx2(pSrcState->FSM_X[0], NumBuffers);
+        pDstState->FSM_R2 =
+                snow3g_extract_lane32_avx2(pSrcState->FSM_X[1], NumBuffers);
+        pDstState->FSM_R3 =
+                snow3g_extract_lane32_avx2(pSrcState->FSM_X[2], NumBuffers);
+}
+#endif /* AVX2 */
+
+/**
+*******************************************************************************
+* @description
+* Returns 32-bit element number "lane" of a 128-bit vector.
+* _mm_extract_epi32() is called with a literal index in every case,
+* so the run-time lane number is dispatched through a switch.
+*
+* @param[in] vec Vector to read from
+* @param[in] lane Element index, 0 to 3; any other value yields 0
+*
+*******************************************************************************/
+static inline uint32_t snow3g_extract_lane32_sse(const __m128i vec,
+                                                 const uint32_t lane)
+{
+        switch (lane) {
+        case 0:
+                return (uint32_t) _mm_extract_epi32(vec, 0);
+        case 1:
+                return (uint32_t) _mm_extract_epi32(vec, 1);
+        case 2:
+                return (uint32_t) _mm_extract_epi32(vec, 2);
+        case 3:
+                return (uint32_t) _mm_extract_epi32(vec, 3);
+        default:
+                return 0;
+        }
+}
+
+/**
+*******************************************************************************
+* @description
+* This function converts the state from a 4 buffer state structure to 1
+* buffer state structure by copying one lane of every LFSR and FSM word.
+* The LFSR words are read starting at the source's rotating index
+* iLFSR_X, so the destination LFSR always starts at element 0.
+*
+* @param[in] pSrcState Pointer to the source state
+* @param[out] pDstState Pointer to the destination state
+* @param[in] NumBuffers Index (0 to 3) of the buffer whose state is
+*                       extracted; any other value produces an all-zero
+*                       destination state
+*
+*******************************************************************************/
+static inline void snow3gStateConvert_4(snow3gKeyState4_t *pSrcState,
+                                        snow3gKeyState1_t *pDstState,
+                                        uint32_t NumBuffers)
+{
+        const uint32_t iLFSR_X = pSrcState->iLFSR_X;
+        const __m128i *LFSR_X = pSrcState->LFSR_X;
+        uint32_t i;
+
+        for (i = 0; i < 16; i++)
+                pDstState->LFSR_S[i] =
+                        snow3g_extract_lane32_sse(LFSR_X[(i + iLFSR_X) % 16],
+                                                  NumBuffers);
+
+        pDstState->FSM_R1 =
+                snow3g_extract_lane32_sse(pSrcState->FSM_X[0], NumBuffers);
+        pDstState->FSM_R2 =
+                snow3g_extract_lane32_sse(pSrcState->FSM_X[1], NumBuffers);
+        pDstState->FSM_R3 =
+                snow3g_extract_lane32_sse(pSrcState->FSM_X[2], NumBuffers);
+}
+
+/*---------------------------------------------------------
+ * f8()
+ * Initializations and Context size definitions
+ *---------------------------------------------------------*/
+
+/* Returns the size in bytes of snow3g_key_schedule_t */
+size_t SNOW3G_KEY_SCHED_SIZE(void)
+{
+        return sizeof(snow3g_key_schedule_t);
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Fills the key schedule from a 16-byte key.
+ *      The key is read as four 32-bit words; each word is passed
+ *      through BSWAP32 and the words are stored in reverse order
+ *      (first key word ends up in k[3], last one in k[0]).
+ *      The key bytes are copied out with memcpy() rather than read
+ *      through a uint32_t pointer, so pKey may have any alignment.
+ *
+ *      pKey - pointer to the 16 key bytes
+ *      pCtx - key schedule to fill
+ *
+ *      Returns 0 on success; when built with SAFE_PARAM returns -1
+ *      if either pointer is NULL.
+ *---------------------------------------------------------*/
+int SNOW3G_INIT_KEY_SCHED(const void *pKey, snow3g_key_schedule_t *pCtx)
+{
+#ifdef SAFE_PARAM
+        if ((pKey == NULL) || (pCtx == NULL))
+                return -1;
+#endif
+
+        uint32_t keyWords[4];
+
+        memcpy(keyWords, pKey, sizeof(keyWords));
+
+        pCtx->k[3] = BSWAP32(keyWords[0]);
+        pCtx->k[2] = BSWAP32(keyWords[1]);
+        pCtx->k[1] = BSWAP32(keyWords[2]);
+        pCtx->k[0] = BSWAP32(keyWords[3]);
+
+#ifdef SAFE_DATA
+        /* do not leave a copy of the key on the stack */
+        CLEAR_MEM(&keyWords, sizeof(keyWords));
+#endif
+
+        return 0;
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 1 buffer:
+ *      Single buffer enc/dec with IV and precomputed key schedule.
+ *      When built with SAFE_PARAM the call returns without doing
+ *      anything if a pointer is NULL or the length is zero or
+ *      larger than SNOW3G_MAX_BYTELEN.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_1_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV,
+                        const void *pBufferIn,
+                        void *pBufferOut,
+                        const uint32_t lengthInBytes)
+{
+#ifdef SAFE_PARAM
+        if (pHandle == NULL || pIV == NULL)
+                return;
+        if (pBufferIn == NULL || pBufferOut == NULL)
+                return;
+        if (lengthInBytes == 0 || lengthInBytes > SNOW3G_MAX_BYTELEN)
+                return;
+#endif
+        snow3gKeyState1_t state;
+        uint32_t discarded; /* first keystream word, not used */
+
+        /* Set up the generator state from the key schedule and the IV */
+        snow3gStateInitialize_1(&state, pHandle, pIV);
+
+        /* Clock FSM and LFSR once, ignore the keystream */
+        snow3g_keystream_1_4(&state, &discarded);
+
+        f8_snow3g(&state, pBufferIn, pBufferOut, lengthInBytes);
+
+#ifdef SAFE_DATA
+        CLEAR_VAR(&discarded, sizeof(discarded));
+        CLEAR_MEM(&state, sizeof(state));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 bit 1 buffer:
+ *      Single buffer enc/dec with IV and precomputed key schedule.
+ *      Length and starting offset are given in bits.
+ *      When built with SAFE_PARAM the call returns without doing
+ *      anything if a pointer is NULL or the length is zero.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_1_BUFFER_BIT(const snow3g_key_schedule_t *pHandle,
+                            const void *pIV,
+                            const void *pBufferIn,
+                            void *pBufferOut,
+                            const uint32_t lengthInBits,
+                            const uint32_t offsetInBits)
+{
+#ifdef SAFE_PARAM
+        if (pHandle == NULL || pIV == NULL)
+                return;
+        if (pBufferIn == NULL || pBufferOut == NULL)
+                return;
+        if (lengthInBits == 0)
+                return;
+#endif
+
+        snow3gKeyState1_t state;
+        uint32_t discarded; /* first keystream word, not used */
+
+        /* Set up the generator state from the key schedule and the IV */
+        snow3gStateInitialize_1(&state, pHandle, pIV);
+
+        /* Clock FSM and LFSR once, ignore the keystream */
+        snow3g_keystream_1_4(&state, &discarded);
+
+        f8_snow3g_bit(&state, pBufferIn, pBufferOut,
+                      lengthInBits, offsetInBits);
+
+#ifdef SAFE_DATA
+        CLEAR_VAR(&discarded, sizeof(discarded));
+        CLEAR_MEM(&state, sizeof(state));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 2 buffer:
+ *      Two buffers enc/dec with the same key schedule.
+ *      The 2 IVs are independent and are passed as separate pointers.
+ *      Each buffer and data length are separate.
+ *      The two packets are processed one after the other.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_2_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV1,
+                        const void *pIV2,
+                        const void *pBufIn1,
+                        void *pBufOut1,
+                        const uint32_t lenInBytes1,
+                        const void *pBufIn2,
+                        void *pBufOut2,
+                        const uint32_t lenInBytes2)
+{
+#ifdef SAFE_PARAM
+        if (pHandle == NULL || pIV1 == NULL || pIV2 == NULL)
+                return;
+        if (pBufIn1 == NULL || pBufOut1 == NULL)
+                return;
+        if (pBufIn2 == NULL || pBufOut2 == NULL)
+                return;
+        if (lenInBytes1 == 0 || lenInBytes1 > SNOW3G_MAX_BYTELEN)
+                return;
+        if (lenInBytes2 == 0 || lenInBytes2 > SNOW3G_MAX_BYTELEN)
+                return;
+#endif
+
+        const void *ivs[2] = { pIV1, pIV2 };
+        const void *src[2] = { pBufIn1, pBufIn2 };
+        void *dst[2] = { pBufOut1, pBufOut2 };
+        const uint32_t len[2] = { lenInBytes1, lenInBytes2 };
+        snow3gKeyState1_t state[2];
+        uint32_t discarded; /* first keystream word, not used */
+        uint32_t pkt;
+
+        for (pkt = 0; pkt < 2; pkt++) {
+                /* Set up the generator state from the key and this IV */
+                snow3gStateInitialize_1(&state[pkt], pHandle, ivs[pkt]);
+
+                /* Clock FSM and LFSR once, ignore the keystream */
+                snow3g_keystream_1_4(&state[pkt], &discarded);
+
+                /* data processing for this packet */
+                f8_snow3g(&state[pkt], src[pkt], dst[pkt], len[pkt]);
+        }
+
+#ifdef SAFE_DATA
+        CLEAR_VAR(&discarded, sizeof(discarded));
+        CLEAR_MEM(&state[0], sizeof(state[0]));
+        CLEAR_MEM(&state[1], sizeof(state[1]));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 4 buffer:
+ *      Four packets enc/dec with the same key schedule.
+ *      The 4 IVs are independent and are passed as separate pointers.
+ *      Each buffer and data length are separate.
+ *
+ *      All four packets are processed together, 8 bytes per packet
+ *      and step, for as many whole 8-byte blocks as the shortest
+ *      packet holds. Whatever is left of each packet is then finished
+ *      on its own with the single-buffer routine, continuing from that
+ *      packet's lane of the 4-buffer state.
+ *
+ *      When built with SAFE_PARAM the call returns without doing
+ *      anything if a pointer is NULL or a length is zero or larger
+ *      than SNOW3G_MAX_BYTELEN.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV1,
+                        const void *pIV2,
+                        const void *pIV3,
+                        const void *pIV4,
+                        const void *pBufferIn1,
+                        void *pBufferOut1,
+                        const uint32_t lengthInBytes1,
+                        const void *pBufferIn2,
+                        void *pBufferOut2,
+                        const uint32_t lengthInBytes2,
+                        const void *pBufferIn3,
+                        void *pBufferOut3,
+                        const uint32_t lengthInBytes3,
+                        const void *pBufferIn4,
+                        void *pBufferOut4,
+                        const uint32_t lengthInBytes4)
+{
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) ||
+            (pIV1 == NULL) || (pIV2 == NULL) ||
+            (pIV3 == NULL) || (pIV4 == NULL) ||
+            (pBufferIn1 == NULL) || (pBufferOut1 == NULL) ||
+            (pBufferIn2 == NULL) || (pBufferOut2 == NULL) ||
+            (pBufferIn3 == NULL) || (pBufferOut3 == NULL) ||
+            (pBufferIn4 == NULL) || (pBufferOut4 == NULL) ||
+            (lengthInBytes1 == 0) || (lengthInBytes1 > SNOW3G_MAX_BYTELEN) ||
+            (lengthInBytes2 == 0) || (lengthInBytes2 > SNOW3G_MAX_BYTELEN) ||
+            (lengthInBytes3 == 0) || (lengthInBytes3 > SNOW3G_MAX_BYTELEN) ||
+            (lengthInBytes4 == 0) || (lengthInBytes4 > SNOW3G_MAX_BYTELEN))
+                return;
+#endif
+
+        snow3gKeyState4_t ctx;
+        snow3gKeyState1_t laneCtx; /* single-buffer state for the tails */
+        __m128i H, L; /* 8 bytes of keystream for each of the 4 packets */
+        const uint8_t *pBufIn[4] = {
+                pBufferIn1, pBufferIn2, pBufferIn3, pBufferIn4
+        };
+        uint8_t *pBufOut[4] = {
+                pBufferOut1, pBufferOut2, pBufferOut3, pBufferOut4
+        };
+        uint32_t lenInBytes[4] = {
+                lengthInBytes1, lengthInBytes2, lengthInBytes3, lengthInBytes4
+        };
+        uint32_t bytes = lenInBytes[0]; /* becomes the minimum length */
+        uint32_t qwords;
+        uint32_t i;
+
+        for (i = 1; i < 4; i++)
+                if (lenInBytes[i] < bytes)
+                        bytes = lenInBytes[i];
+
+        qwords = bytes / SNOW3G_8_BYTES;
+        bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_4(&ctx, pHandle, pIV1, pIV2, pIV3, pIV4);
+
+        /* Clock FSM and LFSR once, ignore the keystream */
+        snow3g_keystream_4_4(&ctx, &L);
+
+        /* what remains of each packet after the common part */
+        for (i = 0; i < 4; i++)
+                lenInBytes[i] -= bytes;
+
+        /* generates 8 bytes at a time on all streams */
+        while (qwords--) {
+                snow3g_keystream_4_8(&ctx, &L, &H);
+                /* packets 1-2 take their keystream from L, 3-4 from H */
+                pBufIn[0] = xor_keystrm_rev(pBufOut[0], pBufIn[0],
+                                            _mm_extract_epi64(L, 0));
+                pBufIn[1] = xor_keystrm_rev(pBufOut[1], pBufIn[1],
+                                            _mm_extract_epi64(L, 1));
+                pBufIn[2] = xor_keystrm_rev(pBufOut[2], pBufIn[2],
+                                            _mm_extract_epi64(H, 0));
+                pBufIn[3] = xor_keystrm_rev(pBufOut[3], pBufIn[3],
+                                            _mm_extract_epi64(H, 1));
+
+                for (i = 0; i < 4; i++)
+                        pBufOut[i] += SNOW3G_8_BYTES;
+        }
+
+        /* process the remaining of each buffer
+         *  - extract the LFSR and FSM structures
+         *  - Continue process 1 buffer
+         */
+        for (i = 0; i < 4; i++) {
+                if (lenInBytes[i] == 0)
+                        continue;
+
+                snow3gStateConvert_4(&ctx, &laneCtx, i);
+                f8_snow3g(&laneCtx, pBufIn[i], pBufOut[i], lenInBytes[i]);
+        }
+
+#ifdef SAFE_DATA
+        H = _mm_setzero_si128();
+        L = _mm_setzero_si128();
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        /* the extracted single-buffer state is cipher state as well */
+        CLEAR_MEM(&laneCtx, sizeof(laneCtx));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+}
+
+#ifdef AVX2
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G 8 buffer ks 8 multi:
+ *      Processes 8 packets 8 bytes at a time.
+ *      Uses individual key schedule for each buffer.
+ *
+ *      bytes         - length processed on all 8 packets together;
+ *                      rounded down to a multiple of 8 (presumably the
+ *                      shortest packet length - confirm against caller)
+ *      pKey          - 8 key schedules, one per packet
+ *      IV            - 8 IVs, one per packet
+ *      pBufferIn     - 8 input buffers
+ *      pBufferOut    - 8 output buffers
+ *      lengthInBytes - 8 packet lengths in bytes
+ *
+ *      What is left of each packet after the common part is finished
+ *      with the single-buffer routine, continuing from that packet's
+ *      lane of the 8-buffer state.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_8_multi(uint32_t bytes,
+                           const snow3g_key_schedule_t * const pKey[],
+                           const void * const IV[],
+                           const void * const pBufferIn[],
+                           void *pBufferOut[], const uint32_t *lengthInBytes)
+{
+        uint32_t qwords = bytes / SNOW3G_8_BYTES;
+        __m256i H, L; /* 8 bytes of keystream for each of the 8 packets */
+        snow3gKeyState8_t ctx;
+        snow3gKeyState1_t laneCtx; /* single-buffer state for the tails */
+        uint32_t i;
+        const uint8_t *tBufferIn[8];
+        uint8_t *tBufferOut[8];
+        uint32_t tLenInBytes[8];
+
+        bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */
+
+        /* work on local copies of the pointers and lengths */
+        for (i = 0; i < 8; i++) {
+                tBufferIn[i] = pBufferIn[i];
+                tBufferOut[i] = pBufferOut[i];
+                tLenInBytes[i] = lengthInBytes[i];
+        }
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_8_multiKey(&ctx, pKey, IV);
+
+        /* Clock FSM and LFSR once, ignore the keystream */
+        snow3g_keystream_8_4(&ctx, &L);
+
+        /* what remains of each packet after the common part */
+        for (i = 0; i < 8; i++)
+                tLenInBytes[i] -= bytes;
+
+        /* generates 8 sets at a time on all streams */
+        for (; qwords != 0; qwords--) {
+                snow3g_keystream_8_8(&ctx, &L, &H);
+
+                /* packets 0,1,4,5 take keystream from L; 2,3,6,7 from H */
+                tBufferIn[0] = xor_keystrm_rev(tBufferOut[0], tBufferIn[0],
+                                               _mm256_extract_epi64(L, 0));
+                tBufferIn[1] = xor_keystrm_rev(tBufferOut[1], tBufferIn[1],
+                                               _mm256_extract_epi64(L, 1));
+                tBufferIn[2] = xor_keystrm_rev(tBufferOut[2], tBufferIn[2],
+                                               _mm256_extract_epi64(H, 0));
+                tBufferIn[3] = xor_keystrm_rev(tBufferOut[3], tBufferIn[3],
+                                               _mm256_extract_epi64(H, 1));
+                tBufferIn[4] = xor_keystrm_rev(tBufferOut[4], tBufferIn[4],
+                                               _mm256_extract_epi64(L, 2));
+                tBufferIn[5] = xor_keystrm_rev(tBufferOut[5], tBufferIn[5],
+                                               _mm256_extract_epi64(L, 3));
+                tBufferIn[6] = xor_keystrm_rev(tBufferOut[6], tBufferIn[6],
+                                               _mm256_extract_epi64(H, 2));
+                tBufferIn[7] = xor_keystrm_rev(tBufferOut[7], tBufferIn[7],
+                                               _mm256_extract_epi64(H, 3));
+
+                for (i = 0; i < 8; i++)
+                        tBufferOut[i] += SNOW3G_8_BYTES;
+        }
+
+        /* process the remaining of each buffer
+         *  - extract the LFSR and FSM structures
+         *  - Continue process 1 buffer
+         */
+        for (i = 0; i < 8; i++) {
+                if (tLenInBytes[i] == 0)
+                        continue;
+
+                snow3gStateConvert_8(&ctx, &laneCtx, i);
+                f8_snow3g(&laneCtx, tBufferIn[i], tBufferOut[i],
+                          tLenInBytes[i]);
+        }
+
+#ifdef SAFE_DATA
+        H = _mm256_setzero_si256();
+        L = _mm256_setzero_si256();
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        /* the extracted single-buffer state is cipher state as well */
+        CLEAR_MEM(&laneCtx, sizeof(laneCtx));
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G 8 buffer ks 32 multi:
+ *      Processes 8 packets 32 bytes at a time.
+ *      Uses individual key schedule for each buffer.
+ *
+ *      bytes         - length processed on all 8 packets together;
+ *                      rounded down to a multiple of 32 (presumably the
+ *                      shortest packet length - confirm against caller)
+ *      pKey          - 8 key schedules, one per packet
+ *      IV            - 8 IVs, one per packet
+ *      pBufferIn     - 8 input buffers
+ *      pBufferOut    - 8 output buffers
+ *      lengthInBytes - 8 packet lengths in bytes
+ *
+ *      What is left of each packet after the common part is finished
+ *      with the single-buffer routine, continuing from that packet's
+ *      lane of the 8-buffer state.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_32_multi(uint32_t bytes,
+                            const snow3g_key_schedule_t * const pKey[],
+                            const void * const IV[],
+                            const void * const pBufferIn[],
+                            void *pBufferOut[], const uint32_t *lengthInBytes)
+{
+
+        snow3gKeyState8_t ctx;
+        snow3gKeyState1_t laneCtx; /* single-buffer state for the tails */
+        uint32_t i;
+
+        const uint8_t *tBufferIn[8];
+        uint8_t *tBufferOut[8];
+        uint32_t tLenInBytes[8];
+
+        /* work on local copies of the pointers and lengths */
+        for (i = 0; i < 8; i++) {
+                tBufferIn[i] = pBufferIn[i];
+                tBufferOut[i] = pBufferOut[i];
+                tLenInBytes[i] = lengthInBytes[i];
+        }
+
+        uint32_t blocks = bytes / 32;
+
+        bytes = blocks * 32; /* rounded down minimum length */
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_8_multiKey(&ctx, pKey, IV);
+
+        /* Clock FSM and LFSR once, ignore the keystream */
+        __m256i ks[8];
+
+        snow3g_keystream_8_4(&ctx, ks);
+
+        /* what remains of each packet after the common part */
+        for (i = 0; i < 8; i++)
+                tLenInBytes[i] -= bytes;
+
+        __m256i in[8];
+
+        /* generates 8 sets at a time on all streams */
+        for (i = 0; i < blocks; i++) {
+                uint32_t j;
+
+                /* read all inputs before anything is written */
+                for (j = 0; j < 8; j++)
+                        in[j] = _mm256_loadu_si256(
+                                (const __m256i *)tBufferIn[j]);
+
+                snow3g_keystream_8_32(&ctx, ks);
+
+                for (j = 0; j < 8; j++)
+                        _mm256_storeu_si256((__m256i *)tBufferOut[j],
+                                            _mm256_xor_si256(in[j], ks[j]));
+
+                /*
+                 * Advance every stream by one 32-byte block.
+                 * The index must be the stream number j, not the block
+                 * counter i: indexing with i moved only stream i (by
+                 * 8 * 32 bytes) and ran off the arrays once i reached 8.
+                 */
+                for (j = 0; j < 8; j++) {
+                        tBufferIn[j] += 32;
+                        tBufferOut[j] += 32;
+                }
+        }
+
+        /* process the remaining of each buffer
+         *  - extract the LFSR and FSM structures
+         *  - Continue process 1 buffer
+         */
+        for (i = 0; i < 8; i++) {
+                if (tLenInBytes[i] == 0)
+                        continue;
+
+                snow3gStateConvert_8(&ctx, &laneCtx, i);
+                f8_snow3g(&laneCtx, tBufferIn[i], tBufferOut[i],
+                          tLenInBytes[i]);
+        }
+
+#ifdef SAFE_DATA
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        /* the extracted single-buffer state is cipher state as well */
+        CLEAR_MEM(&laneCtx, sizeof(laneCtx));
+        CLEAR_MEM(&ks, sizeof(ks));
+        CLEAR_MEM(&in, sizeof(in));
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G 8 buffer ks 8:
+ *      Processes 8 packets 8 bytes at a time.
+ *      Uses same key schedule for each buffer.
+ *
+ *      bytes   - length processed on all 8 packets together; rounded
+ *                down to a multiple of 8 (presumably the shortest
+ *                packet length - confirm against caller)
+ *      pHandle - key schedule shared by all packets
+ *      pIV1..8 - one IV per packet
+ *      pBufferIn1..8 / pBufferOut1..8 / lengthInBytes1..8 -
+ *                input buffer, output buffer and length in bytes of
+ *                each packet
+ *
+ *      What is left of each packet after the common part is finished
+ *      with the single-buffer routine, continuing from that packet's
+ *      lane of the 8-buffer state.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_8(uint32_t bytes,
+                     const snow3g_key_schedule_t *pHandle,
+                     const void *pIV1,
+                     const void *pIV2,
+                     const void *pIV3,
+                     const void *pIV4,
+                     const void *pIV5,
+                     const void *pIV6,
+                     const void *pIV7,
+                     const void *pIV8,
+                     const void *pBufferIn1, void *pBufferOut1,
+                     const uint32_t lengthInBytes1,
+                     const void *pBufferIn2, void *pBufferOut2,
+                     const uint32_t lengthInBytes2,
+                     const void *pBufferIn3, void *pBufferOut3,
+                     const uint32_t lengthInBytes3,
+                     const void *pBufferIn4, void *pBufferOut4,
+                     const uint32_t lengthInBytes4,
+                     const void *pBufferIn5, void *pBufferOut5,
+                     const uint32_t lengthInBytes5,
+                     const void *pBufferIn6, void *pBufferOut6,
+                     const uint32_t lengthInBytes6,
+                     const void *pBufferIn7, void *pBufferOut7,
+                     const uint32_t lengthInBytes7,
+                     const void *pBufferIn8, void *pBufferOut8,
+                     const uint32_t lengthInBytes8)
+{
+
+        uint32_t qwords = bytes / SNOW3G_8_BYTES;
+        __m256i H, L; /* 8 bytes of keystream for each of the 8 packets */
+        snow3gKeyState8_t ctx;
+        snow3gKeyState1_t laneCtx; /* single-buffer state for the tails */
+        uint32_t i;
+        uint32_t lenInBytes[8] = {
+                lengthInBytes1, lengthInBytes2, lengthInBytes3, lengthInBytes4,
+                lengthInBytes5, lengthInBytes6, lengthInBytes7, lengthInBytes8
+        };
+        uint8_t *pBufOut[8] = {
+                pBufferOut1, pBufferOut2, pBufferOut3, pBufferOut4,
+                pBufferOut5, pBufferOut6, pBufferOut7, pBufferOut8
+        };
+        const uint8_t *pBufIn[8] = {
+                pBufferIn1, pBufferIn2, pBufferIn3, pBufferIn4,
+                pBufferIn5, pBufferIn6, pBufferIn7, pBufferIn8
+        };
+
+        bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_8(&ctx, pHandle, pIV1, pIV2, pIV3,
+                                pIV4, pIV5, pIV6, pIV7, pIV8);
+
+        /* Clock FSM and LFSR once, ignore the keystream */
+        snow3g_keystream_8_4(&ctx, &L);
+
+        /* what remains of each packet after the common part */
+        for (i = 0; i < 8; i++)
+                lenInBytes[i] -= bytes;
+
+        /* generates 8 sets at a time on all streams */
+        for (; qwords != 0; qwords--) {
+                snow3g_keystream_8_8(&ctx, &L, &H);
+
+                /* packets 1,2,5,6 take keystream from L; 3,4,7,8 from H */
+                pBufIn[0] = xor_keystrm_rev(pBufOut[0], pBufIn[0],
+                                            _mm256_extract_epi64(L, 0));
+                pBufIn[1] = xor_keystrm_rev(pBufOut[1], pBufIn[1],
+                                            _mm256_extract_epi64(L, 1));
+                pBufIn[2] = xor_keystrm_rev(pBufOut[2], pBufIn[2],
+                                            _mm256_extract_epi64(H, 0));
+                pBufIn[3] = xor_keystrm_rev(pBufOut[3], pBufIn[3],
+                                            _mm256_extract_epi64(H, 1));
+                pBufIn[4] = xor_keystrm_rev(pBufOut[4], pBufIn[4],
+                                            _mm256_extract_epi64(L, 2));
+                pBufIn[5] = xor_keystrm_rev(pBufOut[5], pBufIn[5],
+                                            _mm256_extract_epi64(L, 3));
+                pBufIn[6] = xor_keystrm_rev(pBufOut[6], pBufIn[6],
+                                            _mm256_extract_epi64(H, 2));
+                pBufIn[7] = xor_keystrm_rev(pBufOut[7], pBufIn[7],
+                                            _mm256_extract_epi64(H, 3));
+
+                for (i = 0; i < 8; i++)
+                        pBufOut[i] += SNOW3G_8_BYTES;
+        }
+
+        /* process the remaining of each buffer
+         *  - extract the LFSR and FSM structures
+         *  - Continue process 1 buffer
+         */
+        for (i = 0; i < 8; i++) {
+                if (lenInBytes[i] == 0)
+                        continue;
+
+                snow3gStateConvert_8(&ctx, &laneCtx, i);
+                f8_snow3g(&laneCtx, pBufIn[i], pBufOut[i], lenInBytes[i]);
+        }
+
+#ifdef SAFE_DATA
+        H = _mm256_setzero_si256();
+        L = _mm256_setzero_si256();
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        /* the extracted single-buffer state is cipher state as well */
+        CLEAR_MEM(&laneCtx, sizeof(laneCtx));
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ * Snow3G 8 buffer ks 32:
+ * Processes 8 packets 32 bytes at a time.
+ * Uses same key schedule for each buffer.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_32(uint32_t bytes,
+ const snow3g_key_schedule_t *pKey,
+ const void *pIV1, const void *pIV2,
+ const void *pIV3, const void *pIV4,
+ const void *pIV5, const void *pIV6,
+ const void *pIV7, const void *pIV8,
+ const void *pBufferIn1, void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3, void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4, void *pBufferOut4,
+ const uint32_t lengthInBytes4,
+ const void *pBufferIn5, void *pBufferOut5,
+ const uint32_t lengthInBytes5,
+ const void *pBufferIn6, void *pBufferOut6,
+ const uint32_t lengthInBytes6,
+ const void *pBufferIn7, void *pBufferOut7,
+ const uint32_t lengthInBytes7,
+ const void *pBufferIn8, void *pBufferOut8,
+ const uint32_t lengthInBytes8)
+{
+ snow3gKeyState8_t ctx;
+ uint32_t i;
+ uint32_t lenInBytes1 = lengthInBytes1;
+ uint32_t lenInBytes2 = lengthInBytes2;
+ uint32_t lenInBytes3 = lengthInBytes3;
+ uint32_t lenInBytes4 = lengthInBytes4;
+ uint32_t lenInBytes5 = lengthInBytes5;
+ uint32_t lenInBytes6 = lengthInBytes6;
+ uint32_t lenInBytes7 = lengthInBytes7;
+ uint32_t lenInBytes8 = lengthInBytes8;
+ uint8_t *pBufOut1 = pBufferOut1;
+ uint8_t *pBufOut2 = pBufferOut2;
+ uint8_t *pBufOut3 = pBufferOut3;
+ uint8_t *pBufOut4 = pBufferOut4;
+ uint8_t *pBufOut5 = pBufferOut5;
+ uint8_t *pBufOut6 = pBufferOut6;
+ uint8_t *pBufOut7 = pBufferOut7;
+ uint8_t *pBufOut8 = pBufferOut8;
+ const uint8_t *pBufIn1 = pBufferIn1;
+ const uint8_t *pBufIn2 = pBufferIn2;
+ const uint8_t *pBufIn3 = pBufferIn3;
+ const uint8_t *pBufIn4 = pBufferIn4;
+ const uint8_t *pBufIn5 = pBufferIn5;
+ const uint8_t *pBufIn6 = pBufferIn6;
+ const uint8_t *pBufIn7 = pBufferIn7;
+ const uint8_t *pBufIn8 = pBufferIn8;
+
+ uint32_t blocks = bytes / 32;
+
+ bytes = blocks * 32; /* rounded down minimum length */
+
+ /* Initialize the schedule from the IV */
+ snow3gStateInitialize_8(&ctx, pKey, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6,
+ pIV7, pIV8);
+
+ /* Clock FSM and LFSR once, ignore the keystream */
+ __m256i ks[8];
+
+ snow3g_keystream_8_4(&ctx, ks);
+
+ lenInBytes1 -= bytes;
+ lenInBytes2 -= bytes;
+ lenInBytes3 -= bytes;
+ lenInBytes4 -= bytes;
+ lenInBytes5 -= bytes;
+ lenInBytes6 -= bytes;
+ lenInBytes7 -= bytes;
+ lenInBytes8 -= bytes;
+
+ __m256i in[8];
+
+ /* generates 8 sets at a time on all streams */
+ for (i = 0; i < blocks; i++) {
+
+ in[0] = _mm256_loadu_si256((const __m256i *)pBufIn1);
+ in[1] = _mm256_loadu_si256((const __m256i *)pBufIn2);
+ in[2] = _mm256_loadu_si256((const __m256i *)pBufIn3);
+ in[3] = _mm256_loadu_si256((const __m256i *)pBufIn4);
+ in[4] = _mm256_loadu_si256((const __m256i *)pBufIn5);
+ in[5] = _mm256_loadu_si256((const __m256i *)pBufIn6);
+ in[6] = _mm256_loadu_si256((const __m256i *)pBufIn7);
+ in[7] = _mm256_loadu_si256((const __m256i *)pBufIn8);
+
+ snow3g_keystream_8_32(&ctx, ks);
+
+ _mm256_storeu_si256((__m256i *)pBufOut1,
+ _mm256_xor_si256(in[0], ks[0]));
+ _mm256_storeu_si256((__m256i *)pBufOut2,
+ _mm256_xor_si256(in[1], ks[1]));
+ _mm256_storeu_si256((__m256i *)pBufOut3,
+ _mm256_xor_si256(in[2], ks[2]));
+ _mm256_storeu_si256((__m256i *)pBufOut4,
+ _mm256_xor_si256(in[3], ks[3]));
+ _mm256_storeu_si256((__m256i *)pBufOut5,
+ _mm256_xor_si256(in[4], ks[4]));
+ _mm256_storeu_si256((__m256i *)pBufOut6,
+ _mm256_xor_si256(in[5], ks[5]));
+ _mm256_storeu_si256((__m256i *)pBufOut7,
+ _mm256_xor_si256(in[6], ks[6]));
+ _mm256_storeu_si256((__m256i *)pBufOut8,
+ _mm256_xor_si256(in[7], ks[7]));
+
+ pBufIn1 += 32;
+ pBufIn2 += 32;
+ pBufIn3 += 32;
+ pBufIn4 += 32;
+ pBufIn5 += 32;
+ pBufIn6 += 32;
+ pBufIn7 += 32;
+ pBufIn8 += 32;
+
+ pBufOut1 += 32;
+ pBufOut2 += 32;
+ pBufOut3 += 32;
+ pBufOut4 += 32;
+ pBufOut5 += 32;
+ pBufOut6 += 32;
+ pBufOut7 += 32;
+ pBufOut8 += 32;
+ }
+
+ /* process the remaining of each buffer
+ * - extract the LFSR and FSM structures
+ * - Continue process 1 buffer
+ */
+ if (lenInBytes1) {
+ snow3gKeyState1_t ctx1;
+
+ snow3gStateConvert_8(&ctx, &ctx1, 0);
+ f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1);
+ }
+
+ if (lenInBytes2) {
+ snow3gKeyState1_t ctx2;
+
+ snow3gStateConvert_8(&ctx, &ctx2, 1);
+ f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2);
+ }
+
+ if (lenInBytes3) {
+ snow3gKeyState1_t ctx3;
+
+ snow3gStateConvert_8(&ctx, &ctx3, 2);
+ f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3);
+ }
+
+ if (lenInBytes4) {
+ snow3gKeyState1_t ctx4;
+
+ snow3gStateConvert_8(&ctx, &ctx4, 3);
+ f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4);
+ }
+
+ if (lenInBytes5) {
+ snow3gKeyState1_t ctx5;
+
+ snow3gStateConvert_8(&ctx, &ctx5, 4);
+ f8_snow3g(&ctx5, pBufIn5, pBufOut5, lenInBytes5);
+ }
+
+ if (lenInBytes6) {
+ snow3gKeyState1_t ctx6;
+
+ snow3gStateConvert_8(&ctx, &ctx6, 5);
+ f8_snow3g(&ctx6, pBufIn6, pBufOut6, lenInBytes6);
+ }
+
+ if (lenInBytes7) {
+ snow3gKeyState1_t ctx7;
+
+ snow3gStateConvert_8(&ctx, &ctx7, 6);
+ f8_snow3g(&ctx7, pBufIn7, pBufOut7, lenInBytes7);
+ }
+
+ if (lenInBytes8) {
+ snow3gKeyState1_t ctx8;
+
+ snow3gStateConvert_8(&ctx, &ctx8, 7);
+ f8_snow3g(&ctx8, pBufIn8, pBufOut8, lenInBytes8);
+ }
+
+#ifdef SAFE_DATA
+ CLEAR_MEM(&ctx, sizeof(ctx));
+ CLEAR_MEM(&ks, sizeof(ks));
+ CLEAR_MEM(&in, sizeof(in));
+#endif /* SAFE_DATA */
+}
+#endif /* AVX2 */
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 8 buffer, multi-key:
+ *      Encrypts/decrypts eight packets, each with its own key schedule.
+ *      Keys, IVs, buffers and lengths are given as arrays of 8 entries.
+ *
+ *      AVX2 builds hand all eight packets to one of the 8-lane kernels,
+ *      chosen by whether the shortest length is a multiple of 32.
+ *      Other builds call SNOW3G_F8_1_BUFFER once per packet.
+ *
+ * @param pKey            8 key schedule pointers
+ * @param IV              8 IV pointers
+ * @param BufferIn        8 input buffer pointers
+ * @param BufferOut       8 output buffer pointers
+ * @param lengthInBytes   8 buffer lengths in bytes
+ *
+ *      With SAFE_PARAM the call returns without doing anything when a
+ *      pointer is NULL or a length is 0 or above SNOW3G_MAX_BYTELEN.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[],
+                                 const void * const IV[],
+                                 const void * const BufferIn[],
+                                 void *BufferOut[],
+                                 const uint32_t lengthInBytes[])
+{
+        int lane;
+
+#ifdef SAFE_PARAM
+        if (pKey == NULL || IV == NULL || BufferIn == NULL ||
+            BufferOut == NULL || lengthInBytes == NULL)
+                return;
+
+        for (lane = 0; lane < 8; lane++) {
+                if (pKey[lane] == NULL || IV[lane] == NULL ||
+                    BufferIn[lane] == NULL || BufferOut[lane] == NULL)
+                        return;
+                if (lengthInBytes[lane] == 0 ||
+                    lengthInBytes[lane] > SNOW3G_MAX_BYTELEN)
+                        return;
+        }
+#endif
+
+#ifdef AVX2
+        /* shortest of the eight lengths */
+        uint32_t minLen = lengthInBytes[0];
+
+        for (lane = 1; lane < 8; lane++)
+                minLen = (lengthInBytes[lane] < minLen) ?
+                        lengthInBytes[lane] : minLen;
+
+        if ((minLen % 32) == 0)
+                snow3g_8_buffer_ks_32_multi(minLen, pKey, IV, BufferIn,
+                                            BufferOut, lengthInBytes);
+        else
+                snow3g_8_buffer_ks_8_multi(minLen, pKey, IV, BufferIn,
+                                           BufferOut, lengthInBytes);
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#else /* ~AVX2 */
+        /* no 8-lane kernel in this build: one packet at a time */
+        for (lane = 0; lane < 8; lane++)
+                SNOW3G_F8_1_BUFFER(pKey[lane], IV[lane], BufferIn[lane],
+                                   BufferOut[lane], lengthInBytes[lane]);
+#endif /* AVX2 */
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 8 buffer:
+ *      Encrypts/decrypts eight packets with one shared key schedule.
+ *      Each packet has its own IV, input, output and length.
+ *
+ *      AVX2 builds run all eight packets through an 8-lane kernel,
+ *      chosen by whether the shortest length is a multiple of 32.
+ *      Other builds issue four SNOW3G_F8_2_BUFFER calls.
+ *
+ * @param pHandle        key schedule shared by all packets
+ * @param pIV1..pIV8     IV of each packet
+ * @param pBufInN        input of packet N
+ * @param pBufOutN       output of packet N
+ * @param lenInBytesN    length of packet N in bytes
+ *
+ *      With SAFE_PARAM the call returns without doing anything when a
+ *      pointer is NULL or a length is 0 or above SNOW3G_MAX_BYTELEN.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV1,
+                        const void *pIV2,
+                        const void *pIV3,
+                        const void *pIV4,
+                        const void *pIV5,
+                        const void *pIV6,
+                        const void *pIV7,
+                        const void *pIV8,
+                        const void *pBufIn1,
+                        void *pBufOut1,
+                        const uint32_t lenInBytes1,
+                        const void *pBufIn2,
+                        void *pBufOut2,
+                        const uint32_t lenInBytes2,
+                        const void *pBufIn3,
+                        void *pBufOut3,
+                        const uint32_t lenInBytes3,
+                        const void *pBufIn4,
+                        void *pBufOut4,
+                        const uint32_t lenInBytes4,
+                        const void *pBufIn5,
+                        void *pBufOut5,
+                        const uint32_t lenInBytes5,
+                        const void *pBufIn6,
+                        void *pBufOut6,
+                        const uint32_t lenInBytes6,
+                        const void *pBufIn7,
+                        void *pBufOut7,
+                        const uint32_t lenInBytes7,
+                        const void *pBufIn8,
+                        void *pBufOut8,
+                        const uint32_t lenInBytes8)
+{
+#ifdef SAFE_PARAM
+        /* key schedule and IVs */
+        if (pHandle == NULL ||
+            pIV1 == NULL || pIV2 == NULL || pIV3 == NULL || pIV4 == NULL ||
+            pIV5 == NULL || pIV6 == NULL || pIV7 == NULL || pIV8 == NULL)
+                return;
+
+        /* data pointers */
+        if (pBufIn1 == NULL || pBufOut1 == NULL ||
+            pBufIn2 == NULL || pBufOut2 == NULL ||
+            pBufIn3 == NULL || pBufOut3 == NULL ||
+            pBufIn4 == NULL || pBufOut4 == NULL ||
+            pBufIn5 == NULL || pBufOut5 == NULL ||
+            pBufIn6 == NULL || pBufOut6 == NULL ||
+            pBufIn7 == NULL || pBufOut7 == NULL ||
+            pBufIn8 == NULL || pBufOut8 == NULL)
+                return;
+
+        /* lengths */
+        if (lenInBytes1 == 0 || lenInBytes1 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes2 == 0 || lenInBytes2 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes3 == 0 || lenInBytes3 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes4 == 0 || lenInBytes4 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes5 == 0 || lenInBytes5 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes6 == 0 || lenInBytes6 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes7 == 0 || lenInBytes7 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes8 == 0 || lenInBytes8 > SNOW3G_MAX_BYTELEN)
+                return;
+#endif
+
+#ifdef AVX2
+        const uint32_t lens[8] = {
+                lenInBytes1, lenInBytes2, lenInBytes3, lenInBytes4,
+                lenInBytes5, lenInBytes6, lenInBytes7, lenInBytes8
+        };
+        uint32_t shortest = lens[0]; /* min number of bytes */
+        unsigned k;
+
+        for (k = 1; k < 8; k++)
+                if (lens[k] < shortest)
+                        shortest = lens[k];
+
+        if ((shortest % 32) == 0) {
+                snow3g_8_buffer_ks_32(
+                        shortest, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6,
+                        pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2,
+                        pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3,
+                        pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5,
+                        lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7,
+                        pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8);
+        } else {
+                snow3g_8_buffer_ks_8(
+                        shortest, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6,
+                        pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2,
+                        pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3,
+                        pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5,
+                        lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7,
+                        pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8);
+        }
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#else /* ~AVX2 */
+        /* no 8-lane kernel in this build: four pairs */
+        SNOW3G_F8_2_BUFFER(pHandle, pIV1, pIV2, pBufIn1, pBufOut1, lenInBytes1,
+                           pBufIn2, pBufOut2, lenInBytes2);
+
+        SNOW3G_F8_2_BUFFER(pHandle, pIV3, pIV4, pBufIn3, pBufOut3, lenInBytes3,
+                           pBufIn4, pBufOut4, lenInBytes4);
+
+        SNOW3G_F8_2_BUFFER(pHandle, pIV5, pIV6, pBufIn5, pBufOut5, lenInBytes5,
+                           pBufIn6, pBufOut6, lenInBytes6);
+
+        SNOW3G_F8_2_BUFFER(pHandle, pIV7, pIV8, pBufIn7, pBufOut7, lenInBytes7,
+                           pBufIn8, pBufOut8, lenInBytes8);
+#endif /* AVX2 */
+}
+
+/******************************************************************************
+ * @description
+ *      Snow3G F8 multi packet:
+ *      Performs F8 enc/dec on [n] packets with one shared key schedule.
+ *      Packet i is read from pBufferIn[i] and written to pBufferOut[i].
+ *      (The original note said the operation is performed in-place;
+ *      presumably callers may pass the same pointer for both - confirm.)
+ *      The input IV's are passed in Little Endian format.
+ *      The KeySchedule is in Little Endian format.
+ *
+ *      The caller's arrays are copied locally and, if needed, sorted by
+ *      decreasing length. Packets are then processed in groups of
+ *      8 (AVX2 builds only), 4, 2 and 1.
+ *
+ * @param pCtx            key schedule shared by all packets
+ * @param IV              packetCount IV pointers
+ * @param pBufferIn       packetCount input buffer pointers
+ * @param pBufferOut      packetCount output buffer pointers
+ * @param bufLenInBytes   packetCount buffer lengths in bytes
+ * @param packetCount     number of packets, at most NUM_PACKETS_16
+ *
+ *      If packetCount is too large, pBufferOut[0] is set to NULL, a
+ *      message is printed and nothing is processed. With SAFE_PARAM the
+ *      call returns without doing anything when a pointer is NULL or a
+ *      length is 0 or above SNOW3G_MAX_BYTELEN.
+ ******************************************************************************/
+void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx,
+                        const void * const IV[],
+                        const void * const pBufferIn[],
+                        void *pBufferOut[],
+                        const uint32_t bufLenInBytes[],
+                        const uint32_t packetCount)
+{
+#ifdef SAFE_PARAM
+        uint32_t i;
+
+        if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) ||
+            (pBufferOut == NULL) || (bufLenInBytes == NULL))
+                return;
+
+        for (i = 0; i < packetCount; i++)
+                if ((IV[i] == NULL) || (pBufferIn[i] == NULL) ||
+                    (pBufferOut[i] == NULL) || (bufLenInBytes[i] == 0) ||
+                    (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN))
+                        return;
+#endif
+        /* the local arrays below hold NUM_PACKETS_16 entries */
+        if (packetCount > NUM_PACKETS_16) {
+                pBufferOut[0] = NULL;
+                printf("packetCount too high (%u)\n", (unsigned) packetCount);
+                return;
+        }
+
+        uint32_t packet_index, inner_index, pktCnt = packetCount;
+        int sortNeeded = 0;
+        const void *pSrcBuf[NUM_PACKETS_16] = {NULL};
+        void *pDstBuf[NUM_PACKETS_16] = {NULL};
+        const void *pIV[NUM_PACKETS_16] = {NULL};
+        uint32_t lensBuf[NUM_PACKETS_16] = {0};
+
+        /* work on local copies so the caller's arrays keep their order */
+        memcpy(lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t));
+        memcpy(pSrcBuf, pBufferIn, packetCount * sizeof(void *));
+        memcpy(pDstBuf, pBufferOut, packetCount * sizeof(void *));
+        memcpy(pIV, IV, packetCount * sizeof(void *));
+
+        /* check if all packets are sorted by decreasing length */
+        for (packet_index = 1; packet_index < packetCount; packet_index++) {
+                if (lensBuf[packet_index - 1] < lensBuf[packet_index]) {
+                        /* this packet array is not correctly sorted */
+                        sortNeeded = 1;
+                        break;
+                }
+        }
+
+        if (sortNeeded) {
+                /* sort packets in decreasing buffer size from [0] to
+                 * [n]th packet, where buffer[0] will contain longest
+                 * buffer and buffer[n] will contain the shortest buffer.
+                 * 4 arrays are swapped :
+                 *  - pointers to input buffers
+                 *  - pointers to output buffers
+                 *  - pointers to input IV's
+                 *  - input buffer lengths
+                 */
+                packet_index = packetCount;
+                while (packet_index--) {
+                        inner_index = packet_index;
+                        while (inner_index--) {
+                                if (lensBuf[packet_index] >
+                                    lensBuf[inner_index]) {
+                                        /* swap buffers to arrange in
+                                         * descending order from [0]. */
+                                        const void *tmpSrc =
+                                                pSrcBuf[packet_index];
+                                        void *tmpDst = pDstBuf[packet_index];
+                                        const void *tmpIV = pIV[packet_index];
+                                        const uint32_t tmpLen =
+                                                lensBuf[packet_index];
+
+                                        pSrcBuf[packet_index] =
+                                                pSrcBuf[inner_index];
+                                        pDstBuf[packet_index] =
+                                                pDstBuf[inner_index];
+                                        pIV[packet_index] = pIV[inner_index];
+                                        lensBuf[packet_index] =
+                                                lensBuf[inner_index];
+
+                                        pSrcBuf[inner_index] = tmpSrc;
+                                        pDstBuf[inner_index] = tmpDst;
+                                        pIV[inner_index] = tmpIV;
+                                        lensBuf[inner_index] = tmpLen;
+                                }
+                        } /* inner sort loop */
+                } /* outer sort loop */
+        } /* if sortNeeded */
+
+        packet_index = 0;
+        /* process 8 buffers at-a-time */
+#ifdef AVX2
+        while (pktCnt >= 8) {
+                pktCnt -= 8;
+                SNOW3G_F8_8_BUFFER(pCtx, pIV[packet_index],
+                                   pIV[packet_index + 1],
+                                   pIV[packet_index + 2],
+                                   pIV[packet_index + 3],
+                                   pIV[packet_index + 4],
+                                   pIV[packet_index + 5],
+                                   pIV[packet_index + 6],
+                                   pIV[packet_index + 7],
+                                   pSrcBuf[packet_index],
+                                   pDstBuf[packet_index],
+                                   lensBuf[packet_index],
+                                   pSrcBuf[packet_index + 1],
+                                   pDstBuf[packet_index + 1],
+                                   lensBuf[packet_index + 1],
+                                   pSrcBuf[packet_index + 2],
+                                   pDstBuf[packet_index + 2],
+                                   lensBuf[packet_index + 2],
+                                   pSrcBuf[packet_index + 3],
+                                   pDstBuf[packet_index + 3],
+                                   lensBuf[packet_index + 3],
+                                   pSrcBuf[packet_index + 4],
+                                   pDstBuf[packet_index + 4],
+                                   lensBuf[packet_index + 4],
+                                   pSrcBuf[packet_index + 5],
+                                   pDstBuf[packet_index + 5],
+                                   lensBuf[packet_index + 5],
+                                   pSrcBuf[packet_index + 6],
+                                   pDstBuf[packet_index + 6],
+                                   lensBuf[packet_index + 6],
+                                   pSrcBuf[packet_index + 7],
+                                   pDstBuf[packet_index + 7],
+                                   lensBuf[packet_index + 7]);
+                packet_index += 8;
+        }
+#endif
+        /* process 4 buffers at-a-time */
+        while (pktCnt >= 4) {
+                pktCnt -= 4;
+                SNOW3G_F8_4_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pIV[packet_index + 1],
+                                   pIV[packet_index + 2],
+                                   pIV[packet_index + 3],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0],
+                                   pSrcBuf[packet_index + 1],
+                                   pDstBuf[packet_index + 1],
+                                   lensBuf[packet_index + 1],
+                                   pSrcBuf[packet_index + 2],
+                                   pDstBuf[packet_index + 2],
+                                   lensBuf[packet_index + 2],
+                                   pSrcBuf[packet_index + 3],
+                                   pDstBuf[packet_index + 3],
+                                   lensBuf[packet_index + 3]);
+                packet_index += 4;
+        }
+
+        /* process 2 packets at-a-time */
+        while (pktCnt >= 2) {
+                pktCnt -= 2;
+                SNOW3G_F8_2_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pIV[packet_index + 1],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0],
+                                   pSrcBuf[packet_index + 1],
+                                   pDstBuf[packet_index + 1],
+                                   lensBuf[packet_index + 1]);
+                packet_index += 2;
+        }
+
+        /* remaining packets are processed 1 at a time */
+        while (pktCnt--) {
+                SNOW3G_F8_1_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0]);
+                packet_index++;
+        }
+}
+
+/******************************************************************************
+ * @description
+ *      Snow3G F8 multi packet, multi-key:
+ *      Performs F8 enc/dec on [n] packets, each with its own key
+ *      schedule. Packet i is read from pBufferIn[i] and written to
+ *      pBufferOut[i].
+ *
+ *      The caller's arrays are copied locally and, if needed, sorted by
+ *      decreasing length. AVX2 builds process groups of 8 with
+ *      SNOW3G_F8_8_BUFFER_MULTIKEY; everything else goes through
+ *      SNOW3G_F8_1_BUFFER one packet at a time.
+ *
+ * @param pCtx            packetCount key schedule pointers
+ * @param IV              packetCount IV pointers
+ * @param pBufferIn       packetCount input buffer pointers
+ * @param pBufferOut      packetCount output buffer pointers
+ * @param bufLenInBytes   packetCount buffer lengths in bytes
+ * @param packetCount     number of packets, at most NUM_PACKETS_16
+ *
+ *      If packetCount is too large, pBufferOut[0] is set to NULL, a
+ *      message is printed and nothing is processed. With SAFE_PARAM the
+ *      call returns without doing anything when a pointer is NULL or a
+ *      length is 0 or above SNOW3G_MAX_BYTELEN.
+ ******************************************************************************/
+void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[],
+                                 const void * const IV[],
+                                 const void * const pBufferIn[],
+                                 void *pBufferOut[],
+                                 const uint32_t bufLenInBytes[],
+                                 const uint32_t packetCount)
+{
+#ifdef SAFE_PARAM
+        uint32_t i;
+
+        if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) ||
+            (pBufferOut == NULL) || (bufLenInBytes == NULL))
+                return;
+
+        for (i = 0; i < packetCount; i++)
+                if ((pCtx[i] == NULL) || (IV[i] == NULL) ||
+                    (pBufferIn[i] == NULL) || (pBufferOut[i] == NULL) ||
+                    (bufLenInBytes[i] == 0) ||
+                    (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN))
+                        return;
+#endif
+        /* the local arrays below hold NUM_PACKETS_16 entries */
+        if (packetCount > NUM_PACKETS_16) {
+                pBufferOut[0] = NULL;
+                printf("packetCount too high (%u)\n", (unsigned) packetCount);
+                return;
+        }
+
+        uint32_t packet_index, inner_index, pktCnt = packetCount;
+        int sortNeeded = 0;
+        const snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL};
+        const void *pSrcBuf[NUM_PACKETS_16] = {NULL};
+        void *pDstBuf[NUM_PACKETS_16] = {NULL};
+        const void *pIV[NUM_PACKETS_16] = {NULL};
+        uint32_t lensBuf[NUM_PACKETS_16] = {0};
+
+        /* work on local copies so the caller's arrays keep their order */
+        memcpy(pCtxBuf, pCtx, packetCount * sizeof(void *));
+        memcpy(lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t));
+        memcpy(pSrcBuf, pBufferIn, packetCount * sizeof(void *));
+        memcpy(pDstBuf, pBufferOut, packetCount * sizeof(void *));
+        memcpy(pIV, IV, packetCount * sizeof(void *));
+
+        /* check if all packets are sorted by decreasing length */
+        for (packet_index = 1; packet_index < packetCount; packet_index++) {
+                if (lensBuf[packet_index - 1] < lensBuf[packet_index]) {
+                        /* this packet array is not correctly sorted */
+                        sortNeeded = 1;
+                        break;
+                }
+        }
+
+        if (sortNeeded) {
+                /* sort packets in decreasing buffer size from [0] to [n]th
+                 * packet, where buffer[0] will contain longest buffer and
+                 * buffer[n] will contain the shortest buffer.
+                 * 5 arrays are swapped :
+                 *  - pointers to key schedules
+                 *  - pointers to input buffers
+                 *  - pointers to output buffers
+                 *  - pointers to input IV's
+                 *  - input buffer lengths
+                 */
+                packet_index = packetCount;
+                while (packet_index--) {
+                        inner_index = packet_index;
+                        while (inner_index--) {
+                                if (lensBuf[packet_index] >
+                                    lensBuf[inner_index]) {
+                                        /* swap buffers to arrange in
+                                         * descending order from [0]. */
+                                        const snow3g_key_schedule_t *tmpCtx =
+                                                pCtxBuf[packet_index];
+                                        const void *tmpSrc =
+                                                pSrcBuf[packet_index];
+                                        void *tmpDst = pDstBuf[packet_index];
+                                        const void *tmpIV = pIV[packet_index];
+                                        const uint32_t tmpLen =
+                                                lensBuf[packet_index];
+
+                                        pCtxBuf[packet_index] =
+                                                pCtxBuf[inner_index];
+                                        pSrcBuf[packet_index] =
+                                                pSrcBuf[inner_index];
+                                        pDstBuf[packet_index] =
+                                                pDstBuf[inner_index];
+                                        pIV[packet_index] = pIV[inner_index];
+                                        lensBuf[packet_index] =
+                                                lensBuf[inner_index];
+
+                                        pCtxBuf[inner_index] = tmpCtx;
+                                        pSrcBuf[inner_index] = tmpSrc;
+                                        pDstBuf[inner_index] = tmpDst;
+                                        pIV[inner_index] = tmpIV;
+                                        lensBuf[inner_index] = tmpLen;
+                                }
+                        } /* inner sort loop */
+                } /* outer sort loop */
+        } /* if sortNeeded */
+
+        packet_index = 0;
+        /* process 8 buffers at-a-time */
+#ifdef AVX2
+        while (pktCnt >= 8) {
+                pktCnt -= 8;
+                SNOW3G_F8_8_BUFFER_MULTIKEY(&pCtxBuf[packet_index],
+                                            &pIV[packet_index],
+                                            &pSrcBuf[packet_index],
+                                            &pDstBuf[packet_index],
+                                            &lensBuf[packet_index]);
+                packet_index += 8;
+        }
+#endif
+        /* TODO process 4 buffers at-a-time */
+        /* TODO process 2 packets at-a-time */
+        /* remaining packets are processed 1 at a time */
+        while (pktCnt--) {
+                SNOW3G_F8_1_BUFFER(pCtxBuf[packet_index + 0],
+                                   pIV[packet_index + 0],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0]);
+                packet_index++;
+        }
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F9 1 buffer
+ *      Single buffer digest with IV and precomputed key schedule
+ *
+ *      Five keystream words z[0..4] are generated. The message is then
+ *      folded 64 bits at a time (byte-swapped) into an accumulator with
+ *      multiply_and_reduce64() keyed by z[0]:z[1]; a trailing partial
+ *      block is masked to its valid bits. The bit length is folded in
+ *      with z[2]:z[3], and z[4] is XOR-ed in to give the 32-bit result.
+ *
+ * @param pHandle        key schedule
+ * @param pIV            IV
+ * @param pBufferIn      message; (lengthInBits + 7) / 8 bytes are read,
+ *                       no alignment is required
+ * @param lengthInBits   message length in bits
+ * @param pDigest        receives the 4-byte digest, no alignment is
+ *                       required
+ *
+ *      With SAFE_PARAM the call returns without doing anything when a
+ *      pointer is NULL or lengthInBits is 0 or above SNOW3G_MAX_BITLEN.
+ *---------------------------------------------------------*/
+void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV,
+                        const void *pBufferIn,
+                        const uint64_t lengthInBits,
+                        void *pDigest)
+{
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) || (pIV == NULL) ||
+            (pBufferIn == NULL) || (pDigest == NULL) ||
+            (lengthInBits == 0) || (lengthInBits > SNOW3G_MAX_BITLEN))
+                return;
+#endif
+        snow3gKeyState1_t ctx;
+        uint32_t z[5];
+        uint64_t lengthInQwords, E, V = 0, P;
+        uint64_t i, rem_bits;
+        uint32_t digest;
+        /* byte pointer: the caller's buffer may not be 8-byte aligned */
+        const uint8_t *inputBytes = (const uint8_t *)pBufferIn;
+
+        /* Initialize the snow3g key schedule */
+        snow3gStateInitialize_1(&ctx, pHandle, pIV);
+
+        /*Generate 5 keystream words*/
+        snow3g_f9_keystream_words(&ctx, &z[0]);
+
+        P = ((uint64_t)z[0] << 32) | ((uint64_t)z[1]);
+
+        lengthInQwords = lengthInBits / 64;
+
+        E = 0;
+        /* all blocks except the last one */
+        for (i = 0; i < lengthInQwords; i++) {
+                memcpy(&V, inputBytes + i * sizeof(V), sizeof(V));
+                V = BSWAP64(V);
+                E = multiply_and_reduce64(E ^ V, P);
+        }
+
+        /* last bits of last block if any left */
+        rem_bits = lengthInBits % 64;
+        if (rem_bits) {
+                /* start from zero so no stale bytes are read below */
+                V = 0;
+                /* last bytes, do not go past end of buffer */
+                memcpy(&V, inputBytes + lengthInQwords * sizeof(V),
+                       (rem_bits + 7) / 8);
+                V = BSWAP64(V);
+                V &= (((uint64_t)-1) << (64 - rem_bits)); /* mask extra bits */
+                E = multiply_and_reduce64(E ^ V, P);
+        }
+
+        /* Multiply by Q */
+        E = multiply_and_reduce64(E ^ lengthInBits,
+                                  (((uint64_t)z[2] << 32) | ((uint64_t)z[3])));
+
+        /* Final MAC */
+        digest = (uint32_t)BSWAP64(E ^ ((uint64_t)z[4] << 32));
+        memcpy(pDigest, &digest, sizeof(digest));
+#ifdef SAFE_DATA
+        CLEAR_VAR(&E, sizeof(E));
+        CLEAR_VAR(&V, sizeof(V));
+        CLEAR_VAR(&P, sizeof(P));
+        CLEAR_MEM(&z, sizeof(z));
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+#endif /* SNOW3G_COMMON_H */