1 files changed, 2840 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/include/snow3g_common.h b/src/spdk/intel-ipsec-mb/include/snow3g_common.h
new file mode 100644
index 000000000..d7c7e63c1
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/snow3g_common.h
@@ -0,0 +1,2840 @@
+/*******************************************************************************
+  Copyright (c) 2009-2019, Intel Corporation
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+      * Redistributions of source code must retain the above copyright notice,
+        this list of conditions and the following disclaimer.
+      * Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+      * Neither the name of Intel Corporation nor the names of its contributors
+        may be used to endorse or promote products derived from this software
+        without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/*-----------------------------------------------------------------------
+ *
+ * An implementation of SNOW 3G, the core algorithm for the
+ * 3GPP Confidentiality and Integrity algorithms.
+ *
+ *-----------------------------------------------------------------------*/
+
+#ifndef SNOW3G_COMMON_H
+#define SNOW3G_COMMON_H
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "intel-ipsec-mb.h"
+#include "include/snow3g.h"
+#include "include/snow3g_internal.h"
+#include "clear_regs_mem.h"
+
+#define CLEAR_MEM clear_mem
+#define CLEAR_VAR clear_var
+
+/* -------------------------------------------------------------------
+ * LFSR array shift by 1 position, 4 or 8 (AVX2) packets at a time
+ * ------------------------------------------------------------------ */
+
+#ifdef AVX2
+/* Advance the 8-packet LFSR start index by one slot, wrapping at 16 */
+static inline void ShiftLFSR_8(snow3gKeyState8_t *pCtx)
+{
+        pCtx->iLFSR_X = (1 + pCtx->iLFSR_X) & 0xF;
+}
+#endif /* AVX2 */
+
+/* Advance the 4-packet LFSR start index by one slot, wrapping at 16 */
+static inline void ShiftLFSR_4(snow3gKeyState4_t *pCtx)
+{
+        pCtx->iLFSR_X = (1 + pCtx->iLFSR_X) % 16;
+}
+
+/*---------------------------------------------------------
+ * @description
+ * Carry-less (GF(2)) multiply of a by b, reduced on every shift by the
+ * constant 0x1b. Masks use 0 - bit63: no signed right shift is needed.
+ *---------------------------------------------------------*/
+static inline uint64_t multiply_and_reduce64(uint64_t a, uint64_t b)
+{
+        uint64_t msk;
+        uint64_t res = 0;
+        uint64_t i = 64;
+
+        while (i--) {   /* consume b from its most significant bit down */
+                msk = (0 - (res >> 63)) & 0x1b; /* bit shifted out of res */
+                res <<= 1;
+                res ^= msk;
+                msk = (0 - (b >> 63)) & a;      /* add a if top bit of b */
+                b <<= 1;
+                res ^= msk;
+        }
+        return res;
+}
+
+#ifdef AVX2
+/* -------------------------------------------------------------------
+ * Table part of the ClockLFSR feedback, 8 packets at a time:
+ * *S = table_Alpha_div[LFSR[11] & 0xff]
+ *       ^ table_Alpha_mul[LFSR[0] >> 24]
+ * (ClockLFSR_8 adds LFSR[2] and the byte-shifted terms)
+ * ------------------------------------------------------------------ */
+static void C0_C11_8(__m256i *S, const __m256i *L0, const __m256i *L11)
+{
+        __m256i byteSel, mulWords, divIdx, mulIdx;
+        const __m256i three = _mm256_set1_epi32(3);
+        /* keep byte 0 of every dword, zero the remaining three bytes */
+        byteSel = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008,
+                                    0xF0F0F00C, 0xF0F0F000, 0xF0F0F004,
+                                    0xF0F0F008, 0xF0F0F00C);
+        divIdx = _mm256_shuffle_epi8(*L11, byteSel);
+        *S = _mm256_i32gather_epi32(snow3g_table_A_div, divIdx, 4);
+        /* same selector moved to byte 3, i.e. the top byte of each dword */
+        byteSel = _mm256_add_epi32(byteSel, three);
+        mulIdx = _mm256_shuffle_epi8(*L0, byteSel);
+        mulWords = _mm256_i32gather_epi32(snow3g_table_A_mul, mulIdx, 4);
+        *S = _mm256_xor_si256(*S, mulWords);
+}
+#endif /* AVX2 */
+
+/* -------------------------------------------------------------------
+ * Table part of the ClockLFSR feedback, 4 packets at a time:
+ * S[n] = table_Alpha_div[LFSR[11] & 0xff]
+ *          ^ table_Alpha_mul[LFSR[0] >> 24]
+ * (ClockLFSR_4 adds LFSR[2] and the byte-shifted terms)
+ * ------------------------------------------------------------------ */
+static inline void C0_C11_4(uint32_t *S, const __m128i *L0, const __m128i *L11)
+{
+        unsigned divIdx[4], mulIdx[4];
+        /* low byte of each LFSR[11] dword */
+        divIdx[0] = _mm_extract_epi8(*L11, 0);
+        divIdx[1] = _mm_extract_epi8(*L11, 4);
+        divIdx[2] = _mm_extract_epi8(*L11, 8);
+        divIdx[3] = _mm_extract_epi8(*L11, 12);
+
+        S[0] = snow3g_table_A_div[divIdx[0]];
+        S[1] = snow3g_table_A_div[divIdx[1]];
+        S[2] = snow3g_table_A_div[divIdx[2]];
+        S[3] = snow3g_table_A_div[divIdx[3]];
+        /* top byte of each LFSR[0] dword */
+        mulIdx[0] = _mm_extract_epi8(*L0, 3);
+        mulIdx[1] = _mm_extract_epi8(*L0, 7);
+        mulIdx[2] = _mm_extract_epi8(*L0, 11);
+        mulIdx[3] = _mm_extract_epi8(*L0, 15);
+
+        S[0] ^= snow3g_table_A_mul[mulIdx[0]];
+        S[1] ^= snow3g_table_A_mul[mulIdx[1]];
+        S[2] ^= snow3g_table_A_mul[mulIdx[2]];
+        S[3] ^= snow3g_table_A_mul[mulIdx[3]];
+}
+
+#ifdef AVX2
+/* -------------------------------------------------------------------
+ * ClockLFSR function as defined in snow3g standard, 8 packets at a time
+ * new word =  table_Alpha_div[LFSR[11] & 0xff]
+ *       ^ table_Alpha_mul[LFSR[0] >> 24]
+ *       ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8
+ * ------------------------------------------------------------------ */
+static inline void ClockLFSR_8(snow3gKeyState8_t *pCtx)
+{
+        __m256i tblTerm, fb;
+        __m256i s0 = pCtx->LFSR_X[pCtx->iLFSR_X];
+        __m256i s11 = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) % 16];
+
+        /* table lookups on the top byte of s0 and the low byte of s11 */
+        C0_C11_8(&tblTerm, &s0, &s11);
+
+        /* (s0 << 8) ^ LFSR[2] ^ (s11 >> 8), read before the index moves */
+        fb = _mm256_xor_si256(_mm256_slli_epi32(s0, 8),
+                              pCtx->LFSR_X[(pCtx->iLFSR_X + 2) % 16]);
+        fb = _mm256_xor_si256(fb, _mm256_srli_epi32(s11, 8));
+
+        ShiftLFSR_8(pCtx);
+
+        /* after the shift, slot +15 is the one that held s0 */
+        fb = _mm256_xor_si256(fb, tblTerm);
+        pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = fb;
+}
+#endif /* AVX2 */
+
+/* -------------------------------------------------------------------
+ * ClockLFSR function as defined in snow3g standard, 4 packets at a time
+ * new word =  table_Alpha_div[LFSR[11] & 0xff]
+ *       ^ table_Alpha_mul[LFSR[0] >> 24]
+ *       ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8
+ * ------------------------------------------------------------------ */
+static inline void ClockLFSR_4(snow3gKeyState4_t *pCtx)
+{
+        uint32_t tbl[4];
+        __m128i tblTerm, fb;
+        __m128i s0 = pCtx->LFSR_X[pCtx->iLFSR_X];
+        __m128i s11 = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) % 16];
+
+        C0_C11_4(tbl, &s0, &s11);
+
+        /* (s0 << 8) ^ LFSR[2] ^ (s11 >> 8), read before the index moves */
+        fb = _mm_xor_si128(_mm_slli_epi32(s0, 8),
+                           pCtx->LFSR_X[(pCtx->iLFSR_X + 2) % 16]);
+        fb = _mm_xor_si128(fb, _mm_srli_epi32(s11, 8));
+        ShiftLFSR_4(pCtx);
+
+        /* pack the scalar table words (SSE4); every lane is overwritten */
+        tblTerm = _mm_insert_epi32(s0, tbl[0], 0);
+        tblTerm = _mm_insert_epi32(tblTerm, tbl[1], 1);
+        tblTerm = _mm_insert_epi32(tblTerm, tbl[2], 2);
+        tblTerm = _mm_insert_epi32(tblTerm, tbl[3], 3);
+        fb = _mm_xor_si128(fb, tblTerm);
+        pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = fb;
+}
+
+#ifdef AVX2
+/* -------------------------------------------------------------------
+ * ClockFSM function as defined in snow3g standard, 8 packets at a time.
+ * *data = (LFSR[15] + FSM_X[0]) ^ FSM_X[1], LFSR[n] relative to iLFSR_X.
+ * ------------------------------------------------------------------ */
+static inline void ClockFSM_8(snow3gKeyState8_t *pCtx, __m256i *data)
+{
+        __m256i F, R, S2T0, S2T1, S2T2, S2T3, S1T0, S1T1, S1T2, S1T3;
+        __m256i w3, w2, w1, w0, offset, mask;
+
+        F = _mm256_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15)%16],
+                             pCtx->FSM_X[0]);
+        R = _mm256_xor_si256(pCtx->LFSR_X[(pCtx->iLFSR_X + 5)%16],
+                             pCtx->FSM_X[2]);
+        *data = _mm256_xor_si256(F, pCtx->FSM_X[1]);
+        R = _mm256_add_epi32(R, pCtx->FSM_X[1]);
+        offset = _mm256_set1_epi32(0x1);
+        /* S2_T0..S2_T3 are indexed by bytes 0..3 of each FSM_X[1] dword */
+        F = pCtx->FSM_X[1];
+        w3   = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008,
+                                 0xF0F0F00C, 0xF0F0F000, 0xF0F0F004,
+                                 0xF0F0F008, 0xF0F0F00C);
+        mask = _mm256_shuffle_epi8(F,w3);
+        S2T0 = _mm256_i32gather_epi32(S2_T0,mask,4);
+
+        w2   = _mm256_add_epi32(w3,offset);
+        mask = _mm256_shuffle_epi8(F,w2);
+        S2T1 = _mm256_i32gather_epi32(S2_T1,mask,4);
+
+        w1   = _mm256_add_epi32(w2,offset);
+        mask = _mm256_shuffle_epi8(F,w1);
+        S2T2 = _mm256_i32gather_epi32(S2_T2,mask,4);
+
+        w0   = _mm256_add_epi32(w1,offset);
+        mask = _mm256_shuffle_epi8(F,w0);
+        S2T3 = _mm256_i32gather_epi32(S2_T3,mask,4);
+
+        /* S1_T0..S1_T3 are indexed by bytes 0..3 of each FSM_X[0] dword */
+        F = pCtx->FSM_X[0];
+        w3   = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008,
+                                 0xF0F0F00C, 0xF0F0F010, 0xF0F0F014,
+                                 0xF0F0F018, 0xF0F0F01C);
+        mask = _mm256_shuffle_epi8(F,w3);
+        S1T0 = _mm256_i32gather_epi32(S1_T0,mask,4);
+
+        w2   = _mm256_add_epi32(w3,offset);
+        mask = _mm256_shuffle_epi8(F,w2);
+        S1T1 = _mm256_i32gather_epi32(S1_T1,mask,4);
+
+        w1   = _mm256_add_epi32(w2,offset);
+        mask = _mm256_shuffle_epi8(F,w1);
+        S1T2 = _mm256_i32gather_epi32(S1_T2,mask,4);
+
+        w0   = _mm256_add_epi32(w1,offset);
+        mask = _mm256_shuffle_epi8(F,w0);
+        S1T3 = _mm256_i32gather_epi32(S1_T3,mask,4);
+
+        S2T0 = _mm256_xor_si256(S2T0, S2T1);
+        S2T2 = _mm256_xor_si256(S2T2, S2T3);
+        S2T0 = _mm256_xor_si256(S2T0, S2T2);
+
+        S1T0 = _mm256_xor_si256(S1T0, S1T1);
+        S1T2 = _mm256_xor_si256(S1T2, S1T3);
+        S1T0 = _mm256_xor_si256(S1T0, S1T2);
+
+        /* rotate the FSM using the state read above:
+         * X[2] <- S2 tables of X[1], X[1] <- S1 tables of X[0], X[0] <- R */
+        pCtx->FSM_X[2] = S2T0;
+        pCtx->FSM_X[1] = S1T0;
+        pCtx->FSM_X[0] = R;
+}
+
+#endif /* AVX2 */
+
+/* -------------------------------------------------------------------
+ * ClockFSM function as defined in snow3g standard, 4 packets at a time.
+ * *data = (LFSR[15] + FSM_X[0]) ^ FSM_X[1], LFSR[n] relative to iLFSR_X.
+ * ------------------------------------------------------------------ */
+static inline void ClockFSM_4(snow3gKeyState4_t *pCtx, __m128i *data)
+{
+        __m128i F, R;
+#ifdef _WIN32
+#pragma warning(push)
+#pragma warning(disable:4556)
+#endif
+#if defined (NO_AESNI) || defined (SAFE_LOOKUP)
+        uint32_t L = 0; /* only passed to the 6-argument S1_S2_4 form */
+#endif
+        uint32_t K = 0; /* passed to every S1_S2_4 invocation below */
+        /* S1_S2_4 is defined elsewhere; its last argument runs 0..3 below */
+        F = _mm_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16],
+                          pCtx->FSM_X[0]);
+        R = _mm_xor_si128(pCtx->LFSR_X[(pCtx->iLFSR_X + 5) % 16],
+                          pCtx->FSM_X[2]);
+        *data = _mm_xor_si128(F, pCtx->FSM_X[1]);
+        R = _mm_add_epi32(R, pCtx->FSM_X[1]);
+#if defined (NO_AESNI) || defined (SAFE_LOOKUP)
+        S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 0);
+        S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 1);
+        S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 2);
+        S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 3);
+#else
+        S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 0);
+        S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 1);
+        S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 2);
+        S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 3);
+#endif /* NO_AESNI || SAFE_LOOKUP */
+        pCtx->FSM_X[0] = R; /* R was built before the S1_S2_4 calls */
+
+#ifdef _WIN32
+#pragma warning(pop)
+#endif
+}
+
+/**
+*******************************************************************************
+* @description
+* Produces one 32-bit keystream word (4 bytes) for a single buffer
+*
+* @param[in/out] pCtx       State handed to ClockFSM_1 and ClockLFSR_1
+* @param[out]    pKeyStream Receives the generated keystream word
+*
+*******************************************************************************/
+static inline void snow3g_keystream_1_4(snow3gKeyState1_t *pCtx,
+                                        uint32_t *pKeyStream)
+{
+        uint32_t fsmWord;
+
+        ClockFSM_1(pCtx, &fsmWord);
+        *pKeyStream = pCtx->LFSR_S[0] ^ fsmWord;
+        ClockLFSR_1(pCtx);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function generates 8 bytes of keystream 1 buffer at a time
+*
+* @param[in/out]        pCtx         State; FSM_R1..R3 and LFSR_S are updated
+* @param[out]           pKeyStream   Receives (first word << 32) | second word
+*
+*******************************************************************************/
+static inline void snow3g_keystream_1_8(snow3gKeyState1_t *pCtx,
+                                        uint64_t *pKeyStream)
+{
+        uint64_t F;
+        uint32_t FSM4;
+        uint32_t V0, V1;
+        uint32_t F0, F1;
+        uint32_t R0, R1;
+        uint32_t L0, L1, L11, L12;
+
+        /* Two FSM clocks and two LFSR clocks are merged here to avoid
+         * redundant work and immediate instruction dependencies.
+         * S1_S2_S3_1 is defined outside this view - TODO confirm what
+         * it writes to FSM_R3, FSM_R2 and FSM4 before reordering. */
+        L0 = pCtx->LFSR_S[0];
+        V0 = pCtx->LFSR_S[2];
+        L1 = pCtx->LFSR_S[1];
+        V1 = pCtx->LFSR_S[3];
+        R1 = pCtx->FSM_R1;
+        L11 = pCtx->LFSR_S[11];
+        L12 = pCtx->LFSR_S[12];
+        V0 ^= snow3g_table_A_mul[L0 >> 24];   /* V0: 1st LFSR feedback word */
+        V1 ^= snow3g_table_A_mul[L1 >> 24];   /* V1: 2nd LFSR feedback word */
+        V0 ^= snow3g_table_A_div[L11 & 0xff];
+        V1 ^= snow3g_table_A_div[L12 & 0xff];
+        V0 ^= L0 << 8;
+        V1 ^= L1 << 8;
+        V0 ^= L11 >> 8;
+        V1 ^= L12 >> 8;
+        F0 = pCtx->LFSR_S[15] + R1;           /* F0: 1st output word */
+        F0 ^= L0;
+        F0 ^= pCtx->FSM_R2;
+        R0 = pCtx->FSM_R3 ^ pCtx->LFSR_S[5];  /* R0 replaces R1 in F1 below */
+        R0 += pCtx->FSM_R2;
+        S1_S2_S3_1(pCtx->FSM_R3, pCtx->FSM_R2, R1, FSM4, R0);
+        R1 = pCtx->FSM_R3 ^ pCtx->LFSR_S[6];
+        F1 = V0 + R0;                         /* F1: 2nd output word */
+        F1 ^= L1;
+        F1 ^= pCtx->FSM_R2;
+        R1 += pCtx->FSM_R2;
+        pCtx->FSM_R3 = Snow3g_S2(pCtx->FSM_R2);
+        pCtx->FSM_R2 = FSM4;
+        pCtx->FSM_R1 = R1;
+
+        /* Shift LFSR twice */
+        ShiftTwiceLFSR_1(pCtx);
+
+        /* keystream mode LFSR update: the two new words fill the tail */
+        pCtx->LFSR_S[14] = V0;
+        pCtx->LFSR_S[15] = V1;
+        /* first word goes to the upper 32 bits, second to the lower */
+        F = F0;
+        F <<= 32;
+        F |= (uint64_t)F1;
+
+        *pKeyStream = F;
+}
+
+#ifdef AVX2
+/**
+*******************************************************************************
+* @description
+* This function generates 8 bytes of keystream 8 buffers at a time
+*
+* @param[in/out]        pCtx         Context where the scheduled keys are stored
+* @param[out]           pKeyStreamLo Interleaved low dword pairs of each lane
+* @param[out]           pKeyStreamHi Interleaved high dword pairs of each lane
+*******************************************************************************/
+static inline void snow3g_keystream_8_8(snow3gKeyState8_t *pCtx,
+                                        __m256i *pKeyStreamLo,
+                                        __m256i *pKeyStreamHi)
+{
+        __m256i first, second;
+
+        /* clock once: FSM output ^ LFSR head gives the first 4 bytes */
+        ClockFSM_8(pCtx, &first);
+        first = _mm256_xor_si256(first, pCtx->LFSR_X[pCtx->iLFSR_X]);
+        ClockLFSR_8(pCtx);
+
+        /* clock again for the following 4 bytes */
+        ClockFSM_8(pCtx, &second);
+        second = _mm256_xor_si256(second, pCtx->LFSR_X[pCtx->iLFSR_X]);
+        ClockLFSR_8(pCtx);
+
+        /* interleave: each 64-bit element is (first << 32) | second */
+        *pKeyStreamLo = _mm256_unpacklo_epi32(second, first);
+        *pKeyStreamHi = _mm256_unpackhi_epi32(second, first);
+}
+
+/**
+*******************************************************************************
+* @description
+* Produces one 32-bit keystream word (4 bytes) for each of 8 buffers
+*
+* @param[in/out]        pCtx         Context where the scheduled keys are stored
+* @param[out]           pKeyStream   Receives the 8 generated keystream words
+*
+*******************************************************************************/
+static inline void snow3g_keystream_8_4(snow3gKeyState8_t *pCtx,
+                                        __m256i *pKeyStream)
+{
+        __m256i fsmOut;
+
+        ClockFSM_8(pCtx, &fsmOut);
+        *pKeyStream = _mm256_xor_si256(pCtx->LFSR_X[pCtx->iLFSR_X], fsmOut);
+        ClockLFSR_8(pCtx);
+}
+
+/**
+*****************************************************************************
+* @description
+* This function generates 32 bytes of keystream 8 buffers at a time
+*
+* @param[in/out]        pCtx         Context where the scheduled keys are stored
+* @param[out]           pKeyStream   Array of 8 vectors, [0]..[7] are written
+*
+******************************************************************************/
+static inline void snow3g_keystream_8_32(snow3gKeyState8_t *pCtx,
+                                         __m256i *pKeyStream)
+{
+
+        __m256i temp[8];
+
+        /** index for the final 128-bit lane permute loop */
+        int i;
+
+        /** Byte reversal on each KS */
+        __m256i mask1 = {0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL,
+                         0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL};
+        /** Reversal, shifted 4 bytes right */
+        __m256i mask2 = {0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL,
+                         0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL};
+        /** Reversal, shifted 8 bytes right */
+        __m256i mask3 = {0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL,
+                         0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL};
+        /** Reversal, shifted 12 bytes right */
+        __m256i mask4 = {0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL,
+                         0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL};
+        /** eight clocks: temp[n] holds the n-th 4 bytes for each buffer */
+        snow3g_keystream_8_4(pCtx, &temp[0]);
+        snow3g_keystream_8_4(pCtx, &temp[1]);
+        snow3g_keystream_8_4(pCtx, &temp[2]);
+        snow3g_keystream_8_4(pCtx, &temp[3]);
+        snow3g_keystream_8_4(pCtx, &temp[4]);
+        snow3g_keystream_8_4(pCtx, &temp[5]);
+        snow3g_keystream_8_4(pCtx, &temp[6]);
+        snow3g_keystream_8_4(pCtx, &temp[7]);
+
+        temp[0] = _mm256_shuffle_epi8(temp[0], mask1);
+        temp[1] = _mm256_shuffle_epi8(temp[1], mask2);
+        temp[2] = _mm256_shuffle_epi8(temp[2], mask3);
+        temp[3] = _mm256_shuffle_epi8(temp[3], mask4);
+        temp[4] = _mm256_shuffle_epi8(temp[4], mask1);
+        temp[5] = _mm256_shuffle_epi8(temp[5], mask2);
+        temp[6] = _mm256_shuffle_epi8(temp[6], mask3);
+        temp[7] = _mm256_shuffle_epi8(temp[7], mask4);
+
+        __m256i blended[8];
+        /* blends KS together: 128bit slice consists
+           of 4 32-bit words for one packet */
+        blended[0] = _mm256_blend_epi32(temp[0], temp[1], 0xaa);
+        blended[1] = _mm256_blend_epi32(temp[0], temp[1], 0x55);
+        blended[2] = _mm256_blend_epi32(temp[2], temp[3], 0xaa);
+        blended[3] = _mm256_blend_epi32(temp[2], temp[3], 0x55);
+        blended[4] = _mm256_blend_epi32(temp[4], temp[5], 0xaa);
+        blended[5] = _mm256_blend_epi32(temp[4], temp[5], 0x55);
+        blended[6] = _mm256_blend_epi32(temp[6], temp[7], 0xaa);
+        blended[7] = _mm256_blend_epi32(temp[6], temp[7], 0x55);
+
+        temp[0] = _mm256_blend_epi32(blended[0], blended[2], 0xcc);
+        temp[1] = _mm256_blend_epi32(blended[1], blended[3], 0x99);
+        temp[2] = _mm256_blend_epi32(blended[0], blended[2], 0x33);
+        temp[3] = _mm256_blend_epi32(blended[1], blended[3], 0x66);
+        temp[4] = _mm256_blend_epi32(blended[4], blended[6], 0xcc);
+        temp[5] = _mm256_blend_epi32(blended[5], blended[7], 0x99);
+        temp[6] = _mm256_blend_epi32(blended[4], blended[6], 0x33);
+        temp[7] = _mm256_blend_epi32(blended[5], blended[7], 0x66);
+
+        /** sorts 32 bit words back into order */
+        blended[0] = temp[0];
+        blended[1] = _mm256_shuffle_epi32(temp[1], 0x39);
+        blended[2] = _mm256_shuffle_epi32(temp[2], 0x4e);
+        blended[3] = _mm256_shuffle_epi32(temp[3], 0x93);
+        blended[4] = temp[4];
+        blended[5] = _mm256_shuffle_epi32(temp[5], 0x39);
+        blended[6] = _mm256_shuffle_epi32(temp[6], 0x4e);
+        blended[7] = _mm256_shuffle_epi32(temp[7], 0x93);
+        /** low 128-bit lanes go to pKeyStream[i], high lanes to [i + 4] */
+        for (i = 0; i < 4; i++) {
+                pKeyStream[i] = _mm256_permute2x128_si256(blended[i],
+                                                          blended[i + 4], 0x20);
+                pKeyStream[i + 4] = _mm256_permute2x128_si256(
+                        blended[i], blended[i + 4], 0x31);
+        }
+}
+
+#endif /* AVX2 */
+
+/**
+*******************************************************************************
+* @description
+* Produces one 32-bit keystream word (4 bytes) for each of 4 buffers
+*
+* @param[in/out]        pCtx         Context where the scheduled keys are stored
+* @param[out]           pKeyStream   Receives the 4 generated keystream words
+*
+*******************************************************************************/
+static inline void snow3g_keystream_4_4(snow3gKeyState4_t *pCtx,
+                                        __m128i *pKeyStream)
+{
+        __m128i fsmOut;
+
+        ClockFSM_4(pCtx, &fsmOut);
+        *pKeyStream = _mm_xor_si128(pCtx->LFSR_X[pCtx->iLFSR_X], fsmOut);
+        ClockLFSR_4(pCtx);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function generates 8 bytes of keystream 4 buffers at a time
+*
+* @param[in/out]        pCtx         Context where the scheduled keys are stored
+* @param[out]           pKeyStreamLo Pointer to lower end of generated keystream
+* @param[out]           pKeyStreamHi Pointer to higher end of generated keystream
+*
+*******************************************************************************/
+static inline void snow3g_keystream_4_8(snow3gKeyState4_t *pCtx,
+                                        __m128i *pKeyStreamLo,
+                                        __m128i *pKeyStreamHi)
+{
+        __m128i first, second;
+
+        /* clock once: FSM output ^ LFSR head gives the first 4 bytes */
+        ClockFSM_4(pCtx, &first);
+        first = _mm_xor_si128(first, pCtx->LFSR_X[pCtx->iLFSR_X]);
+        ClockLFSR_4(pCtx);
+
+        /* clock again for the following 4 bytes */
+        ClockFSM_4(pCtx, &second);
+        second = _mm_xor_si128(second, pCtx->LFSR_X[pCtx->iLFSR_X]);
+        ClockLFSR_4(pCtx);
+
+        /* interleave: each 64-bit element is (first << 32) | second */
+        *pKeyStreamLo = _mm_unpacklo_epi32(second, first);
+        *pKeyStreamHi = _mm_unpackhi_epi32(second, first);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function initializes the key schedule for 4 buffers for snow3g f8/f9.
+*
+*       @param [out]     pCtx        Context where the scheduled keys are stored
+*       @param [in]      pKeySched   Key schedule, shared by all 4 buffers
+*       @param [in]      pIV1        IV for buffer 1 (16 bytes are read)
+*       @param [in]      pIV2        IV for buffer 2 (16 bytes are read)
+*       @param [in]      pIV3        IV for buffer 3 (16 bytes are read)
+*       @param [in]      pIV4        IV for buffer 4 (16 bytes are read)
+*
+*******************************************************************************/
+static inline void
+snow3gStateInitialize_4(snow3gKeyState4_t *pCtx,
+                        const snow3g_key_schedule_t *pKeySched,
+                        const void *pIV1, const void *pIV2,
+                        const void *pIV3, const void *pIV4)
+{
+        uint32_t K, L;
+        int i;
+        __m128i R, S, T, U;
+        __m128i V0, V1, T0, T1;
+
+        /* Initialize the LFSR table from constants, Keys, and IV */
+
+        /* Per-dword byte swap selector; sm is only 8-byte aligned -> loadu */
+        uint64_t sm[2] = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL};
+        const __m128i swapMask = _mm_loadu_si128((const __m128i *)sm);
+        /* Load complete 128b IV into register (SSE2) */
+        R = _mm_loadu_si128((const __m128i *)pIV1);
+        S = _mm_loadu_si128((const __m128i *)pIV2);
+        T = _mm_loadu_si128((const __m128i *)pIV3);
+        U = _mm_loadu_si128((const __m128i *)pIV4);
+
+        /* initialize the array block (SSE4): key word i and its complement */
+        for (i = 0; i < 4; i++) {
+                K = pKeySched->k[i];
+                L = ~K;
+                V0 = _mm_set1_epi32(K);
+                V1 = _mm_set1_epi32(L);
+                pCtx->LFSR_X[i + 4] = V0;
+                pCtx->LFSR_X[i + 12] = V0;
+                pCtx->LFSR_X[i + 0] = V1;
+                pCtx->LFSR_X[i + 8] = V1;
+        }
+        /* Update the schedule structure with IVs */
+        /* Store the 4 IVs in LFSR by a column/row matrix swap
+         * after endianness correction */
+
+        /* endianness swap (SSSE3) */
+        R = _mm_shuffle_epi8(R, swapMask);
+        S = _mm_shuffle_epi8(S, swapMask);
+        T = _mm_shuffle_epi8(T, swapMask);
+        U = _mm_shuffle_epi8(U, swapMask);
+
+        /* row/column dword inversion (SSE2) */
+        T0 = _mm_unpacklo_epi32(R, S);
+        R = _mm_unpackhi_epi32(R, S);
+        T1 = _mm_unpacklo_epi32(T, U);
+        T = _mm_unpackhi_epi32(T, U);
+
+        /* row/column qword inversion (SSE2): R..U = IV dword 0..3 */
+        U = _mm_unpackhi_epi64(R, T);
+        T = _mm_unpacklo_epi64(R, T);
+        S = _mm_unpackhi_epi64(T0, T1);
+        R = _mm_unpacklo_epi64(T0, T1);
+
+        /*IV ^ LFSR (SSE2) */
+        pCtx->LFSR_X[15] = _mm_xor_si128(pCtx->LFSR_X[15], U);
+        pCtx->LFSR_X[12] = _mm_xor_si128(pCtx->LFSR_X[12], T);
+        pCtx->LFSR_X[10] = _mm_xor_si128(pCtx->LFSR_X[10], S);
+        pCtx->LFSR_X[9] = _mm_xor_si128(pCtx->LFSR_X[9], R);
+        pCtx->iLFSR_X = 0;
+        /* FSM initialization (SSE2) */
+        S = _mm_setzero_si128();
+        for (i = 0; i < 3; i++)
+                pCtx->FSM_X[i] = S;
+
+        /* Initialisation rounds: FSM output is fed back into the new word */
+        for (i = 0; i < 32; i++) {
+                ClockFSM_4(pCtx, &S);
+                ClockLFSR_4(pCtx);
+                pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm_xor_si128(
+                        pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], S);
+        }
+}
+
+#ifdef AVX2
+/**
+*******************************************************************************
+* @description
+* This function initializes the key schedule for 8 buffers with
+* individual keys, for snow3g f8/f9.
+*
+*       @param [out]     pCtx            Context where scheduled keys are stored
+*       @param [in]      KeySched        Array of 8 key schedule pointers;
+*                                        KeySched[j] is used for buffer j
+*       @param [in]      pIV             Array of 8 IV pointers; 16 bytes
+*                                        are read from pIV[j] for buffer j
+*
+*       Buffer j occupies 32-bit element j of every LFSR_X vector:
+*       the key words are written through k[j]/l[j], and each IV pair
+*       (pIV[n], pIV[n + 4]) is loaded into the low/high 128-bit halves
+*       before the transpose.
+*
+*******************************************************************************/
+static inline void
+snow3gStateInitialize_8_multiKey(snow3gKeyState8_t *pCtx,
+                                 const snow3g_key_schedule_t * const KeySched[],
+                                 const void * const pIV[])
+{
+        DECLARE_ALIGNED(uint32_t k[8], 32);
+        DECLARE_ALIGNED(uint32_t l[8], 32);
+        __m256i *K = (__m256i *)k;
+        __m256i *L = (__m256i *)l;
+
+        int i, j;
+        __m256i mR, mS, mT, mU, T0, T1;
+
+        /* Initialize the LFSR table from constants, Keys, and IV */
+
+        /* Load the IVs pairwise: pIV[n] low half, pIV[n + 4] high half */
+        __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
+                            0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL};
+        mR = _mm256_loadu2_m128i((const __m128i *)pIV[4],
+                                 (const __m128i *)pIV[0]);
+        mS = _mm256_loadu2_m128i((const __m128i *)pIV[5],
+                                 (const __m128i *)pIV[1]);
+        mT = _mm256_loadu2_m128i((const __m128i *)pIV[6],
+                                 (const __m128i *)pIV[2]);
+        mU = _mm256_loadu2_m128i((const __m128i *)pIV[7],
+                                 (const __m128i *)pIV[3]);
+
+        /* initialize the array block: key word i of each buffer + complement */
+        for (i = 0; i < 4; i++) {
+                for (j = 0; j < 8; j++) {
+                        k[j] = KeySched[j]->k[i];
+                        l[j] = ~k[j];
+                }
+
+                pCtx->LFSR_X[i + 4] = *K;
+                pCtx->LFSR_X[i + 12] = *K;
+                pCtx->LFSR_X[i + 0] = *L;
+                pCtx->LFSR_X[i + 8] = *L;
+        }
+
+        /* Update the schedule structure with IVs */
+        /* Store the 8 IVs in LFSR by a column/row matrix swap
+         * after endianness correction */
+
+        /* endianness swap: reverse the bytes of every dword */
+        mR = _mm256_shuffle_epi8(mR, swapMask);
+        mS = _mm256_shuffle_epi8(mS, swapMask);
+        mT = _mm256_shuffle_epi8(mT, swapMask);
+        mU = _mm256_shuffle_epi8(mU, swapMask);
+
+        /* row/column dword inversion, per 128-bit half */
+        T0 = _mm256_unpacklo_epi32(mR, mS);
+        mR = _mm256_unpackhi_epi32(mR, mS);
+        T1 = _mm256_unpacklo_epi32(mT, mU);
+        mT = _mm256_unpackhi_epi32(mT, mU);
+
+        /* row/column qword inversion: mR..mU = IV dword 0..3 */
+        mU = _mm256_unpackhi_epi64(mR, mT);
+        mT = _mm256_unpacklo_epi64(mR, mT);
+        mS = _mm256_unpackhi_epi64(T0, T1);
+        mR = _mm256_unpacklo_epi64(T0, T1);
+
+        /* IV ^ LFSR */
+        pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU);
+        pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT);
+        pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS);
+        pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR);
+        pCtx->iLFSR_X = 0;
+        /* FSM initialization: all three FSM words start at zero */
+        mS = _mm256_setzero_si256();
+        for (i = 0; i < 3; i++)
+                pCtx->FSM_X[i] = mS;
+
+        /* Initialisation rounds: FSM output is fed back into the new word */
+        for (i = 0; i < 32; i++) {
+                ClockFSM_8(pCtx, &mS);
+                ClockLFSR_8(pCtx);
+                pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256(
+                        pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS);
+        }
+}
+
+/**
+*******************************************************************************
+* @description
+* This function initializes the snow3g f8/f9 state (LFSR and FSM) of
+* 8 buffers processed in parallel. All 8 buffers use the same key schedule;
+* each buffer has its own IV. Buffers 1-4 occupy the low 128 bits of every
+* 256-bit state word and buffers 5-8 the high 128 bits.
+*
+*       @param [out]    pCtx         Context where the initialized state is
+*                                    stored
+*       @param [in]     pKeySched    Key schedule (shared by all 8 buffers)
+*       @param [in]     pIV1         IV for buffer 1 (16 bytes are read)
+*       @param [in]     pIV2         IV for buffer 2 (16 bytes are read)
+*       @param [in]     pIV3         IV for buffer 3 (16 bytes are read)
+*       @param [in]     pIV4         IV for buffer 4 (16 bytes are read)
+*       @param [in]     pIV5         IV for buffer 5 (16 bytes are read)
+*       @param [in]     pIV6         IV for buffer 6 (16 bytes are read)
+*       @param [in]     pIV7         IV for buffer 7 (16 bytes are read)
+*       @param [in]     pIV8         IV for buffer 8 (16 bytes are read)
+*
+*******************************************************************************/
+static inline void
+snow3gStateInitialize_8(snow3gKeyState8_t *pCtx,
+                        const snow3g_key_schedule_t *pKeySched,
+                        const void *pIV1, const void *pIV2,
+                        const void *pIV3, const void *pIV4,
+                        const void *pIV5, const void *pIV6,
+                        const void *pIV7, const void *pIV8)
+{
+        uint32_t K, L;
+        int i;
+        __m256i mR, mS, mT, mU, V0, V1, T0, T1;
+
+        /* Initialize the LFSR table from constants, Keys, and IV */
+
+        /* Pair the 128-bit IVs into 256-bit registers with unaligned loads:
+         * IV1..IV4 land in the low halves of mR, mS, mT, mU and IV5..IV8 in
+         * the matching high halves. swapMask reverses the byte order inside
+         * every 32-bit word (same pattern for both 128-bit halves). */
+        __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
+                            0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL};
+        mR = _mm256_loadu2_m128i((const __m128i *)pIV5, (const __m128i *)pIV1);
+        mS = _mm256_loadu2_m128i((const __m128i *)pIV6, (const __m128i *)pIV2);
+        mT = _mm256_loadu2_m128i((const __m128i *)pIV7, (const __m128i *)pIV3);
+        mU = _mm256_loadu2_m128i((const __m128i *)pIV8, (const __m128i *)pIV4);
+
+        /* Fill the LFSR: key word k[i] is broadcast to all 8 lanes of
+         * LFSR_X[i + 4] and LFSR_X[i + 12], its bitwise complement to all
+         * lanes of LFSR_X[i] and LFSR_X[i + 8] */
+        for (i = 0; i < 4; i++) {
+                K = pKeySched->k[i];
+                L = ~K;
+                V0 = _mm256_set1_epi32(K);
+                V1 = _mm256_set1_epi32(L);
+                pCtx->LFSR_X[i + 4] = V0;
+                pCtx->LFSR_X[i + 12] = V0;
+                pCtx->LFSR_X[i + 0] = V1;
+                pCtx->LFSR_X[i + 8] = V1;
+        }
+
+        /* Update the schedule structure with IVs */
+        /* Store the 8 IVs in LFSR by a column/row matrix swap
+         * (done independently in each 128-bit half)
+         * after endianness correction */
+
+        /* endianness swap of every 32-bit IV word */
+        mR = _mm256_shuffle_epi8(mR, swapMask);
+        mS = _mm256_shuffle_epi8(mS, swapMask);
+        mT = _mm256_shuffle_epi8(mT, swapMask);
+        mU = _mm256_shuffle_epi8(mU, swapMask);
+
+        /* row/column dword inversion: interleave the 32-bit words of
+         * the IV pairs (mR with mS, mT with mU) */
+        T0 = _mm256_unpacklo_epi32(mR, mS);
+        mR = _mm256_unpackhi_epi32(mR, mS);
+        T1 = _mm256_unpacklo_epi32(mT, mU);
+        mT = _mm256_unpackhi_epi32(mT, mU);
+
+        /* row/column qword inversion: completes the 4x4 transpose, so
+         * that mR, mS, mT and mU hold 32-bit word 0, 1, 2 and 3
+         * (respectively) of every IV, one IV per lane */
+        mU = _mm256_unpackhi_epi64(mR, mT);
+        mT = _mm256_unpacklo_epi64(mR, mT);
+        mS = _mm256_unpackhi_epi64(T0, T1);
+        mR = _mm256_unpacklo_epi64(T0, T1);
+
+        /* IV ^ LFSR: IV words 3, 2, 1, 0 are folded into LFSR entries
+         * 15, 12, 10 and 9; the circular LFSR index restarts at 0 */
+        pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU);
+        pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT);
+        pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS);
+        pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR);
+        pCtx->iLFSR_X = 0;
+        /* FSM initialization: the three FSM registers start at zero */
+        mS = _mm256_setzero_si256();
+        for (i = 0; i < 3; i++)
+                pCtx->FSM_X[i] = mS;
+
+        /* Initialisation rounds: 32 times, clock the FSM (its output is
+         * returned in mS), clock the LFSR, then XOR the FSM output into
+         * the LFSR entry at offset 15 from the current LFSR index */
+        for (i = 0; i < 32; i++) {
+                ClockFSM_8(pCtx, &mS);
+                ClockLFSR_8(pCtx);
+                pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256(
+                        pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS);
+        }
+}
+#endif /* AVX2 */
+
+/**
+*******************************************************************************
+* @description
+* Helper for the bit-level f8: prepares the keystream and the staged input
+* of a block whose last byte is only partially covered by the message, so
+* that the output bits located after the message end keep their value.
+*
+* Only the top bit_len bits of the keystream are kept. In the out-of-place
+* case (pcBufferIn != pcBufferOut) the staged input is additionally cleared
+* past bit_len and the current output bits past bit_len are merged into the
+* keystream, so XORing input and keystream reproduces them unchanged.
+*
+* @param[in,out] KS            Keystream block to adjust
+* @param[in]     pcBufferOut   Current position in the output buffer
+* @param[in]     pcBufferIn    Current position in the input buffer
+* @param[out]    safeOutBuf    Scratch buffer receiving byte_len output bytes
+*                              (out-of-place case only)
+* @param[in,out] safeInBuf     Staged copy of the input block
+* @param[in]     bit_len       Number of leading bits to cipher; callers in
+*                              this file pass a value that is not a multiple
+*                              of 8 and below SNOW3G_BLOCK_SIZE * 8
+* @param[in]     byte_len      Number of bytes spanned by bit_len
+*
+*******************************************************************************/
+static inline void
+preserve_bits(uint64_t *KS,
+              const uint8_t *pcBufferOut, const uint8_t *pcBufferIn,
+              SafeBuf *safeOutBuf, SafeBuf *safeInBuf,
+              const uint8_t bit_len, const uint8_t byte_len)
+{
+        /* Ones on the bits that belong to the message, keystream order */
+        const uint64_t keepMask =
+                UINT64_MAX << (SNOW3G_BLOCK_SIZE * 8 - bit_len);
+        uint64_t keepMaskSwapped;
+
+        /* Drop the keystream bits located after the end of the message */
+        *KS &= keepMask;
+
+        /* In-place: a zero keystream bit already leaves the data as is */
+        if (pcBufferIn == pcBufferOut)
+                return;
+
+        /* Same mask, in the byte order of the staged buffers */
+        keepMaskSwapped = BSWAP64(keepMask);
+
+        /* Zero the input bits that are not part of the message ... */
+        safeInBuf->b64 &= keepMaskSwapped;
+
+        /*
+         * ... and carry the matching output bits through the keystream:
+         * once XOR'd with the zeroed input bits they come out unchanged
+         */
+        memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len);
+        *KS |= BSWAP64(safeOutBuf->b64 & ~keepMaskSwapped);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function is the core snow3g bit algorithm
+* for the 3GPP confidentiality algorithm.
+* The message starts offsetInBits bits into the buffers and is lengthInBits
+* bits long (the offset is not counted in the length). Output bits located
+* before the offset in the first byte, and after the message end in the
+* last byte, are left unchanged.
+*
+* @param[in,out] pCtx           Context where the scheduled keys are stored;
+*                               it is advanced as keystream is produced
+* @param[in]     pIn            Input buffer
+* @param[out]    pOut           Output buffer (may be the same as pIn)
+* @param[in]     lengthInBits   length in bits of the data to be encrypted
+* @param[in]     offsetInBits   offset in bits, in both buffers, where the
+*                               data starts
+*
+*******************************************************************************/
+static inline void f8_snow3g_bit(snow3gKeyState1_t *pCtx,
+                                 const void *pIn,
+                                 void *pOut,
+                                 const uint32_t lengthInBits,
+                                 const uint32_t offsetInBits)
+{
+        const uint8_t *pBufferIn = pIn;
+        uint8_t *pBufferOut = pOut;
+        uint32_t cipherLengthInBits = lengthInBits;
+        uint64_t shiftrem = 0;
+        uint64_t KS8, KS8bit; /* 8 bytes of keystream */
+        const uint8_t *pcBufferIn = pBufferIn + (offsetInBits / 8);
+        uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8);
+        /* Offset into the first byte (0 - 7 bits) */
+        uint32_t remainOffset = offsetInBits % 8;
+        SafeBuf safeInBuf = {0};
+        SafeBuf safeOutBuf = {0};
+
+        /* Now run the block cipher */
+
+        /* Start with potential partial block (due to offset and length) */
+        snow3g_keystream_1_8(pCtx, &KS8);
+        /* Shifting leaves zero keystream bits over the offset bits */
+        KS8bit = KS8 >> remainOffset;
+        /* Only one block to encrypt */
+        if (cipherLengthInBits < (64 - remainOffset)) {
+                /* Offset + length, always below 64 in this branch */
+                const uint8_t bitlen_with_off = remainOffset +
+                        cipherLengthInBits;
+                /*
+                 * Bytes spanned by the message: the offset has to be
+                 * counted, as it can push the last bits into one more
+                 * byte than the length alone would suggest
+                 */
+                const uint32_t byteLength = (bitlen_with_off + 7) / 8;
+
+                memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength);
+                /*
+                 * If operation is Out-of-place and there is offset
+                 * to be applied, "remainOffset" bits from the output buffer
+                 * need to be preserved (only applicable to first byte,
+                 * since remainOffset is up to 7 bits)
+                 */
+                if ((pIn != pOut) && remainOffset) {
+                        const uint8_t mask8 = (uint8_t)
+                                (1 << (8 - remainOffset)) - 1;
+
+                        safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+                                (pcBufferOut[0] & ~mask8);
+                }
+                /* If last byte is a partial byte, the last bits of the output
+                 * need to be preserved */
+                if ((bitlen_with_off & 0x7) != 0)
+                        preserve_bits(&KS8bit, pcBufferOut, pcBufferIn,
+                                      &safeOutBuf, &safeInBuf,
+                                      bitlen_with_off, byteLength);
+
+                xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit);
+                memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength);
+#ifdef SAFE_DATA
+                /* Wipe keystream and staged data on this exit path too */
+                CLEAR_VAR(&KS8, sizeof(KS8));
+                CLEAR_VAR(&KS8bit, sizeof(KS8bit));
+                CLEAR_MEM(&safeInBuf, sizeof(safeInBuf));
+                CLEAR_MEM(&safeOutBuf, sizeof(safeOutBuf));
+#endif
+                return;
+        }
+        /*
+         * If operation is Out-of-place and there is offset
+         * to be applied, "remainOffset" bits from the output buffer
+         * need to be preserved (only applicable to first byte,
+         * since remainOffset is up to 7 bits)
+         */
+        if ((pIn != pOut) && remainOffset) {
+                const uint8_t mask8 = (uint8_t)(1 << (8 - remainOffset)) - 1;
+
+                memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8);
+                safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+                        (pcBufferOut[0] & ~mask8);
+                xor_keystrm_rev(pcBufferOut, safeInBuf.b8, KS8bit);
+                pcBufferIn += SNOW3G_BLOCK_SIZE;
+        } else {
+                /* At least 64 bits to produce (including offset) */
+                pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, KS8bit);
+        }
+
+        /* Keystream bits shifted out above are carried into the next block */
+        if (remainOffset != 0)
+                shiftrem = KS8 << (64 - remainOffset);
+        cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8 - remainOffset;
+        pcBufferOut += SNOW3G_BLOCK_SIZE;
+
+        while (cipherLengthInBits) {
+                /* produce the next block of keystream */
+                snow3g_keystream_1_8(pCtx, &KS8);
+                KS8bit = (KS8 >> remainOffset) | shiftrem;
+                if (remainOffset != 0)
+                        shiftrem = KS8 << (64 - remainOffset);
+                if (cipherLengthInBits >= SNOW3G_BLOCK_SIZE * 8) {
+                        pcBufferIn = xor_keystrm_rev(pcBufferOut,
+                                                     pcBufferIn, KS8bit);
+                        cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8;
+                        pcBufferOut += SNOW3G_BLOCK_SIZE;
+                        /* loop variant */
+                } else {
+                        /*
+                         * end of the loop, handle the last bytes
+                         * (this block starts on a byte boundary, so the
+                         * remaining length alone gives the byte count)
+                         */
+                        const uint32_t byteLength =
+                                (cipherLengthInBits + 7) / 8;
+
+                        memcpy_keystrm(safeInBuf.b8, pcBufferIn,
+                                       byteLength);
+
+                        /* If last byte is a partial byte, the last bits
+                         * of the output need to be preserved */
+                        if ((cipherLengthInBits & 0x7) != 0)
+                                preserve_bits(&KS8bit, pcBufferOut, pcBufferIn,
+                                              &safeOutBuf, &safeInBuf,
+                                              cipherLengthInBits, byteLength);
+
+                        xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit);
+                        memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength);
+                        cipherLengthInBits = 0;
+                }
+        }
+#ifdef SAFE_DATA
+        CLEAR_VAR(&KS8, sizeof(KS8));
+        CLEAR_VAR(&KS8bit, sizeof(KS8bit));
+        CLEAR_MEM(&safeInBuf, sizeof(safeInBuf));
+        CLEAR_MEM(&safeOutBuf, sizeof(safeOutBuf));
+#endif
+}
+
+/**
+*******************************************************************************
+* @description
+* This function is the core byte-oriented snow3g f8 routine: it XORs
+* lengthInBytes bytes of input with keystream produced from pCtx.
+* Whole 8-byte blocks are processed directly; a trailing 1 to 7 bytes are
+* staged through zero-padded temporary buffers so that no byte outside the
+* requested length is read from the input or written to the output.
+*
+* @param[in,out]   pCtx            Context where the scheduled keys are stored;
+*                                  it is advanced as keystream is produced
+* @param[in]       pIn             Input buffer
+* @param[out]      pOut            Output buffer
+* @param[in]       lengthInBytes   length in bytes of the data to be encrypted
+*
+*******************************************************************************/
+static inline void f8_snow3g(snow3gKeyState1_t *pCtx,
+                             const void *pIn,
+                             void *pOut,
+                             const uint32_t lengthInBytes)
+{
+        const uint8_t *src = pIn;
+        uint8_t *dst = pOut;
+        /* bytes left once all whole 8-byte blocks are done (0 to 7) */
+        const uint32_t tail = lengthInBytes & 7;
+        uint32_t blocks;
+        uint32_t KS4; /* 4 bytes of keystream */
+        uint64_t KS8; /* 8 bytes of keystream */
+
+        /* whole blocks: one 8-byte keystream word per block */
+        for (blocks = lengthInBytes / SNOW3G_8_BYTES; blocks != 0; blocks--) {
+                snow3g_keystream_1_8(pCtx, &KS8);
+
+                /* the return value is the advanced input position */
+                src = xor_keystrm_rev(dst, src, KS8);
+                dst += SNOW3G_8_BYTES;
+        }
+
+        if (tail > 4) {
+                /* 5 to 7 bytes left: use a full 8-byte keystream word */
+                uint8_t padded[8];
+                uint8_t result[8];
+
+                memset(padded, 0, sizeof(padded));
+                snow3g_keystream_1_8(pCtx, &KS8);
+                memcpy_keystrm(padded, src, tail);
+                xor_keystrm_rev(result, padded, KS8);
+                memcpy_keystrm(dst, result, tail);
+#ifdef SAFE_DATA
+                CLEAR_MEM(&padded, sizeof(padded));
+                CLEAR_MEM(&result, sizeof(result));
+#endif
+        } else if (tail == 4) {
+                /* exactly one 32-bit word left, no staging required */
+                snow3g_keystream_1_4(pCtx, &KS4);
+                xor_keystream_reverse_32(dst, src, KS4);
+        } else if (tail != 0) {
+                /* 1 to 3 bytes left: use a 4-byte keystream word */
+                uint8_t padded[4];
+                uint8_t result[4];
+
+                memset(padded, 0, sizeof(padded));
+                snow3g_keystream_1_4(pCtx, &KS4);
+                memcpy_keystream_32(padded, src, tail);
+                xor_keystream_reverse_32(result, padded, KS4);
+                memcpy_keystream_32(dst, result, tail);
+#ifdef SAFE_DATA
+                CLEAR_MEM(&padded, sizeof(padded));
+                CLEAR_MEM(&result, sizeof(result));
+#endif
+        }
+
+#ifdef SAFE_DATA
+        CLEAR_VAR(&KS4, sizeof(KS4));
+        CLEAR_VAR(&KS8, sizeof(KS8));
+#endif
+}
+
+#ifdef AVX2
+/**
+*******************************************************************************
+* @description
+* Helper returning one 32-bit element of a 256-bit vector. The extract
+* intrinsic is given a literal element index in every case, hence the
+* switch on the run-time lane number.
+*
+* @param[in]    vec     Source vector
+* @param[in]    lane    Element to read (0 to 7)
+*
+* @return the selected element, or 0 when lane is above 7
+*
+*******************************************************************************/
+static inline uint32_t
+snow3gExtractDword_8(const __m256i vec, const uint32_t lane)
+{
+        switch (lane) {
+        case 0:
+                return (uint32_t) _mm256_extract_epi32(vec, 0);
+        case 1:
+                return (uint32_t) _mm256_extract_epi32(vec, 1);
+        case 2:
+                return (uint32_t) _mm256_extract_epi32(vec, 2);
+        case 3:
+                return (uint32_t) _mm256_extract_epi32(vec, 3);
+        case 4:
+                return (uint32_t) _mm256_extract_epi32(vec, 4);
+        case 5:
+                return (uint32_t) _mm256_extract_epi32(vec, 5);
+        case 6:
+                return (uint32_t) _mm256_extract_epi32(vec, 6);
+        case 7:
+                return (uint32_t) _mm256_extract_epi32(vec, 7);
+        default:
+                return 0;
+        }
+}
+
+/**
+*******************************************************************************
+* @description
+* This function copies the state of one buffer out of an 8 buffer state
+* structure into a 1 buffer state structure.
+* The LFSR is read starting at the current circular index of the source,
+* so the destination LFSR starts at position 0.
+*
+* @param[in]    pSrcState               Pointer to the source state
+* @param[out]   pDstState               Pointer to the destination state
+* @param[in]    NumBuffers              Index (0 to 7) of the buffer to
+*                                       extract; for a larger value all
+*                                       destination fields are set to 0
+*
+*******************************************************************************/
+static inline void snow3gStateConvert_8(snow3gKeyState8_t *pSrcState,
+                                        snow3gKeyState1_t *pDstState,
+                                        uint32_t NumBuffers)
+{
+        const uint32_t head = pSrcState->iLFSR_X;
+        uint32_t pos;
+
+        /* LFSR: unroll the circular buffer while picking the lane */
+        for (pos = 0; pos < 16; pos++)
+                pDstState->LFSR_S[pos] = snow3gExtractDword_8(
+                        pSrcState->LFSR_X[(pos + head) % 16], NumBuffers);
+
+        /* FSM registers R1, R2 and R3 */
+        pDstState->FSM_R1 =
+                snow3gExtractDword_8(pSrcState->FSM_X[0], NumBuffers);
+        pDstState->FSM_R2 =
+                snow3gExtractDword_8(pSrcState->FSM_X[1], NumBuffers);
+        pDstState->FSM_R3 =
+                snow3gExtractDword_8(pSrcState->FSM_X[2], NumBuffers);
+}
+#endif /* AVX2 */
+
+/**
+*******************************************************************************
+* @description
+* Helper returning one 32-bit element of a 128-bit vector. The extract
+* intrinsic is given a literal element index in every case, hence the
+* switch on the run-time lane number.
+*
+* @param[in]    vec     Source vector
+* @param[in]    lane    Element to read (0 to 3)
+*
+* @return the selected element, or 0 when lane is above 3
+*
+*******************************************************************************/
+static inline uint32_t
+snow3gExtractDword_4(const __m128i vec, const uint32_t lane)
+{
+        switch (lane) {
+        case 0:
+                return (uint32_t) _mm_extract_epi32(vec, 0);
+        case 1:
+                return (uint32_t) _mm_extract_epi32(vec, 1);
+        case 2:
+                return (uint32_t) _mm_extract_epi32(vec, 2);
+        case 3:
+                return (uint32_t) _mm_extract_epi32(vec, 3);
+        default:
+                return 0;
+        }
+}
+
+/**
+*******************************************************************************
+* @description
+* This function copies the state of one buffer out of a 4 buffer state
+* structure into a 1 buffer state structure.
+* The LFSR is read starting at the current circular index of the source,
+* so the destination LFSR starts at position 0.
+*
+* @param[in]    pSrcState               Pointer to the source state
+* @param[out]   pDstState               Pointer to the destination state
+* @param[in]    NumBuffers              Index (0 to 3) of the buffer to
+*                                       extract; for a larger value all
+*                                       destination fields are set to 0
+*
+*******************************************************************************/
+static inline void snow3gStateConvert_4(snow3gKeyState4_t *pSrcState,
+                                        snow3gKeyState1_t *pDstState,
+                                        uint32_t NumBuffers)
+{
+        const uint32_t head = pSrcState->iLFSR_X;
+        uint32_t pos;
+
+        /* LFSR: unroll the circular buffer while picking the lane */
+        for (pos = 0; pos < 16; pos++)
+                pDstState->LFSR_S[pos] = snow3gExtractDword_4(
+                        pSrcState->LFSR_X[(pos + head) % 16], NumBuffers);
+
+        /* FSM registers R1, R2 and R3 */
+        pDstState->FSM_R1 =
+                snow3gExtractDword_4(pSrcState->FSM_X[0], NumBuffers);
+        pDstState->FSM_R2 =
+                snow3gExtractDword_4(pSrcState->FSM_X[1], NumBuffers);
+        pDstState->FSM_R3 =
+                snow3gExtractDword_4(pSrcState->FSM_X[2], NumBuffers);
+}
+
+/*---------------------------------------------------------
+ * f8()
+ * Initializations and Context size definitions
+ *---------------------------------------------------------*/
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G key schedule size:
+ *      Returns the size, in bytes, of snow3g_key_schedule_t
+ *---------------------------------------------------------*/
+size_t SNOW3G_KEY_SCHED_SIZE(void)
+{
+        return sizeof(snow3g_key_schedule_t);
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G key schedule initialization:
+ *      Reads 16 bytes of key as four 32-bit words, byte swaps
+ *      each word and stores them in reverse word order
+ *      (first key word in k[3], last key word in k[0]).
+ *      pKey may have any alignment.
+ *      Returns 0 on success, -1 if a pointer is NULL
+ *      (checked in SAFE_PARAM builds only).
+ *---------------------------------------------------------*/
+int SNOW3G_INIT_KEY_SCHED(const void *pKey, snow3g_key_schedule_t *pCtx)
+{
+#ifdef SAFE_PARAM
+        if ((pKey == NULL) || (pCtx == NULL))
+                return -1;
+#endif
+
+        const uint8_t *pKey8 = pKey;
+        uint32_t i;
+
+        for (i = 0; i < 4; i++) {
+                uint32_t word;
+
+                /*
+                 * Copy the word out instead of dereferencing a casted
+                 * uint32_t pointer: the key is handed over as untyped
+                 * bytes and is not guaranteed to be 4-byte aligned
+                 */
+                memcpy(&word, pKey8 + i * sizeof(word), sizeof(word));
+                pCtx->k[3 - i] = BSWAP32(word);
+        }
+
+        return 0;
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 1 buffer:
+ *      Single buffer enc/dec with IV and precomputed key schedule.
+ *      In SAFE_PARAM builds the call returns without doing
+ *      anything on a NULL pointer, a zero length or a length
+ *      above SNOW3G_MAX_BYTELEN.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_1_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV,
+                        const void *pBufferIn,
+                        void  *pBufferOut,
+                        const uint32_t lengthInBytes)
+{
+#ifdef SAFE_PARAM
+        /* every pointer is mandatory */
+        if (pHandle == NULL || pIV == NULL ||
+            pBufferIn == NULL || pBufferOut == NULL)
+                return;
+        /* the length has to be within 1 .. SNOW3G_MAX_BYTELEN */
+        if (lengthInBytes == 0 || lengthInBytes > SNOW3G_MAX_BYTELEN)
+                return;
+#endif
+        snow3gKeyState1_t state;
+        uint32_t skippedKS; /* first keystream word, not used */
+
+        /* Build the cipher state from the key schedule and the IV */
+        snow3gStateInitialize_1(&state, pHandle, pIV);
+
+        /* The first clock of FSM and LFSR is discarded */
+        snow3g_keystream_1_4(&state, &skippedKS);
+
+        f8_snow3g(&state, pBufferIn, pBufferOut, lengthInBytes);
+
+#ifdef SAFE_DATA
+        CLEAR_VAR(&skippedKS, sizeof(skippedKS));
+        CLEAR_MEM(&state, sizeof(state));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 bit 1 buffer:
+ *      Single buffer enc/dec with IV and precomputed key schedule,
+ *      with the data length and the data start offset both
+ *      expressed in bits.
+ *      In SAFE_PARAM builds the call returns without doing
+ *      anything on a NULL pointer or a zero length.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_1_BUFFER_BIT(const snow3g_key_schedule_t *pHandle,
+                            const void *pIV,
+                            const void *pBufferIn,
+                            void *pBufferOut,
+                            const uint32_t lengthInBits,
+                            const uint32_t offsetInBits)
+{
+#ifdef SAFE_PARAM
+        /* every pointer is mandatory */
+        if (pHandle == NULL || pIV == NULL ||
+            pBufferIn == NULL || pBufferOut == NULL)
+                return;
+        /* nothing to do for an empty message */
+        if (lengthInBits == 0)
+                return;
+#endif
+
+        snow3gKeyState1_t state;
+        uint32_t skippedKS; /* first keystream word, not used */
+
+        /* Build the cipher state from the key schedule and the IV */
+        snow3gStateInitialize_1(&state, pHandle, pIV);
+
+        /* The first clock of FSM and LFSR is discarded */
+        snow3g_keystream_1_4(&state, &skippedKS);
+
+        f8_snow3g_bit(&state, pBufferIn, pBufferOut,
+                      lengthInBits, offsetInBits);
+
+#ifdef SAFE_DATA
+        CLEAR_VAR(&skippedKS, sizeof(skippedKS));
+        CLEAR_MEM(&state, sizeof(state));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 2 buffer:
+ *      Two buffers enc/dec with the same key schedule.
+ *      The 2 IVs are independent and are passed as separate pointers.
+ *      Each buffer and data length are separate.
+ *      The two packets are processed one after the other.
+ *      In SAFE_PARAM builds the call returns without doing
+ *      anything on a NULL pointer, a zero length or a length
+ *      above SNOW3G_MAX_BYTELEN.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_2_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV1,
+                        const void *pIV2,
+                        const void *pBufIn1,
+                        void *pBufOut1,
+                        const uint32_t lenInBytes1,
+                        const void *pBufIn2,
+                        void *pBufOut2,
+                        const uint32_t lenInBytes2)
+{
+#ifdef SAFE_PARAM
+        /* key schedule and IVs */
+        if (pHandle == NULL || pIV1 == NULL || pIV2 == NULL)
+                return;
+        /* data buffers */
+        if (pBufIn1 == NULL || pBufOut1 == NULL ||
+            pBufIn2 == NULL || pBufOut2 == NULL)
+                return;
+        /* both lengths have to be within 1 .. SNOW3G_MAX_BYTELEN */
+        if (lenInBytes1 == 0 || lenInBytes1 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes2 == 0 || lenInBytes2 > SNOW3G_MAX_BYTELEN)
+                return;
+#endif
+
+        snow3gKeyState1_t stateFirst, stateSecond;
+        uint32_t skippedKS; /* first keystream word, not used */
+
+        /* packet 1: build the state, discard the first clock, cipher */
+        snow3gStateInitialize_1(&stateFirst, pHandle, pIV1);
+        snow3g_keystream_1_4(&stateFirst, &skippedKS);
+        f8_snow3g(&stateFirst, pBufIn1, pBufOut1, lenInBytes1);
+
+        /* packet 2: same sequence with its own IV and state */
+        snow3gStateInitialize_1(&stateSecond, pHandle, pIV2);
+        snow3g_keystream_1_4(&stateSecond, &skippedKS);
+        f8_snow3g(&stateSecond, pBufIn2, pBufOut2, lenInBytes2);
+
+#ifdef SAFE_DATA
+        CLEAR_VAR(&skippedKS, sizeof(skippedKS));
+        CLEAR_MEM(&stateFirst, sizeof(stateFirst));
+        CLEAR_MEM(&stateSecond, sizeof(stateSecond));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 4 buffer:
+ *      Four packets enc/dec with the same key schedule.
+ *      The 4 IVs are independent and are passed as separate pointers.
+ *      Each buffer and data length are separate.
+ *      The four packets are processed together, 8 bytes at a
+ *      time, up to the shortest length rounded down to 8 bytes;
+ *      what remains of each packet is then processed on its own.
+ *      In SAFE_PARAM builds the call returns without doing
+ *      anything on a NULL pointer, a zero length or a length
+ *      above SNOW3G_MAX_BYTELEN.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV1,
+                        const void *pIV2,
+                        const void *pIV3,
+                        const void *pIV4,
+                        const void *pBufferIn1,
+                        void *pBufferOut1,
+                        const uint32_t lengthInBytes1,
+                        const void *pBufferIn2,
+                        void *pBufferOut2,
+                        const uint32_t lengthInBytes2,
+                        const void *pBufferIn3,
+                        void *pBufferOut3,
+                        const uint32_t lengthInBytes3,
+                        const void *pBufferIn4,
+                        void *pBufferOut4,
+                        const uint32_t lengthInBytes4)
+{
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) ||
+            (pIV1 == NULL) || (pIV2 == NULL) ||
+            (pIV3 == NULL) || (pIV4 == NULL) ||
+            (pBufferIn1 == NULL) || (pBufferOut1 == NULL) ||
+            (pBufferIn2 == NULL) || (pBufferOut2 == NULL) ||
+            (pBufferIn3 == NULL) || (pBufferOut3 == NULL) ||
+            (pBufferIn4 == NULL) || (pBufferOut4 == NULL) ||
+            (lengthInBytes1 == 0) || (lengthInBytes1 > SNOW3G_MAX_BYTELEN) ||
+            (lengthInBytes2 == 0) || (lengthInBytes2 > SNOW3G_MAX_BYTELEN) ||
+            (lengthInBytes3 == 0) || (lengthInBytes3 > SNOW3G_MAX_BYTELEN) ||
+            (lengthInBytes4 == 0) || (lengthInBytes4 > SNOW3G_MAX_BYTELEN))
+                return;
+#endif
+
+        snow3gKeyState4_t ctx;
+        __m128i H, L; /* keystream: 8 bytes for each of the 4 buffers */
+        uint32_t lenInBytes1 = lengthInBytes1;
+        uint32_t lenInBytes2 = lengthInBytes2;
+        uint32_t lenInBytes3 = lengthInBytes3;
+        uint32_t lenInBytes4 = lengthInBytes4;
+        uint32_t bytes1 =
+                (lenInBytes1 < lenInBytes2 ? lenInBytes1
+                 : lenInBytes2); /* number of bytes */
+        uint32_t bytes2 =
+                (lenInBytes3 < lenInBytes4 ? lenInBytes3
+                 : lenInBytes4);    /* number of bytes */
+        /* min num of bytes */
+        uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2;
+        uint32_t qwords = bytes / SNOW3G_8_BYTES;
+        uint8_t *pBufOut1 = pBufferOut1;
+        uint8_t *pBufOut2 = pBufferOut2;
+        uint8_t *pBufOut3 = pBufferOut3;
+        uint8_t *pBufOut4 = pBufferOut4;
+        const uint8_t *pBufIn1 = pBufferIn1;
+        const uint8_t *pBufIn2 = pBufferIn2;
+        const uint8_t *pBufIn3 = pBufferIn3;
+        const uint8_t *pBufIn4 = pBufferIn4;
+
+        bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_4(&ctx, pHandle, pIV1, pIV2, pIV3, pIV4);
+
+        /* Clock FSM and LFSR once, ignore the keystream */
+        snow3g_keystream_4_4(&ctx, &L);
+
+        /* what is left of each buffer once the common part is done */
+        lenInBytes1 -= bytes;
+        lenInBytes2 -= bytes;
+        lenInBytes3 -= bytes;
+        lenInBytes4 -= bytes;
+
+        /* generates 8 bytes at a time on all streams:
+         * L carries the keystream of buffers 1 and 2,
+         * H the keystream of buffers 3 and 4 */
+        while (qwords--) {
+                snow3g_keystream_4_8(&ctx, &L, &H);
+                pBufIn1 = xor_keystrm_rev(pBufOut1, pBufIn1,
+                                          _mm_extract_epi64(L, 0));
+                pBufIn2 = xor_keystrm_rev(pBufOut2, pBufIn2,
+                                          _mm_extract_epi64(L, 1));
+                pBufIn3 = xor_keystrm_rev(pBufOut3, pBufIn3,
+                                          _mm_extract_epi64(H, 0));
+                pBufIn4 = xor_keystrm_rev(pBufOut4, pBufIn4,
+                                          _mm_extract_epi64(H, 1));
+
+                pBufOut1 += SNOW3G_8_BYTES;
+                pBufOut2 += SNOW3G_8_BYTES;
+                pBufOut3 += SNOW3G_8_BYTES;
+                pBufOut4 += SNOW3G_8_BYTES;
+        }
+
+        /* process the remaining of each buffer
+         *  - extract the LFSR and FSM structures
+         *  - Continue process 1 buffer
+         *  - wipe the extracted single buffer state (SAFE_DATA builds),
+         *    as it holds the same secret state as the 4 buffer context
+         */
+        if (lenInBytes1) {
+                snow3gKeyState1_t ctx1;
+
+                snow3gStateConvert_4(&ctx, &ctx1, 0);
+                f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1);
+#ifdef SAFE_DATA
+                CLEAR_MEM(&ctx1, sizeof(ctx1));
+#endif
+        }
+
+        if (lenInBytes2) {
+                snow3gKeyState1_t ctx2;
+
+                snow3gStateConvert_4(&ctx, &ctx2, 1);
+                f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2);
+#ifdef SAFE_DATA
+                CLEAR_MEM(&ctx2, sizeof(ctx2));
+#endif
+        }
+
+        if (lenInBytes3) {
+                snow3gKeyState1_t ctx3;
+
+                snow3gStateConvert_4(&ctx, &ctx3, 2);
+                f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3);
+#ifdef SAFE_DATA
+                CLEAR_MEM(&ctx3, sizeof(ctx3));
+#endif
+        }
+
+        if (lenInBytes4) {
+                snow3gKeyState1_t ctx4;
+
+                snow3gStateConvert_4(&ctx, &ctx4, 3);
+                f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4);
+#ifdef SAFE_DATA
+                CLEAR_MEM(&ctx4, sizeof(ctx4));
+#endif
+        }
+
+#ifdef SAFE_DATA
+        H = _mm_setzero_si128();
+        L = _mm_setzero_si128();
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+}
+
+#ifdef AVX2
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G 8 buffer ks 8 multi:
+ *      Processes 8 packets, 8 bytes of each per iteration.
+ *      Every buffer has its own key schedule and IV.
+ *
+ *      bytes is rounded down to a multiple of 8; that many
+ *      bytes of each buffer go through the 8-lane loop and
+ *      the rest of each buffer is finished by f8_snow3g()
+ *      using that lane's converted state.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_8_multi(uint32_t bytes,
+                           const snow3g_key_schedule_t * const pKey[],
+                           const void * const IV[],
+                           const void * const pBufferIn[],
+                           void *pBufferOut[], const uint32_t *lengthInBytes)
+{
+        const uint32_t numQwords = bytes / SNOW3G_8_BYTES;
+        /* ksL feeds lanes 0, 1, 4, 5 and ksH feeds lanes 2, 3, 6, 7 */
+        __m256i ksH, ksL;
+        snow3gKeyState8_t state;
+        const uint8_t *src[8];
+        uint8_t *dst[8];
+        uint32_t remaining[8];
+        uint32_t lane, n;
+
+        /* length covered by the 8-lane loop, a multiple of 8 bytes */
+        bytes = numQwords * SNOW3G_8_BYTES;
+
+        /* working copies; remaining[] is what the 8-lane loop leaves over */
+        for (lane = 0; lane < 8; lane++) {
+                src[lane] = pBufferIn[lane];
+                dst[lane] = pBufferOut[lane];
+                remaining[lane] = lengthInBytes[lane] - bytes;
+        }
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_8_multiKey(&state, pKey, IV);
+
+        /* Clock FSM and LFSR once, ignore the keystream */
+        snow3g_keystream_8_4(&state, &ksL);
+
+        /* 8 bytes of keystream per lane on every pass */
+        for (n = 0; n < numQwords; n++) {
+                snow3g_keystream_8_8(&state, &ksL, &ksH);
+
+                /* extract indexes must be compile-time constants */
+                src[0] = xor_keystrm_rev(dst[0], src[0],
+                                         _mm256_extract_epi64(ksL, 0));
+                src[1] = xor_keystrm_rev(dst[1], src[1],
+                                         _mm256_extract_epi64(ksL, 1));
+                src[2] = xor_keystrm_rev(dst[2], src[2],
+                                         _mm256_extract_epi64(ksH, 0));
+                src[3] = xor_keystrm_rev(dst[3], src[3],
+                                         _mm256_extract_epi64(ksH, 1));
+                src[4] = xor_keystrm_rev(dst[4], src[4],
+                                         _mm256_extract_epi64(ksL, 2));
+                src[5] = xor_keystrm_rev(dst[5], src[5],
+                                         _mm256_extract_epi64(ksL, 3));
+                src[6] = xor_keystrm_rev(dst[6], src[6],
+                                         _mm256_extract_epi64(ksH, 2));
+                src[7] = xor_keystrm_rev(dst[7], src[7],
+                                         _mm256_extract_epi64(ksH, 3));
+
+                for (lane = 0; lane < 8; lane++)
+                        dst[lane] += SNOW3G_8_BYTES;
+        }
+
+        /* finish each buffer on its own:
+         *  - extract that lane's LFSR and FSM state
+         *  - continue with the single buffer routine
+         */
+        for (lane = 0; lane < 8; lane++) {
+                snow3gKeyState1_t laneState;
+
+                if (remaining[lane] == 0)
+                        continue;
+
+                snow3gStateConvert_8(&state, &laneState, lane);
+                f8_snow3g(&laneState, src[lane], dst[lane], remaining[lane]);
+        }
+
+#ifdef SAFE_DATA
+        ksH = _mm256_setzero_si256();
+        ksL = _mm256_setzero_si256();
+        CLEAR_MEM(&state, sizeof(state));
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G 8 buffer ks 32 multi:
+ *      Processes 8 packets 32 bytes at a time.
+ *      Uses individual key schedule for each buffer.
+ *
+ *      bytes is rounded down to a multiple of 32; that many
+ *      bytes of each buffer go through the 8-lane loop and
+ *      the rest of each buffer is finished by f8_snow3g()
+ *      using that lane's converted state.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_32_multi(uint32_t bytes,
+                            const snow3g_key_schedule_t * const pKey[],
+                            const void * const IV[],
+                            const void * const pBufferIn[],
+                            void *pBufferOut[], const uint32_t *lengthInBytes)
+{
+
+        snow3gKeyState8_t ctx;
+        uint32_t i;
+
+        const uint8_t *tBufferIn[8];
+        uint8_t *tBufferOut[8];
+        uint32_t tLenInBytes[8];
+
+        /* working copies so the caller's arrays are not modified */
+        for (i = 0; i < 8; i++) {
+                tBufferIn[i] = pBufferIn[i];
+                tBufferOut[i] = pBufferOut[i];
+                tLenInBytes[i] = lengthInBytes[i];
+        }
+
+        uint32_t blocks = bytes / 32;
+
+        bytes = blocks * 32; /* rounded down minimum length */
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_8_multiKey(&ctx, pKey, IV);
+
+        /* Clock FSM and LFSR once, ignore the keystream */
+        __m256i ks[8];
+
+        snow3g_keystream_8_4(&ctx, ks);
+
+        /* what is left of each buffer after the 32-byte block loop */
+        for (i = 0; i < 8; i++)
+                tLenInBytes[i] -= bytes;
+
+        __m256i in[8];
+
+        /* generates 8 sets at a time on all streams */
+        for (i = 0; i < blocks; i++) {
+                int j;
+
+                /* all inputs are loaded before any output is stored */
+                in[0] = _mm256_loadu_si256((const __m256i *)tBufferIn[0]);
+                in[1] = _mm256_loadu_si256((const __m256i *)tBufferIn[1]);
+                in[2] = _mm256_loadu_si256((const __m256i *)tBufferIn[2]);
+                in[3] = _mm256_loadu_si256((const __m256i *)tBufferIn[3]);
+                in[4] = _mm256_loadu_si256((const __m256i *)tBufferIn[4]);
+                in[5] = _mm256_loadu_si256((const __m256i *)tBufferIn[5]);
+                in[6] = _mm256_loadu_si256((const __m256i *)tBufferIn[6]);
+                in[7] = _mm256_loadu_si256((const __m256i *)tBufferIn[7]);
+
+                snow3g_keystream_8_32(&ctx, ks);
+
+                _mm256_storeu_si256((__m256i *)tBufferOut[0],
+                                    _mm256_xor_si256(in[0], ks[0]));
+                _mm256_storeu_si256((__m256i *)tBufferOut[1],
+                                    _mm256_xor_si256(in[1], ks[1]));
+                _mm256_storeu_si256((__m256i *)tBufferOut[2],
+                                    _mm256_xor_si256(in[2], ks[2]));
+                _mm256_storeu_si256((__m256i *)tBufferOut[3],
+                                    _mm256_xor_si256(in[3], ks[3]));
+                _mm256_storeu_si256((__m256i *)tBufferOut[4],
+                                    _mm256_xor_si256(in[4], ks[4]));
+                _mm256_storeu_si256((__m256i *)tBufferOut[5],
+                                    _mm256_xor_si256(in[5], ks[5]));
+                _mm256_storeu_si256((__m256i *)tBufferOut[6],
+                                    _mm256_xor_si256(in[6], ks[6]));
+                _mm256_storeu_si256((__m256i *)tBufferOut[7],
+                                    _mm256_xor_si256(in[7], ks[7]));
+
+                /* advance every lane by one block; the index must be the
+                 * lane counter j, not the block counter i, otherwise only
+                 * one lane moves and the arrays are overrun once i >= 8
+                 */
+                for (j = 0; j < 8; j++) {
+                        tBufferIn[j] += 32;
+                        tBufferOut[j] += 32;
+                }
+        }
+
+        /* process the remaining of each buffer
+         *  - extract the LFSR and FSM structures
+         *  - Continue process 1 buffer
+         */
+        if (tLenInBytes[0]) {
+                snow3gKeyState1_t ctx1;
+
+                snow3gStateConvert_8(&ctx, &ctx1, 0);
+                f8_snow3g(&ctx1, tBufferIn[0], tBufferOut[0], tLenInBytes[0]);
+        }
+        if (tLenInBytes[1]) {
+                snow3gKeyState1_t ctx2;
+
+                snow3gStateConvert_8(&ctx, &ctx2, 1);
+                f8_snow3g(&ctx2, tBufferIn[1], tBufferOut[1], tLenInBytes[1]);
+        }
+        if (tLenInBytes[2]) {
+                snow3gKeyState1_t ctx3;
+
+                snow3gStateConvert_8(&ctx, &ctx3, 2);
+                f8_snow3g(&ctx3, tBufferIn[2], tBufferOut[2], tLenInBytes[2]);
+        }
+        if (tLenInBytes[3]) {
+                snow3gKeyState1_t ctx4;
+
+                snow3gStateConvert_8(&ctx, &ctx4, 3);
+                f8_snow3g(&ctx4, tBufferIn[3], tBufferOut[3], tLenInBytes[3]);
+        }
+        if (tLenInBytes[4]) {
+                snow3gKeyState1_t ctx5;
+
+                snow3gStateConvert_8(&ctx, &ctx5, 4);
+                f8_snow3g(&ctx5, tBufferIn[4], tBufferOut[4], tLenInBytes[4]);
+        }
+        if (tLenInBytes[5]) {
+                snow3gKeyState1_t ctx6;
+
+                snow3gStateConvert_8(&ctx, &ctx6, 5);
+                f8_snow3g(&ctx6, tBufferIn[5], tBufferOut[5], tLenInBytes[5]);
+        }
+        if (tLenInBytes[6]) {
+                snow3gKeyState1_t ctx7;
+
+                snow3gStateConvert_8(&ctx, &ctx7, 6);
+                f8_snow3g(&ctx7, tBufferIn[6], tBufferOut[6], tLenInBytes[6]);
+        }
+        if (tLenInBytes[7]) {
+                snow3gKeyState1_t ctx8;
+
+                snow3gStateConvert_8(&ctx, &ctx8, 7);
+                f8_snow3g(&ctx8, tBufferIn[7], tBufferOut[7], tLenInBytes[7]);
+        }
+
+#ifdef SAFE_DATA
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        CLEAR_MEM(&ks, sizeof(ks));
+        CLEAR_MEM(&in, sizeof(in));
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G 8 buffer ks 8:
+ *      Processes 8 packets, 8 bytes of each per iteration.
+ *      One key schedule is shared by all buffers; each
+ *      buffer has its own IV.
+ *
+ *      bytes is rounded down to a multiple of 8; that many
+ *      bytes of each buffer go through the 8-lane loop and
+ *      the rest of each buffer is finished by f8_snow3g()
+ *      using that lane's converted state.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_8(uint32_t bytes,
+                     const snow3g_key_schedule_t *pHandle,
+                     const void *pIV1,
+                     const void *pIV2,
+                     const void *pIV3,
+                     const void *pIV4,
+                     const void *pIV5,
+                     const void *pIV6,
+                     const void *pIV7,
+                     const void *pIV8,
+                     const void *pBufferIn1, void *pBufferOut1,
+                     const uint32_t lengthInBytes1,
+                     const void *pBufferIn2, void *pBufferOut2,
+                     const uint32_t lengthInBytes2,
+                     const void *pBufferIn3, void *pBufferOut3,
+                     const uint32_t lengthInBytes3,
+                     const void *pBufferIn4, void *pBufferOut4,
+                     const uint32_t lengthInBytes4,
+                     const void *pBufferIn5, void *pBufferOut5,
+                     const uint32_t lengthInBytes5,
+                     const void *pBufferIn6, void *pBufferOut6,
+                     const uint32_t lengthInBytes6,
+                     const void *pBufferIn7, void *pBufferOut7,
+                     const uint32_t lengthInBytes7,
+                     const void *pBufferIn8, void *pBufferOut8,
+                     const uint32_t lengthInBytes8)
+{
+        const uint32_t numQwords = bytes / SNOW3G_8_BYTES;
+        /* ksL feeds lanes 0, 1, 4, 5 and ksH feeds lanes 2, 3, 6, 7 */
+        __m256i ksH, ksL;
+        snow3gKeyState8_t state;
+        uint32_t lane, n;
+
+        /* per-lane working copies of the caller's arguments */
+        const uint8_t *src[8] = {
+                pBufferIn1, pBufferIn2, pBufferIn3, pBufferIn4,
+                pBufferIn5, pBufferIn6, pBufferIn7, pBufferIn8
+        };
+        uint8_t *dst[8] = {
+                pBufferOut1, pBufferOut2, pBufferOut3, pBufferOut4,
+                pBufferOut5, pBufferOut6, pBufferOut7, pBufferOut8
+        };
+        uint32_t remaining[8] = {
+                lengthInBytes1, lengthInBytes2, lengthInBytes3, lengthInBytes4,
+                lengthInBytes5, lengthInBytes6, lengthInBytes7, lengthInBytes8
+        };
+
+        /* length covered by the 8-lane loop, a multiple of 8 bytes */
+        bytes = numQwords * SNOW3G_8_BYTES;
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_8(&state, pHandle, pIV1, pIV2, pIV3,
+                                pIV4, pIV5, pIV6, pIV7, pIV8);
+
+        /* Clock FSM and LFSR once, ignore the keystream */
+        snow3g_keystream_8_4(&state, &ksL);
+
+        /* what the 8-lane loop leaves over in each buffer */
+        for (lane = 0; lane < 8; lane++)
+                remaining[lane] -= bytes;
+
+        /* 8 bytes of keystream per lane on every pass */
+        for (n = 0; n < numQwords; n++) {
+                snow3g_keystream_8_8(&state, &ksL, &ksH);
+
+                /* extract indexes must be compile-time constants */
+                src[0] = xor_keystrm_rev(dst[0], src[0],
+                                         _mm256_extract_epi64(ksL, 0));
+                src[1] = xor_keystrm_rev(dst[1], src[1],
+                                         _mm256_extract_epi64(ksL, 1));
+                src[2] = xor_keystrm_rev(dst[2], src[2],
+                                         _mm256_extract_epi64(ksH, 0));
+                src[3] = xor_keystrm_rev(dst[3], src[3],
+                                         _mm256_extract_epi64(ksH, 1));
+                src[4] = xor_keystrm_rev(dst[4], src[4],
+                                         _mm256_extract_epi64(ksL, 2));
+                src[5] = xor_keystrm_rev(dst[5], src[5],
+                                         _mm256_extract_epi64(ksL, 3));
+                src[6] = xor_keystrm_rev(dst[6], src[6],
+                                         _mm256_extract_epi64(ksH, 2));
+                src[7] = xor_keystrm_rev(dst[7], src[7],
+                                         _mm256_extract_epi64(ksH, 3));
+
+                for (lane = 0; lane < 8; lane++)
+                        dst[lane] += SNOW3G_8_BYTES;
+        }
+
+        /* finish each buffer on its own:
+         *  - extract that lane's LFSR and FSM state
+         *  - continue with the single buffer routine
+         */
+        for (lane = 0; lane < 8; lane++) {
+                snow3gKeyState1_t laneState;
+
+                if (remaining[lane] == 0)
+                        continue;
+
+                snow3gStateConvert_8(&state, &laneState, lane);
+                f8_snow3g(&laneState, src[lane], dst[lane], remaining[lane]);
+        }
+
+#ifdef SAFE_DATA
+        ksH = _mm256_setzero_si256();
+        ksL = _mm256_setzero_si256();
+        CLEAR_MEM(&state, sizeof(state));
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G 8 buffer ks 32:
+ *      Processes 8 packets, 32 bytes of each per iteration.
+ *      One key schedule is shared by all buffers; each
+ *      buffer has its own IV.
+ *
+ *      bytes is rounded down to a multiple of 32; that many
+ *      bytes of each buffer go through the 8-lane loop and
+ *      the rest of each buffer is finished by f8_snow3g()
+ *      using that lane's converted state.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_32(uint32_t bytes,
+                      const snow3g_key_schedule_t *pKey,
+                      const void *pIV1, const void *pIV2,
+                      const void *pIV3, const void *pIV4,
+                      const void *pIV5, const void *pIV6,
+                      const void *pIV7, const void *pIV8,
+                      const void *pBufferIn1, void *pBufferOut1,
+                      const uint32_t lengthInBytes1,
+                      const void *pBufferIn2, void *pBufferOut2,
+                      const uint32_t lengthInBytes2,
+                      const void *pBufferIn3, void *pBufferOut3,
+                      const uint32_t lengthInBytes3,
+                      const void *pBufferIn4, void *pBufferOut4,
+                      const uint32_t lengthInBytes4,
+                      const void *pBufferIn5, void *pBufferOut5,
+                      const uint32_t lengthInBytes5,
+                      const void *pBufferIn6, void *pBufferOut6,
+                      const uint32_t lengthInBytes6,
+                      const void *pBufferIn7, void *pBufferOut7,
+                      const uint32_t lengthInBytes7,
+                      const void *pBufferIn8, void *pBufferOut8,
+                      const uint32_t lengthInBytes8)
+{
+        const uint32_t numBlocks = bytes / 32;
+        snow3gKeyState8_t state;
+        __m256i ks[8]; /* one 32-byte keystream block per lane */
+        __m256i in[8]; /* one 32-byte input block per lane */
+        uint32_t lane, blk;
+
+        /* per-lane working copies of the caller's arguments */
+        const uint8_t *src[8] = {
+                pBufferIn1, pBufferIn2, pBufferIn3, pBufferIn4,
+                pBufferIn5, pBufferIn6, pBufferIn7, pBufferIn8
+        };
+        uint8_t *dst[8] = {
+                pBufferOut1, pBufferOut2, pBufferOut3, pBufferOut4,
+                pBufferOut5, pBufferOut6, pBufferOut7, pBufferOut8
+        };
+        uint32_t remaining[8] = {
+                lengthInBytes1, lengthInBytes2, lengthInBytes3, lengthInBytes4,
+                lengthInBytes5, lengthInBytes6, lengthInBytes7, lengthInBytes8
+        };
+
+        /* length covered by the 8-lane loop, a multiple of 32 bytes */
+        bytes = numBlocks * 32;
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_8(&state, pKey, pIV1, pIV2, pIV3, pIV4, pIV5,
+                                pIV6, pIV7, pIV8);
+
+        /* Clock FSM and LFSR once, ignore the keystream */
+        snow3g_keystream_8_4(&state, ks);
+
+        /* what the 8-lane loop leaves over in each buffer */
+        for (lane = 0; lane < 8; lane++)
+                remaining[lane] -= bytes;
+
+        /* one 32-byte block of every buffer per pass */
+        for (blk = 0; blk < numBlocks; blk++) {
+                /* every input block is read before any output is written */
+                for (lane = 0; lane < 8; lane++)
+                        in[lane] = _mm256_loadu_si256(
+                                (const __m256i *)src[lane]);
+
+                snow3g_keystream_8_32(&state, ks);
+
+                for (lane = 0; lane < 8; lane++)
+                        _mm256_storeu_si256((__m256i *)dst[lane],
+                                            _mm256_xor_si256(in[lane],
+                                                             ks[lane]));
+
+                for (lane = 0; lane < 8; lane++) {
+                        src[lane] += 32;
+                        dst[lane] += 32;
+                }
+        }
+
+        /* finish each buffer on its own:
+         *  - extract that lane's LFSR and FSM state
+         *  - continue with the single buffer routine
+         */
+        for (lane = 0; lane < 8; lane++) {
+                snow3gKeyState1_t laneState;
+
+                if (remaining[lane] == 0)
+                        continue;
+
+                snow3gStateConvert_8(&state, &laneState, lane);
+                f8_snow3g(&laneState, src[lane], dst[lane], remaining[lane]);
+        }
+
+#ifdef SAFE_DATA
+        CLEAR_MEM(&state, sizeof(state));
+        CLEAR_MEM(&ks, sizeof(ks));
+        CLEAR_MEM(&in, sizeof(in));
+#endif /* SAFE_DATA */
+}
+#endif /* AVX2 */
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 8 buffer, multi-key:
+ *      Enc/dec of eight packets, each with its own key schedule.
+ *      Keys, IVs, input buffers, output buffers and lengths are
+ *      passed as arrays of 8 entries, one entry per packet.
+ *
+ *      With SAFE_PARAM defined the function returns without doing
+ *      anything when an array or an array entry is NULL, or when
+ *      a length is 0 or greater than SNOW3G_MAX_BYTELEN.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[],
+                                 const void * const IV[],
+                                 const void * const BufferIn[],
+                                 void *BufferOut[],
+                                 const uint32_t lengthInBytes[])
+{
+        int lane;
+
+#ifdef SAFE_PARAM
+        if (pKey == NULL || IV == NULL || BufferIn == NULL ||
+            BufferOut == NULL || lengthInBytes == NULL)
+                return;
+
+        for (lane = 0; lane < 8; lane++) {
+                if (pKey[lane] == NULL || IV[lane] == NULL)
+                        return;
+                if (BufferIn[lane] == NULL || BufferOut[lane] == NULL)
+                        return;
+                if (lengthInBytes[lane] == 0 ||
+                    lengthInBytes[lane] > SNOW3G_MAX_BYTELEN)
+                        return;
+        }
+#endif
+
+#ifdef AVX2
+        /* shortest of the eight packets */
+        uint32_t minLen = lengthInBytes[0];
+
+        for (lane = 1; lane < 8; lane++)
+                minLen = (lengthInBytes[lane] < minLen) ?
+                        lengthInBytes[lane] : minLen;
+
+        /* 32-byte path only when the common length is a multiple of 32 */
+        if ((minLen % 32) == 0) {
+                snow3g_8_buffer_ks_32_multi(minLen, pKey, IV, BufferIn,
+                                            BufferOut, lengthInBytes);
+        } else {
+                snow3g_8_buffer_ks_8_multi(minLen, pKey, IV, BufferIn,
+                                           BufferOut, lengthInBytes);
+        }
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#else  /* ~AVX2 */
+        /* no 8-lane implementation without AVX2: one packet at a time */
+        for (lane = 0; lane < 8; lane++)
+                SNOW3G_F8_1_BUFFER(pKey[lane], IV[lane], BufferIn[lane],
+                                   BufferOut[lane], lengthInBytes[lane]);
+#endif /* AVX2 */
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F8 8 buffer:
+ *      Enc/dec of eight packets with the same key schedule.
+ *      Each packet has its own IV, input buffer, output buffer
+ *      and length, all passed as separate arguments.
+ *      When built with AVX2 defined the 8-lane routines are
+ *      used; otherwise the work is done by four calls to
+ *      SNOW3G_F8_2_BUFFER.
+ *
+ *      With SAFE_PARAM defined the function returns without doing
+ *      anything when any pointer is NULL, or when a length is 0
+ *      or greater than SNOW3G_MAX_BYTELEN.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV1,
+                        const void *pIV2,
+                        const void *pIV3,
+                        const void *pIV4,
+                        const void *pIV5,
+                        const void *pIV6,
+                        const void *pIV7,
+                        const void *pIV8,
+                        const void *pBufIn1,
+                        void *pBufOut1,
+                        const uint32_t lenInBytes1,
+                        const void *pBufIn2,
+                        void *pBufOut2,
+                        const uint32_t lenInBytes2,
+                        const void *pBufIn3,
+                        void *pBufOut3,
+                        const uint32_t lenInBytes3,
+                        const void *pBufIn4,
+                        void *pBufOut4,
+                        const uint32_t lenInBytes4,
+                        const void *pBufIn5,
+                        void *pBufOut5,
+                        const uint32_t lenInBytes5,
+                        const void *pBufIn6,
+                        void *pBufOut6,
+                        const uint32_t lenInBytes6,
+                        const void *pBufIn7,
+                        void *pBufOut7,
+                        const uint32_t lenInBytes7,
+                        const void *pBufIn8,
+                        void *pBufOut8,
+                        const uint32_t lenInBytes8)
+{
+#ifdef SAFE_PARAM
+        /* key schedule and IVs */
+        if (pHandle == NULL ||
+            pIV1 == NULL || pIV2 == NULL || pIV3 == NULL || pIV4 == NULL ||
+            pIV5 == NULL || pIV6 == NULL || pIV7 == NULL || pIV8 == NULL)
+                return;
+
+        /* input and output buffers */
+        if (pBufIn1 == NULL || pBufOut1 == NULL ||
+            pBufIn2 == NULL || pBufOut2 == NULL ||
+            pBufIn3 == NULL || pBufOut3 == NULL ||
+            pBufIn4 == NULL || pBufOut4 == NULL ||
+            pBufIn5 == NULL || pBufOut5 == NULL ||
+            pBufIn6 == NULL || pBufOut6 == NULL ||
+            pBufIn7 == NULL || pBufOut7 == NULL ||
+            pBufIn8 == NULL || pBufOut8 == NULL)
+                return;
+
+        /* lengths */
+        if (lenInBytes1 == 0 || lenInBytes1 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes2 == 0 || lenInBytes2 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes3 == 0 || lenInBytes3 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes4 == 0 || lenInBytes4 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes5 == 0 || lenInBytes5 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes6 == 0 || lenInBytes6 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes7 == 0 || lenInBytes7 > SNOW3G_MAX_BYTELEN ||
+            lenInBytes8 == 0 || lenInBytes8 > SNOW3G_MAX_BYTELEN)
+                return;
+#endif
+
+#ifdef AVX2
+        const uint32_t lens[8] = {
+                lenInBytes1, lenInBytes2, lenInBytes3, lenInBytes4,
+                lenInBytes5, lenInBytes6, lenInBytes7, lenInBytes8
+        };
+        uint32_t bytes = lens[0]; /* shortest of the eight packets */
+        int k;
+
+        for (k = 1; k < 8; k++)
+                if (lens[k] < bytes)
+                        bytes = lens[k];
+
+        /* 32-byte path only when the common length is a multiple of 32 */
+        if ((bytes % 32) == 0) {
+                snow3g_8_buffer_ks_32(
+                        bytes, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6,
+                        pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2,
+                        pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3,
+                        pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5,
+                        lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7,
+                        pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8);
+        } else {
+                snow3g_8_buffer_ks_8(
+                        bytes, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6,
+                        pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2,
+                        pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3,
+                        pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5,
+                        lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7,
+                        pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8);
+        }
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#else  /* ~AVX2 */
+        /* no 8-lane implementation without AVX2: two packets per call */
+        SNOW3G_F8_2_BUFFER(pHandle, pIV1, pIV2, pBufIn1, pBufOut1, lenInBytes1,
+                           pBufIn2, pBufOut2, lenInBytes2);
+
+        SNOW3G_F8_2_BUFFER(pHandle, pIV3, pIV4, pBufIn3, pBufOut3, lenInBytes3,
+                           pBufIn4, pBufOut4, lenInBytes4);
+
+        SNOW3G_F8_2_BUFFER(pHandle, pIV5, pIV6, pBufIn5, pBufOut5, lenInBytes5,
+                           pBufIn6, pBufOut6, lenInBytes6);
+
+        SNOW3G_F8_2_BUFFER(pHandle, pIV7, pIV8, pBufIn7, pBufOut7, lenInBytes7,
+                           pBufIn8, pBufOut8, lenInBytes8);
+#endif /* AVX2 */
+}
+
+/******************************************************************************
+ * @description
+ *      Snow3G F8 multi packet:
+ *      Performs F8 enc/dec on [n] packets. Packet i is handed to the F8
+ *      buffer routines with pBufferIn[i] as input and pBufferOut[i] as output.
+ *      The input IV's are passed in Little Endian format.
+ *      The KeySchedule is in Little Endian format.
+ *
+ *      The pointer and length arrays are copied into local arrays. If the
+ *      lengths are not already in non-increasing order, the local copies are
+ *      sorted by decreasing length before being dispatched to the
+ *      8- (AVX2 builds only), 4-, 2- and 1-buffer routines. The caller's
+ *      arrays are never reordered.
+ *
+ * @param[in]  pCtx           Key schedule used for every packet
+ * @param[in]  IV             Array of packetCount IV pointers
+ * @param[in]  pBufferIn      Array of packetCount input buffer pointers
+ * @param[out] pBufferOut     Array of packetCount output buffer pointers
+ * @param[in]  bufLenInBytes  Array of packetCount buffer lengths in bytes
+ * @param[in]  packetCount    Number of packets (at most NUM_PACKETS_16)
+ *
+ * @note When packetCount exceeds NUM_PACKETS_16, pBufferOut[0] is set to
+ *       NULL, a message is printed to stdout and nothing is processed.
+ * @note With SAFE_PARAM defined, the function returns without processing
+ *       anything if any pointer is NULL or any length is 0 or greater
+ *       than SNOW3G_MAX_BYTELEN.
+ ******************************************************************************/
+void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx,
+                        const void * const IV[],
+                        const void * const pBufferIn[],
+                        void *pBufferOut[],
+                        const uint32_t bufLenInBytes[],
+                        const uint32_t packetCount)
+{
+#ifdef SAFE_PARAM
+        uint32_t i;
+
+        if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) ||
+            (pBufferOut == NULL) || (bufLenInBytes == NULL))
+                return;
+
+        for (i = 0; i < packetCount; i++)
+                if ((IV[i] == NULL) || (pBufferIn[i] == NULL) ||
+                    (pBufferOut[i] == NULL) || (bufLenInBytes[i] == 0) ||
+                    (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN))
+                        return;
+#endif
+        /* the local arrays below hold at most NUM_PACKETS_16 entries */
+        if (packetCount > NUM_PACKETS_16) {
+                pBufferOut[0] = NULL;
+                printf("packetCount too high (%u)\n", (unsigned)packetCount);
+                return;
+        }
+
+        uint32_t packet_index, inner_index, pktCnt = packetCount;
+        int sortNeeded = 0;
+        uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL};
+        uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL};
+        uint8_t *pIV[NUM_PACKETS_16] = {NULL};
+        uint32_t lensBuf[NUM_PACKETS_16] = {0};
+
+        /* work on local copies so the caller's arrays keep their order */
+        memcpy(lensBuf, bufLenInBytes, packetCount * sizeof(lensBuf[0]));
+        memcpy(pSrcBuf, pBufferIn, packetCount * sizeof(pSrcBuf[0]));
+        memcpy(pDstBuf, pBufferOut, packetCount * sizeof(pDstBuf[0]));
+        memcpy(pIV, IV, packetCount * sizeof(pIV[0]));
+
+        /* check if all packets are sorted by decreasing length;
+           the first out-of-order pair is enough to decide */
+        for (packet_index = 1; packet_index < packetCount; packet_index++) {
+                if (lensBuf[packet_index - 1] < lensBuf[packet_index]) {
+                        /* this packet array is not correctly sorted */
+                        sortNeeded = 1;
+                        break;
+                }
+        }
+
+        if (sortNeeded) {
+
+                /* sort packets in decreasing buffer size from [0] to
+                   [n]th packet, where buffer[0] will contain longest
+                   buffer and buffer[n] will contain the shortest buffer.
+                   4 arrays are swapped :
+                   - pointers to input buffers
+                   - pointers to output buffers
+                   - pointers to input IV's
+                   - input buffer lengths */
+                packet_index = packetCount;
+                while (packet_index--) {
+
+                        /* after the inner loop, slot [packet_index] holds
+                           the shortest length among [0..packet_index] */
+                        inner_index = packet_index;
+                        while (inner_index--) {
+
+                                if (lensBuf[packet_index] >
+                                    lensBuf[inner_index]) {
+
+                                        /* swap buffers to arrange in
+                                           descending order from [0]. */
+                                        uint8_t *srctempbuff =
+                                                pSrcBuf[packet_index];
+                                        uint8_t *dsttempbuff =
+                                                pDstBuf[packet_index];
+                                        uint8_t *ivtempbuff =
+                                                pIV[packet_index];
+                                        const uint32_t tempLen =
+                                                lensBuf[packet_index];
+
+                                        pSrcBuf[packet_index] =
+                                                pSrcBuf[inner_index];
+                                        pDstBuf[packet_index] =
+                                                pDstBuf[inner_index];
+                                        pIV[packet_index] = pIV[inner_index];
+                                        lensBuf[packet_index] =
+                                                lensBuf[inner_index];
+
+                                        pSrcBuf[inner_index] = srctempbuff;
+                                        pDstBuf[inner_index] = dsttempbuff;
+                                        pIV[inner_index] = ivtempbuff;
+                                        lensBuf[inner_index] = tempLen;
+                                }
+                        } /* for inner packet index */
+                }         /* for outer packet index */
+        }                 /* if sortNeeded */
+
+        packet_index = 0;
+        /* process 8 buffers at-a-time */
+#ifdef AVX2
+        while (pktCnt >= 8) {
+                pktCnt -= 8;
+                SNOW3G_F8_8_BUFFER(pCtx, pIV[packet_index],
+                                   pIV[packet_index + 1],
+                                   pIV[packet_index + 2],
+                                   pIV[packet_index + 3],
+                                   pIV[packet_index + 4],
+                                   pIV[packet_index + 5],
+                                   pIV[packet_index + 6],
+                                   pIV[packet_index + 7],
+                                   pSrcBuf[packet_index],
+                                   pDstBuf[packet_index],
+                                   lensBuf[packet_index],
+                                   pSrcBuf[packet_index + 1],
+                                   pDstBuf[packet_index + 1],
+                                   lensBuf[packet_index + 1],
+                                   pSrcBuf[packet_index + 2],
+                                   pDstBuf[packet_index + 2],
+                                   lensBuf[packet_index + 2],
+                                   pSrcBuf[packet_index + 3],
+                                   pDstBuf[packet_index + 3],
+                                   lensBuf[packet_index + 3],
+                                   pSrcBuf[packet_index + 4],
+                                   pDstBuf[packet_index + 4],
+                                   lensBuf[packet_index + 4],
+                                   pSrcBuf[packet_index + 5],
+                                   pDstBuf[packet_index + 5],
+                                   lensBuf[packet_index + 5],
+                                   pSrcBuf[packet_index + 6],
+                                   pDstBuf[packet_index + 6],
+                                   lensBuf[packet_index + 6],
+                                   pSrcBuf[packet_index + 7],
+                                   pDstBuf[packet_index + 7],
+                                   lensBuf[packet_index + 7]);
+                packet_index += 8;
+        }
+#endif
+        /* process 4 buffers at-a-time */
+        while (pktCnt >= 4) {
+                pktCnt -= 4;
+                SNOW3G_F8_4_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pIV[packet_index + 1],
+                                   pIV[packet_index + 2],
+                                   pIV[packet_index + 3],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0],
+                                   pSrcBuf[packet_index + 1],
+                                   pDstBuf[packet_index + 1],
+                                   lensBuf[packet_index + 1],
+                                   pSrcBuf[packet_index + 2],
+                                   pDstBuf[packet_index + 2],
+                                   lensBuf[packet_index + 2],
+                                   pSrcBuf[packet_index + 3],
+                                   pDstBuf[packet_index + 3],
+                                   lensBuf[packet_index + 3]);
+                packet_index += 4;
+        }
+
+        /* process 2 packets at-a-time */
+        while (pktCnt >= 2) {
+                pktCnt -= 2;
+                SNOW3G_F8_2_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pIV[packet_index + 1],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0],
+                                   pSrcBuf[packet_index + 1],
+                                   pDstBuf[packet_index + 1],
+                                   lensBuf[packet_index + 1]);
+                packet_index += 2;
+        }
+
+        /* remaining packets are processed 1 at a time */
+        while (pktCnt--) {
+                SNOW3G_F8_1_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0]);
+                packet_index++;
+        }
+}
+
+/******************************************************************************
+ * @description
+ *      Snow3G F8 multi packet, one key schedule per packet:
+ *      Packet i is handed to the F8 buffer routines with key schedule
+ *      pCtx[i], IV[i], pBufferIn[i] as input and pBufferOut[i] as output.
+ *
+ *      The pointer and length arrays are copied into local arrays. If the
+ *      lengths are not already in non-increasing order, the local copies are
+ *      sorted by decreasing length. Packets are then dispatched 8 at a time
+ *      (AVX2 builds only) and the remainder one at a time. The caller's
+ *      arrays are never reordered.
+ *
+ * @param[in]  pCtx           Array of packetCount key schedule pointers
+ * @param[in]  IV             Array of packetCount IV pointers
+ * @param[in]  pBufferIn      Array of packetCount input buffer pointers
+ * @param[out] pBufferOut     Array of packetCount output buffer pointers
+ * @param[in]  bufLenInBytes  Array of packetCount buffer lengths in bytes
+ * @param[in]  packetCount    Number of packets (at most NUM_PACKETS_16)
+ *
+ * @note When packetCount exceeds NUM_PACKETS_16, pBufferOut[0] is set to
+ *       NULL, a message is printed to stdout and nothing is processed.
+ * @note With SAFE_PARAM defined, the function returns without processing
+ *       anything if any pointer is NULL or any length is 0 or greater
+ *       than SNOW3G_MAX_BYTELEN.
+ ******************************************************************************/
+void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[],
+                                 const void * const IV[],
+                                 const void * const pBufferIn[],
+                                 void *pBufferOut[],
+                                 const uint32_t bufLenInBytes[],
+                                 const uint32_t packetCount)
+{
+#ifdef SAFE_PARAM
+        uint32_t i;
+
+        if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) ||
+            (pBufferOut == NULL) || (bufLenInBytes == NULL))
+                return;
+
+        for (i = 0; i < packetCount; i++)
+                if ((pCtx[i] == NULL) || (IV[i] == NULL) ||
+                    (pBufferIn[i] == NULL) || (pBufferOut[i] == NULL) ||
+                    (bufLenInBytes[i] == 0) ||
+                    (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN))
+                        return;
+#endif
+        /* the local arrays below hold at most NUM_PACKETS_16 entries */
+        if (packetCount > NUM_PACKETS_16) {
+                pBufferOut[0] = NULL;
+                printf("packetCount too high (%u)\n", (unsigned)packetCount);
+                return;
+        }
+
+        uint32_t packet_index, inner_index, pktCnt = packetCount;
+        int sortNeeded = 0;
+        const snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL};
+        uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL};
+        uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL};
+        uint8_t *pIV[NUM_PACKETS_16] = {NULL};
+        uint32_t lensBuf[NUM_PACKETS_16] = {0};
+
+        /* work on local copies so the caller's arrays keep their order */
+        memcpy(pCtxBuf, pCtx, packetCount * sizeof(pCtxBuf[0]));
+        memcpy(lensBuf, bufLenInBytes, packetCount * sizeof(lensBuf[0]));
+        memcpy(pSrcBuf, pBufferIn, packetCount * sizeof(pSrcBuf[0]));
+        memcpy(pDstBuf, pBufferOut, packetCount * sizeof(pDstBuf[0]));
+        memcpy(pIV, IV, packetCount * sizeof(pIV[0]));
+
+        /* check if all packets are sorted by decreasing length;
+           the first out-of-order pair is enough to decide */
+        for (packet_index = 1; packet_index < packetCount; packet_index++) {
+                if (lensBuf[packet_index - 1] < lensBuf[packet_index]) {
+                        /* this packet array is not correctly sorted */
+                        sortNeeded = 1;
+                        break;
+                }
+        }
+
+        if (sortNeeded) {
+                /* sort packets in decreasing buffer size from [0] to [n]th
+                   packet, where buffer[0] will contain longest buffer and
+                   buffer[n] will contain the shortest buffer.
+                   5 arrays are swapped :
+                   - pointers to input buffers
+                   - pointers to output buffers
+                   - pointers to input IV's
+                   - input buffer lengths
+                   - pointers to key schedules */
+                packet_index = packetCount;
+                while (packet_index--) {
+                        /* after the inner loop, slot [packet_index] holds
+                           the shortest length among [0..packet_index] */
+                        inner_index = packet_index;
+                        while (inner_index--) {
+                                if (lensBuf[packet_index] >
+                                    lensBuf[inner_index]) {
+                                        /* swap buffers to arrange in
+                                           descending order from [0]. */
+                                        uint8_t *srctempbuff =
+                                                pSrcBuf[packet_index];
+                                        uint8_t *dsttempbuff =
+                                                pDstBuf[packet_index];
+                                        uint8_t *ivtempbuff =
+                                                pIV[packet_index];
+                                        const uint32_t tempLen =
+                                                lensBuf[packet_index];
+                                        const snow3g_key_schedule_t *tempCtx =
+                                                pCtxBuf[packet_index];
+
+                                        pSrcBuf[packet_index] =
+                                                pSrcBuf[inner_index];
+                                        pDstBuf[packet_index] =
+                                                pDstBuf[inner_index];
+                                        pIV[packet_index] = pIV[inner_index];
+                                        lensBuf[packet_index] =
+                                                lensBuf[inner_index];
+                                        pCtxBuf[packet_index] =
+                                                pCtxBuf[inner_index];
+
+                                        pSrcBuf[inner_index] = srctempbuff;
+                                        pDstBuf[inner_index] = dsttempbuff;
+                                        pIV[inner_index] = ivtempbuff;
+                                        lensBuf[inner_index] = tempLen;
+                                        pCtxBuf[inner_index] = tempCtx;
+                                }
+                        } /* for inner packet index */
+                }         /* for outer packet index */
+        }                 /* if sortNeeded */
+
+        packet_index = 0;
+        /* process 8 buffers at-a-time */
+#ifdef AVX2
+        while (pktCnt >= 8) {
+                pktCnt -= 8;
+                SNOW3G_F8_8_BUFFER_MULTIKEY(
+                        (const snow3g_key_schedule_t * const *)
+                        &pCtxBuf[packet_index],
+                        (const void * const *)&pIV[packet_index],
+                        (const void * const *)&pSrcBuf[packet_index],
+                        (void **)&pDstBuf[packet_index],
+                        &lensBuf[packet_index]);
+                packet_index += 8;
+        }
+#endif
+        /* TODO process 4 buffers at-a-time */
+        /* TODO process 2 packets at-a-time */
+        /* remaining packets are processed 1 at a time */
+        while (pktCnt--) {
+                SNOW3G_F8_1_BUFFER(pCtxBuf[packet_index + 0],
+                                   pIV[packet_index + 0],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0]);
+                packet_index++;
+        }
+}
+
+/*---------------------------------------------------------
+ * @description
+ *      Snow3G F9 1 buffer
+ *      Single buffer digest with IV and precomputed key schedule
+ *
+ *      Five keystream words z[0..4] are generated. The message is
+ *      consumed in 64-bit blocks, each byte-swapped with BSWAP64,
+ *      XORed into the accumulator and multiplied by P = z[0]:z[1].
+ *      A trailing partial block is masked down to its valid bits.
+ *      The bit length is then XORed in and the result multiplied
+ *      by Q = z[2]:z[3]. The 4-byte digest is derived from the
+ *      accumulator XORed with z[4] in its upper 32 bits.
+ *
+ * @param[in]  pHandle       Precomputed key schedule
+ * @param[in]  pIV           IV used to initialize the state
+ * @param[in]  pBufferIn     Message to authenticate
+ * @param[in]  lengthInBits  Message length in bits
+ * @param[out] pDigest       Receives the 4-byte digest
+ *
+ * @note With SAFE_PARAM defined, the function returns without
+ *       writing a digest if any pointer is NULL or lengthInBits
+ *       is 0 or greater than SNOW3G_MAX_BITLEN.
+ *---------------------------------------------------------*/
+void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV,
+                        const void *pBufferIn,
+                        const uint64_t lengthInBits,
+                        void *pDigest)
+{
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) || (pIV == NULL) ||
+            (pBufferIn == NULL) || (pDigest == NULL) ||
+            (lengthInBits == 0) || (lengthInBits > SNOW3G_MAX_BITLEN))
+                return;
+#endif
+        snow3gKeyState1_t ctx;
+        uint32_t z[5];
+        uint32_t mac;
+        uint64_t lengthInQwords, E, V, P;
+        uint64_t i, rem_bits;
+        /* byte view of the message: blocks are fetched with memcpy so no
+           alignment of the caller's buffer is assumed */
+        const uint8_t *inputBuffer = (const uint8_t *)pBufferIn;
+
+        /* Initialize the snow3g key schedule */
+        snow3gStateInitialize_1(&ctx, pHandle, pIV);
+
+        /*Generate 5 keystream words*/
+        snow3g_f9_keystream_words(&ctx, &z[0]);
+
+        P = ((uint64_t)z[0] << 32) | ((uint64_t)z[1]);
+
+        lengthInQwords = lengthInBits / 64;
+
+        E = 0;
+        /* all blocks except the last one */
+        for (i = 0; i < lengthInQwords; i++) {
+                memcpy(&V, inputBuffer + (i * sizeof(V)), sizeof(V));
+                V = BSWAP64(V);
+                E = multiply_and_reduce64(E ^ V, P);
+        }
+
+        /* last bits of last block if any left */
+        rem_bits = lengthInBits % 64;
+        if (rem_bits) {
+                /* start from zero so bytes past the copied ones are defined */
+                V = 0;
+                /* last bytes, do not go past end of buffer */
+                memcpy(&V, inputBuffer + (i * sizeof(V)), (rem_bits + 7) / 8);
+                V = BSWAP64(V);
+                V &= (((uint64_t)-1) << (64 - rem_bits)); /* mask extra bits */
+                E = multiply_and_reduce64(E ^ V, P);
+        }
+
+        /* Multiply by Q */
+        E = multiply_and_reduce64(E ^ lengthInBits,
+                                  (((uint64_t)z[2] << 32) | ((uint64_t)z[3])));
+
+        /* Final MAC, copied out so pDigest needs no particular alignment */
+        mac = (uint32_t)BSWAP64(E ^ ((uint64_t)z[4] << 32));
+        memcpy(pDigest, &mac, sizeof(mac));
+#ifdef SAFE_DATA
+        CLEAR_VAR(&E, sizeof(E));
+        CLEAR_VAR(&V, sizeof(V));
+        CLEAR_VAR(&P, sizeof(P));
+        CLEAR_MEM(&z, sizeof(z));
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+#endif /* SNOW3G_COMMON_H */