From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/spdk/intel-ipsec-mb/include/aes_common.asm | 375 +++ src/spdk/intel-ipsec-mb/include/aesni_emu.h | 120 + src/spdk/intel-ipsec-mb/include/aesni_emu.inc | 247 ++ src/spdk/intel-ipsec-mb/include/clear_regs.asm | 196 ++ src/spdk/intel-ipsec-mb/include/clear_regs_mem.h | 53 + .../intel-ipsec-mb/include/clear_regs_mem_fns.asm | 124 + src/spdk/intel-ipsec-mb/include/const.inc | 163 ++ .../intel-ipsec-mb/include/constant_lookup.asm | 561 ++++ src/spdk/intel-ipsec-mb/include/constant_lookup.h | 173 ++ src/spdk/intel-ipsec-mb/include/cpu_feature.h | 52 + src/spdk/intel-ipsec-mb/include/datastruct.asm | 235 ++ src/spdk/intel-ipsec-mb/include/dbgprint.asm | 413 +++ src/spdk/intel-ipsec-mb/include/des_utils.h | 134 + src/spdk/intel-ipsec-mb/include/gcm.h | 428 +++ src/spdk/intel-ipsec-mb/include/gcm_defines.asm | 272 ++ .../include/gcm_keys_avx2_avx512.asm | 52 + .../intel-ipsec-mb/include/gcm_keys_sse_avx.asm | 73 + .../include/gcm_keys_vaes_avx512.asm | 231 ++ src/spdk/intel-ipsec-mb/include/kasumi_internal.h | 1853 +++++++++++++ src/spdk/intel-ipsec-mb/include/memcpy.asm | 613 +++++ src/spdk/intel-ipsec-mb/include/noaesni.h | 65 + src/spdk/intel-ipsec-mb/include/os.asm | 58 + src/spdk/intel-ipsec-mb/include/reg_sizes.asm | 300 +++ src/spdk/intel-ipsec-mb/include/save_xmms.asm | 132 + src/spdk/intel-ipsec-mb/include/save_xmms.h | 39 + src/spdk/intel-ipsec-mb/include/snow3g.h | 511 ++++ src/spdk/intel-ipsec-mb/include/snow3g_common.h | 2840 ++++++++++++++++++++ src/spdk/intel-ipsec-mb/include/snow3g_internal.h | 638 +++++ src/spdk/intel-ipsec-mb/include/transpose_avx2.asm | 218 ++ .../intel-ipsec-mb/include/transpose_avx512.asm | 497 ++++ .../intel-ipsec-mb/include/wireless_common.asm | 128 + src/spdk/intel-ipsec-mb/include/wireless_common.h | 216 ++ src/spdk/intel-ipsec-mb/include/zuc_common.asm | 740 +++++ src/spdk/intel-ipsec-mb/include/zuc_internal.h | 432 +++ 34 files changed, 13182 insertions(+) create mode 100644 src/spdk/intel-ipsec-mb/include/aes_common.asm create mode 100644 src/spdk/intel-ipsec-mb/include/aesni_emu.h create mode 100644 src/spdk/intel-ipsec-mb/include/aesni_emu.inc create mode 100644 src/spdk/intel-ipsec-mb/include/clear_regs.asm create mode 100644 src/spdk/intel-ipsec-mb/include/clear_regs_mem.h create mode 100644 src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm create mode 100644 src/spdk/intel-ipsec-mb/include/const.inc create mode 100644 src/spdk/intel-ipsec-mb/include/constant_lookup.asm create mode 100644 src/spdk/intel-ipsec-mb/include/constant_lookup.h create mode 100644 src/spdk/intel-ipsec-mb/include/cpu_feature.h create mode 100644 src/spdk/intel-ipsec-mb/include/datastruct.asm create mode 100644 src/spdk/intel-ipsec-mb/include/dbgprint.asm create mode 100644 src/spdk/intel-ipsec-mb/include/des_utils.h create mode 100644 src/spdk/intel-ipsec-mb/include/gcm.h create mode 100644 src/spdk/intel-ipsec-mb/include/gcm_defines.asm create mode 100644 src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm create mode 100644 src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm create mode 100644 src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm create mode 100755 src/spdk/intel-ipsec-mb/include/kasumi_internal.h create mode 100644 src/spdk/intel-ipsec-mb/include/memcpy.asm create mode 100644 src/spdk/intel-ipsec-mb/include/noaesni.h create mode 100644 
src/spdk/intel-ipsec-mb/include/os.asm create mode 100644 src/spdk/intel-ipsec-mb/include/reg_sizes.asm create mode 100644 src/spdk/intel-ipsec-mb/include/save_xmms.asm create mode 100644 src/spdk/intel-ipsec-mb/include/save_xmms.h create mode 100644 src/spdk/intel-ipsec-mb/include/snow3g.h create mode 100644 src/spdk/intel-ipsec-mb/include/snow3g_common.h create mode 100644 src/spdk/intel-ipsec-mb/include/snow3g_internal.h create mode 100644 src/spdk/intel-ipsec-mb/include/transpose_avx2.asm create mode 100644 src/spdk/intel-ipsec-mb/include/transpose_avx512.asm create mode 100644 src/spdk/intel-ipsec-mb/include/wireless_common.asm create mode 100644 src/spdk/intel-ipsec-mb/include/wireless_common.h create mode 100644 src/spdk/intel-ipsec-mb/include/zuc_common.asm create mode 100755 src/spdk/intel-ipsec-mb/include/zuc_internal.h (limited to 'src/spdk/intel-ipsec-mb/include') diff --git a/src/spdk/intel-ipsec-mb/include/aes_common.asm b/src/spdk/intel-ipsec-mb/include/aes_common.asm new file mode 100644 index 000000000..5c8cbb48c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/aes_common.asm @@ -0,0 +1,375 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef _AES_COMMON_ASM_ +%define _AES_COMMON_ASM_ + +%include "include/reg_sizes.asm" + +;; ============================================================================= +;; Generic macro to produce code that executes %%OPCODE instruction +;; on selected number of AES blocks (16 bytes long ) between 0 and 16. +;; All three operands of the instruction come from registers. 
+;; Note: if 3 blocks are left at the end instruction is produced to operate all +;; 4 blocks (full width of ZMM) + +%macro ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 14 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OPCODE %2 ; [in] instruction name +%define %%DST0 %3 ; [out] destination ZMM register +%define %%DST1 %4 ; [out] destination ZMM register +%define %%DST2 %5 ; [out] destination ZMM register +%define %%DST3 %6 ; [out] destination ZMM register +%define %%SRC1_0 %7 ; [in] source 1 ZMM register +%define %%SRC1_1 %8 ; [in] source 1 ZMM register +%define %%SRC1_2 %9 ; [in] source 1 ZMM register +%define %%SRC1_3 %10 ; [in] source 1 ZMM register +%define %%SRC2_0 %11 ; [in] source 2 ZMM register +%define %%SRC2_1 %12 ; [in] source 2 ZMM register +%define %%SRC2_2 %13 ; [in] source 2 ZMM register +%define %%SRC2_3 %14 ; [in] source 2 ZMM register + +%assign reg_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%DSTREG %%DST %+ reg_idx +%xdefine %%SRC1REG %%SRC1_ %+ reg_idx +%xdefine %%SRC2REG %%SRC2_ %+ reg_idx + %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG +%undef %%DSTREG +%undef %%SRC1REG +%undef %%SRC2REG +%assign reg_idx (reg_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep + +%xdefine %%DSTREG %%DST %+ reg_idx +%xdefine %%SRC1REG %%SRC1_ %+ reg_idx +%xdefine %%SRC2REG %%SRC2_ %+ reg_idx + +%if blocks_left == 1 + %%OPCODE XWORD(%%DSTREG), XWORD(%%SRC1REG), XWORD(%%SRC2REG) +%elif blocks_left == 2 + %%OPCODE YWORD(%%DSTREG), YWORD(%%SRC1REG), YWORD(%%SRC2REG) +%elif blocks_left == 3 + %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG +%endif + +%endmacro + +;; ============================================================================= +;; Loads specified number of AES blocks into ZMM registers +;; %%FLAGS are optional and only affect behavior when 3 trailing blocks are left +;; - if %%FlAGS not provided then exactly 3 blocks are loaded (move and insert) +;; - if "load_4_instead_of_3" option is passed then 4 blocks are loaded +%macro ZMM_LOAD_BLOCKS_0_16 7-8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%INP %2 ; [in] input data pointer to read from +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%DST0 %4 ; [out] ZMM register with loaded data +%define %%DST1 %5 ; [out] ZMM register with loaded data +%define %%DST2 %6 ; [out] ZMM register with loaded data +%define %%DST3 %7 ; [out] ZMM register with loaded data +%define %%FLAGS %8 ; [in] optional "load_4_instead_of_3" + +%assign src_offset 0 +%assign dst_idx 0 + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%DSTREG %%DST %+ dst_idx + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%undef %%DSTREG +%assign src_offset (src_offset + 64) +%assign dst_idx (dst_idx + 1) +%endrep + +%assign blocks_left (%%NUM_BLOCKS % 4) +%xdefine %%DSTREG %%DST %+ dst_idx + +%if blocks_left == 1 + vmovdqu8 XWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left == 2 + vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left == 3 +%ifidn %%FLAGS, load_4_instead_of_3 + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%else + vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] + vinserti64x2 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset + 32], 2 +%endif +%endif + +%endmacro + +;; ============================================================================= +;; Loads specified number of AES blocks into ZMM registers using mask register +;; for the last 
loaded register (xmm, ymm or zmm). +;; Loads take place at 1 byte granularity. +%macro ZMM_LOAD_MASKED_BLOCKS_0_16 8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%INP %2 ; [in] input data pointer to read from +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%DST0 %4 ; [out] ZMM register with loaded data +%define %%DST1 %5 ; [out] ZMM register with loaded data +%define %%DST2 %6 ; [out] ZMM register with loaded data +%define %%DST3 %7 ; [out] ZMM register with loaded data +%define %%MASK %8 ; [in] mask register + +%assign src_offset 0 +%assign dst_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%if %%NUM_BLOCKS > 0 +%rep (((%%NUM_BLOCKS + 3) / 4) - 1) +%xdefine %%DSTREG %%DST %+ dst_idx + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%undef %%DSTREG +%assign src_offset (src_offset + 64) +%assign dst_idx (dst_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep +%endif ; %if %%NUM_BLOCKS > 0 + +%xdefine %%DSTREG %%DST %+ dst_idx + +%if blocks_left == 1 + vmovdqu8 XWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left == 2 + vmovdqu8 YWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%elif (blocks_left == 3 || blocks_left == 4) + vmovdqu8 %%DSTREG{%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%endif + +%endmacro + +;; ============================================================================= +;; Stores specified number of AES blocks from ZMM registers +%macro ZMM_STORE_BLOCKS_0_16 7 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OUTP %2 ; [in] output data pointer to write to +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%SRC0 %4 ; [in] ZMM register with data to store +%define %%SRC1 %5 ; [in] ZMM register with data to store +%define %%SRC2 %6 ; [in] ZMM register with data to store +%define %%SRC3 %7 ; [in] ZMM register with data to store + +%assign dst_offset 0 +%assign src_idx 0 + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%SRCREG %%SRC %+ src_idx + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG +%undef %%SRCREG +%assign dst_offset (dst_offset + 64) +%assign src_idx (src_idx + 1) +%endrep + +%assign blocks_left (%%NUM_BLOCKS % 4) +%xdefine %%SRCREG %%SRC %+ src_idx + +%if blocks_left == 1 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], XWORD(%%SRCREG) +%elif blocks_left == 2 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG) +%elif blocks_left == 3 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG) + vextracti32x4 [%%OUTP + %%DATA_OFFSET + dst_offset + 32], %%SRCREG, 2 +%endif + +%endmacro + +;; ============================================================================= +;; Stores specified number of AES blocks from ZMM registers with mask register +;; for the last loaded register (xmm, ymm or zmm). +;; Stores take place at 1 byte granularity. 
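;; ----------------------------------------------------------------------------
;; Editorial note: the lines below are an illustrative sketch only and are not
;; part of the upstream file. They show one way the plain load/store helpers
;; defined above (ZMM_LOAD_BLOCKS_0_16 / ZMM_STORE_BLOCKS_0_16) could be
;; combined to copy 7 AES blocks (112 bytes). The register choices (zmm0-zmm3
;; as data registers, rsi/rdi as source/destination pointers, r12 as a byte
;; offset) are assumptions made for the example, kept entirely in comments so
;; the include file itself is unchanged.
;;
;;      xor     r12, r12                ; byte offset into both buffers = 0
;;      ZMM_LOAD_BLOCKS_0_16  7, rsi, r12, zmm0, zmm1, zmm2, zmm3
;;      ZMM_STORE_BLOCKS_0_16 7, rdi, r12, zmm0, zmm1, zmm2, zmm3
;;
;; With NUM_BLOCKS = 7 each macro emits one full 64-byte vmovdqu8 for blocks
;; 0-3 and then handles the 3 trailing blocks with a 32-byte access plus
;; vinserti64x2 (load) or vextracti32x4 (store), as described above.
;; The masked-store variant follows.
;; ----------------------------------------------------------------------------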
+%macro ZMM_STORE_MASKED_BLOCKS_0_16 8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OUTP %2 ; [in] output data pointer to write to +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%SRC0 %4 ; [in] ZMM register with data to store +%define %%SRC1 %5 ; [in] ZMM register with data to store +%define %%SRC2 %6 ; [in] ZMM register with data to store +%define %%SRC3 %7 ; [in] ZMM register with data to store +%define %%MASK %8 ; [in] mask register + +%assign dst_offset 0 +%assign src_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%if %%NUM_BLOCKS > 0 +%rep (((%%NUM_BLOCKS + 3) / 4) - 1) +%xdefine %%SRCREG %%SRC %+ src_idx + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG +%undef %%SRCREG +%assign dst_offset (dst_offset + 64) +%assign src_idx (src_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep +%endif ; %if %%NUM_BLOCKS > 0 + +%xdefine %%SRCREG %%SRC %+ src_idx + +%if blocks_left == 1 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, XWORD(%%SRCREG) +%elif blocks_left == 2 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, YWORD(%%SRCREG) +%elif (blocks_left == 3 || blocks_left == 4) + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, %%SRCREG +%endif + +%endmacro + +;;; =========================================================================== +;;; Handles AES encryption rounds +;;; It handles special cases: the last and first rounds +;;; Optionally, it performs XOR with data after the last AES round. +;;; Uses NROUNDS parameterto check what needs to be done for the current round. +;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks). +%macro ZMM_AESENC_ROUND_BLOCKS_0_16 12 +%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3 +%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7 +%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11 +%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15 +%define %%KEY %5 ; [in] zmm containing round key +%define %%ROUND %6 ; [in] round number +%define %%D0_3 %7 ; [in] zmm or no_data; plain/cipher text blocks 0-3 +%define %%D4_7 %8 ; [in] zmm or no_data; plain/cipher text blocks 4-7 +%define %%D8_11 %9 ; [in] zmm or no_data; plain/cipher text blocks 8-11 +%define %%D12_15 %10 ; [in] zmm or no_data; plain/cipher text blocks 12-15 +%define %%NUMBL %11 ; [in] number of blocks; numerical value +%define %%NROUNDS %12 ; [in] number of rounds; numerical value + +;;; === first AES round +%if (%%ROUND < 1) + ;; round 0 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; ROUND 0 + +;;; === middle AES rounds +%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS) + ;; rounds 1 to 9/11/13 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenc, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; rounds 1 to 9/11/13 + +;;; === last AES round +%if (%%ROUND > %%NROUNDS) + ;; the last round - mix enclast with text xor's + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenclast, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY + +;;; === XOR with data +%ifnidn %%D0_3, no_data +%ifnidn %%D4_7, no_data +%ifnidn %%D8_11, no_data +%ifnidn %%D12_15, no_data + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, 
%%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%D0_3, %%D4_7, %%D8_11, %%D12_15 +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data + +%endif ; The last round + +%endmacro + +;;; =========================================================================== +;;; Handles AES decryption rounds +;;; It handles special cases: the last and first rounds +;;; Optionally, it performs XOR with data after the last AES round. +;;; Uses NROUNDS parameter to check what needs to be done for the current round. +;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks). +%macro ZMM_AESDEC_ROUND_BLOCKS_0_16 12 +%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3 +%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7 +%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11 +%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15 +%define %%KEY %5 ; [in] zmm containing round key +%define %%ROUND %6 ; [in] round number +%define %%D0_3 %7 ; [in] zmm or no_data; cipher text blocks 0-3 +%define %%D4_7 %8 ; [in] zmm or no_data; cipher text blocks 4-7 +%define %%D8_11 %9 ; [in] zmm or no_data; cipher text blocks 8-11 +%define %%D12_15 %10 ; [in] zmm or no_data; cipher text blocks 12-15 +%define %%NUMBL %11 ; [in] number of blocks; numerical value +%define %%NROUNDS %12 ; [in] number of rounds; numerical value + +;;; === first AES round +%if (%%ROUND < 1) + ;; round 0 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; ROUND 0 + +;;; === middle AES rounds +%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS) + ;; rounds 1 to 9/11/13 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdec, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; rounds 1 to 9/11/13 + +;;; === last AES round +%if (%%ROUND > %%NROUNDS) + ;; the last round - mix enclast with text xor's + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdeclast, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY + +;;; === XOR with data +%ifnidn %%D0_3, no_data +%ifnidn %%D4_7, no_data +%ifnidn %%D8_11, no_data +%ifnidn %%D12_15, no_data + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%D0_3, %%D4_7, %%D8_11, %%D12_15 +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data + +%endif ; The last round + +%endmacro + +%endif ;; _AES_COMMON_ASM diff --git a/src/spdk/intel-ipsec-mb/include/aesni_emu.h b/src/spdk/intel-ipsec-mb/include/aesni_emu.h new file mode 100644 index 000000000..575fada22 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/aesni_emu.h @@ -0,0 +1,120 @@ +/******************************************************************************* + Copyright (c) 2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef _AESNI_EMU_H_ +#define _AESNI_EMU_H_ +#include + +/* Interface to AESNI emulation routines */ + +/* XMM type definitions and constants */ + +#define MAX_BYTES_PER_XMM 16 +#define MAX_WORDS_PER_XMM 8 +#define MAX_DWORDS_PER_XMM 4 +#define MAX_QWORDS_PER_XMM 2 + +union xmm_reg { + uint8_t byte[MAX_BYTES_PER_XMM]; + uint16_t word[MAX_WORDS_PER_XMM]; + uint32_t dword[MAX_DWORDS_PER_XMM]; + uint64_t qword[MAX_QWORDS_PER_XMM]; +}; + +/* AESNI emulation API */ + +/** + * @brief AESKEYGENASIST instruction emulation function + * + * Assist in AES round key generation using an 8 bits Round Constant + * (RCON) specified in \a imm8, operating on 128 bits of data + * + * @param dst pointer to 128 bit buffer to store generated key + * @param src pointer to 128 bit src key + * @param imm8 round constant used to generate key + */ +IMB_DLL_LOCAL void emulate_AESKEYGENASSIST(union xmm_reg *dst, + const union xmm_reg *src, + const uint32_t imm8); + +/** + * @brief AESENC instruction emulation function + * + * Perform one round of an AES encryption flow + * + * @param dst pointer to 128 bit data (state) to operate on + * @param src pointer to 128 bit round key + */ +IMB_DLL_LOCAL void emulate_AESENC(union xmm_reg *dst, + const union xmm_reg *src); + +/** + * @brief AESENCLAST instruction emulation function + * + * Perform last round of an AES encryption flow + * + * @param dst pointer to 128 bit data (state) to operate on + * @param src pointer to 128 bit round key + */ +IMB_DLL_LOCAL void emulate_AESENCLAST(union xmm_reg *dst, + const union xmm_reg *src); + +/** + * @brief AESDEC instruction emulation function + * + * Perform one round of an AES decryption flow + * + * @param dst pointer to 128 bit data (state) to operate on + * @param src pointer to 128 bit round key + */ +IMB_DLL_LOCAL void emulate_AESDEC(union xmm_reg *dst, + const union xmm_reg *src); + +/** + * @brief AESDECLAST instruction emulation function + * + * Perform last round of an AES decryption flow + * + * @param dst pointer to 128 bit data (state) to operate on + * @param src pointer to 128 bit round key + */ +IMB_DLL_LOCAL void emulate_AESDECLAST(union xmm_reg *dst, + const union xmm_reg *src); + +/** + * @brief AESIMC instruction emulation function + * + * Perform the InvMixColumn transformation on + * a 128 bit round key + * + * @param dst pointer to 128 bit buffer to store result + * @param src pointer to 128 bit round key + */ +IMB_DLL_LOCAL void emulate_AESIMC(union xmm_reg *dst, + const 
union xmm_reg *src); + +#endif /* _AESNI_EMU_H_ */ diff --git a/src/spdk/intel-ipsec-mb/include/aesni_emu.inc b/src/spdk/intel-ipsec-mb/include/aesni_emu.inc new file mode 100644 index 000000000..5a40180c8 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/aesni_emu.inc @@ -0,0 +1,247 @@ +;; +;; Copyright (c) 2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef _AESNI_EMU_INC_ +%define _AESNI_EMU_INC_ + +%include "include/reg_sizes.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Utility macros and defines to assist AESNI translation macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GP0 rax +%define GP1 rbx +%define GP2 rcx +%define GP3 rdx +%define GP4 rbp +%define GP5 rsi +%define GP6 rdi +%define GP7 r8 +%define GP8 r9 +%define GP9 r10 +%define GP10 r11 +%define GP11 r12 +%define GP12 r13 +%define GP13 r14 +%define GP14 r15 +%define NUM_GP_REGS 15 +%define NUM_XMM_REGS 16 + +%define GP_SZ 8 +%define XMM_SZ 16 +%define ARG_SZ 16 + +;; 8 extra bytes added to align to 16 bytes +%define XMM_OFFSET ((NUM_GP_REGS + 1) * GP_SZ) +;; ARG1 placed in the stack after all GP and XMM registers +%define ARG1_OFFSET (XMM_OFFSET + (NUM_XMM_REGS * XMM_SZ)) +;; ARG2 placed in the stack after all GP and XMM registers and ARG1 +%define ARG2_OFFSET (ARG1_OFFSET + ARG_SZ) + +%define GP(x) GP %+ x +%define XMM(x) xmm %+ x + +;; Reserve enough stack space to store all GP and XMM +;; registers and emulation function arguments +;; e.g. 
void emulate_AESXXX(xmm_reg *dst, xmm_reg *src); +%define RES_STACK_SZ (ARG2_OFFSET + ARG_SZ) + +;; Allocate stack space and save GP registers +%macro SAVE_GP_REGS 0 + push rax + mov rax, rsp + sub rsp, RES_STACK_SZ + and rsp, -16 +%assign gp_regs_i 0 +%rep NUM_GP_REGS + mov [rsp + 8*gp_regs_i], GP(gp_regs_i) +%assign gp_regs_i gp_regs_i+1 +%endrep +%endmacro + +;; Restore GP registers and stack pointer +%macro RESTORE_GP_REGS 0 +%assign gp_regs_i 0 +%rep NUM_GP_REGS + mov GP(gp_regs_i), [rsp + 8*gp_regs_i] +%assign gp_regs_i gp_regs_i+1 +%endrep + mov rsp, rax + pop rax +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Generic macro to translate AESNI instructions to AESNI emulation functions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro EMULATE_AESNI 4 +%define %%func %1 +%define %%src_dst %2 +%define %%key %3 +%define %%imm %4 + +%ifdef LINUX +%define %%arg1 rdi +%define %%arg2 rsi +%define %%arg3 rdx +%else +%define %%arg1 rcx +%define %%arg2 rdx +%define %%arg3 r8 +%endif + +;; Check if key is reg or ptr +%assign IS_REG 0 +%assign x 0 +%rep NUM_XMM_REGS +%ifidni %%key, XMM(x) + %assign IS_REG 1 + %exitrep +%endif +%assign x x+1 +%endrep + ;; save GP registers to stack + SAVE_GP_REGS + + ;; move function args onto stack before function call + movdqa [rsp + ARG1_OFFSET], %%src_dst +%if IS_REG + movdqa [rsp + ARG2_OFFSET], %%key +%else + movdqu %%src_dst, %%key + movdqa [rsp + ARG2_OFFSET], %%src_dst +%endif + lea %%arg1, [rsp + ARG1_OFFSET] + lea %%arg2, [rsp + ARG2_OFFSET] + + ;; move 8 bit imm rcon for aeskeygenassist +%ifnum %%imm + mov BYTE(%%arg3), %%imm +%endif + +;; save XMM registers to stack, as some compilers may use them in "func" +%assign reg_idx 0 +%rep NUM_XMM_REGS + movdqa [rsp + XMM_OFFSET + (reg_idx * XMM_SZ)], XMM(reg_idx) +%assign reg_idx reg_idx + 1 +%endrep + +;; reserve space on stack for up to 4 arguments on the stack (windows only) +%ifndef LINUX + sub rsp, 32 +%endif + ;; call emulation function + call %%func +%ifndef LINUX + add rsp, 32 +%endif + +;; restore XMM registers from stack +%assign reg_idx 0 +%rep NUM_XMM_REGS + movdqa XMM(reg_idx), [rsp + XMM_OFFSET + (reg_idx * XMM_SZ)] +%assign reg_idx reg_idx + 1 +%endrep + + ;; Destination XMM gets overwritten with result from func + movdqa %%src_dst, [rsp + ARG1_OFFSET] + + RESTORE_GP_REGS +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Macros to translate AESNI instructions to AESNI emulation functions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; AESENC translation macro +%macro EMULATE_AESENC 2 +%define %%src_dst %1 +%define %%key %2 + EMULATE_AESNI emulate_AESENC, %%src_dst, %%key, "" +%endmacro + +;; AESENCLAST translation macro +%macro EMULATE_AESENCLAST 2 +%define %%src_dst %1 +%define %%key %2 + EMULATE_AESNI emulate_AESENCLAST, %%src_dst, %%key, "" +%endmacro + +;; AESDEC translation macro +%macro EMULATE_AESDEC 2 +%define %%src_dst %1 +%define %%key %2 + EMULATE_AESNI emulate_AESDEC, %%src_dst, %%key, "" +%endmacro + +;; AESDECLAST translation macro +%macro EMULATE_AESDECLAST 2 +%define %%src_dst %1 +%define %%key %2 + EMULATE_AESNI emulate_AESDECLAST, %%src_dst, %%key, "" +%endmacro + +;; AESIMC translation macro +%macro EMULATE_AESIMC 2 +%define %%src_dst %1 +%define %%key %2 + EMULATE_AESNI emulate_AESIMC, %%src_dst, %%key, "" +%endmacro + +;; AESKEYGENASSIST translation macro +%macro EMULATE_AESKEYGENASSIST 3 
+%define %%src_dst %1 +%define %%key %2 +%define %%imm %3 + EMULATE_AESNI emulate_AESKEYGENASSIST, %%src_dst, %%key, %%imm +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; AESNI defines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef NO_AESNI_RENAME +%define aesenc EMULATE_AESENC +%define aesenclast EMULATE_AESENCLAST +%define aesdec EMULATE_AESDEC +%define aesdeclast EMULATE_AESDECLAST +%define aesimc EMULATE_AESIMC +%define aeskeygenassist EMULATE_AESKEYGENASSIST +%endif + +extern emulate_AESENC +extern emulate_AESENCLAST +extern emulate_AESDEC +extern emulate_AESDECLAST +extern emulate_AESIMC +extern emulate_AESKEYGENASSIST + +%endif ; end ifndef _AESNI_EMU_INC_ diff --git a/src/spdk/intel-ipsec-mb/include/clear_regs.asm b/src/spdk/intel-ipsec-mb/include/clear_regs.asm new file mode 100644 index 000000000..6cb48c49e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/clear_regs.asm @@ -0,0 +1,196 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%ifndef _CLEAR_REGS_ASM_ +%define _CLEAR_REGS_ASM_ + +%include "include/os.asm" + +; +; This macro clears any GP registers passed +; +%macro clear_gps 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + xor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any XMM registers passed on SSE +; +%macro clear_xmms_sse 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + pxor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any XMM registers passed on AVX +; +%macro clear_xmms_avx 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + vpxor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any YMM registers passed +; +%macro clear_ymms 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + vpxor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any ZMM registers passed +; +%macro clear_zmms 1-32 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + vpxorq %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears all scratch GP registers +; for Windows or Linux +; +%macro clear_scratch_gps_asm 0 + clear_gps rax, rcx, rdx, r8, r9, r10, r11 +%ifdef LINUX + clear_gps rdi, rsi +%endif +%endmacro + +; +; This macro clears all scratch XMM registers on SSE +; +%macro clear_scratch_xmms_sse_asm 0 +%ifdef LINUX +%assign i 0 +%rep 16 + pxor xmm %+ i, xmm %+ i +%assign i (i+1) +%endrep +; On Windows, XMM0-XMM5 registers are scratch registers +%else +%assign i 0 +%rep 6 + pxor xmm %+ i, xmm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +; +; This macro clears all scratch XMM registers on AVX +; +%macro clear_scratch_xmms_avx_asm 0 +%ifdef LINUX + vzeroall +; On Windows, XMM0-XMM5 registers are scratch registers +%else +%assign i 0 +%rep 6 + vpxor xmm %+ i, xmm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +; +; This macro clears all scratch YMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15) +; +%macro clear_scratch_ymms_asm 0 +; On Linux, all YMM registers are scratch registers +%ifdef LINUX + vzeroall +; On Windows, YMM0-YMM5 registers are scratch registers. +; YMM6-YMM15 upper 128 bits are scratch registers too, but +; the lower 128 bits are to be restored after calling these function +; which clears the upper bits too. +%else +%assign i 0 +%rep 6 + vpxor ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +; +; This macro clears all scratch ZMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15). YMM registers are used +; on purpose, since XOR'ing YMM registers is faster +; than XOR'ing ZMM registers, and the operation clears +; also the upper 256 bits +; +%macro clear_scratch_zmms_asm 0 +; On Linux, all ZMM registers are scratch registers +%ifdef LINUX + vzeroall + ;; vzeroall only clears the first 16 ZMM registers +%assign i 16 +%rep 16 + vpxorq ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep +; On Windows, ZMM0-ZMM5 and ZMM16-ZMM31 registers are scratch registers. +; ZMM6-ZMM15 upper 384 bits are scratch registers too, but +; the lower 128 bits are to be restored after calling these function +; which clears the upper bits too. 
+%else +%assign i 0 +%rep 6 + vpxorq ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep + +%assign i 16 +%rep 16 + vpxorq ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +%endif ;; _CLEAR_REGS_ASM diff --git a/src/spdk/intel-ipsec-mb/include/clear_regs_mem.h b/src/spdk/intel-ipsec-mb/include/clear_regs_mem.h new file mode 100644 index 000000000..40f888ec4 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/clear_regs_mem.h @@ -0,0 +1,53 @@ +/******************************************************************************* + Copyright (c) 2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef CLEAR_REGS_H +#define CLEAR_REGS_H + +#define CLEAR_SCRATCH_GPS clear_scratch_gps + +void force_memset_zero(void *mem, const size_t size); + +static inline void +clear_mem(void *mem, const size_t size) +{ + force_memset_zero(mem, size); +} + +static inline void +clear_var(void *var, const size_t size) +{ + force_memset_zero(var, size); +} + +void clear_scratch_gps(void); +void clear_scratch_xmms_sse(void); +void clear_scratch_xmms_avx(void); +void clear_scratch_ymms(void); +void clear_scratch_zmms(void); + +#endif /* CLEAR_REGS_H */ diff --git a/src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm b/src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm new file mode 100644 index 000000000..4fd6f7edb --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm @@ -0,0 +1,124 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "include/clear_regs.asm" + +section .text +; +; This function clears all scratch GP registers +; +; void clear_scratch_gps(void) +MKGLOBAL(clear_scratch_gps,function,internal) +clear_scratch_gps: + + clear_scratch_gps_asm + + ret + +; +; This function clears all scratch XMM registers +; +; void clear_scratch_xmms_sse(void) +MKGLOBAL(clear_scratch_xmms_sse,function,internal) +clear_scratch_xmms_sse: + + clear_scratch_xmms_sse_asm + + ret + +; +; This function clears all scratch XMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15) +; +; void clear_scratch_xmms_avx(void) +MKGLOBAL(clear_scratch_xmms_avx,function,internal) +clear_scratch_xmms_avx: + + clear_scratch_xmms_avx_asm + + ret + +; +; This function clears all scratch YMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15) +; +; void clear_scratch_ymms(void) +MKGLOBAL(clear_scratch_ymms,function,internal) +clear_scratch_ymms: + + clear_scratch_ymms_asm + + ret + +; +; This function clears all scratch ZMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15). YMM registers are used +; on purpose, since XOR'ing YMM registers is faster +; than XOR'ing ZMM registers, and the operation clears +; also the upper 256 bits +; +; void clear_scratch_zmms(void) +MKGLOBAL(clear_scratch_zmms,function,internal) +clear_scratch_zmms: + + clear_scratch_zmms_asm + + ret + +; +; This function clears all memory passed +; +; void force_memset_zero(void *mem, const size_t size) +MKGLOBAL(force_memset_zero,function,internal) +force_memset_zero: + +%ifdef LINUX + mov rcx, rsi +%else + push rdi + mov rdi, rcx + mov rcx, rdx +%endif + xor eax, eax + cld + rep stosb + +%ifndef LINUX + pop rdi +%endif + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/include/const.inc b/src/spdk/intel-ipsec-mb/include/const.inc new file mode 100644 index 000000000..e77e80d2e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/const.inc @@ -0,0 +1,163 @@ +;; +;; Copyright (c) 2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef _CONST_INC_ +%define _CONST_INC_ + +;;; Tables used to insert word into a SIMD register +extern len_shift_tab +extern len_mask_tab +extern shift_tab_16 + +;;; Table to do 0x80 byte shift for padding prefix +extern padding_0x80_tab16 + +;;; Size of len_shift_tab defined in const.asm module +%define len_tab_diff 128 + +; PINSRW_COMMON insert word into 128 bit SIMD register +%macro PINSRW_COMMON 7 + +%define %%type %1 ; instruction type - sse or avx +%define %%dest %2 ; dest XMM reg to insert word +%define %%tmp_simd %3 ; XMM reg to clobber +%define %%tmp_gp %4 ; GP reg to clobber +%define %%idx %5 ; word index to insert value into XMM +%define %%val %6 ; word value to insert into idx +%define %%scale_idx %7 ; flag to set if index is to be scaled x16 + +%ifidn %%scale_idx, scale_x16 + shl %%idx, 4 ; scale idx up x16 +%endif +%ifnum %%val + ;; immediate value passed on + mov DWORD(%%tmp_gp), %%val +%ifidn %%type, sse + movd %%tmp_simd, DWORD(%%tmp_gp) +%else + vmovd %%tmp_simd, DWORD(%%tmp_gp) +%endif +%else + ;; register name passed on +%ifidn %%type, sse + movd %%tmp_simd, DWORD(%%val) +%else + vmovd %%tmp_simd, DWORD(%%val) +%endif +%endif + lea %%tmp_gp, [rel len_shift_tab] + ;; check type - SSE or AVX +%ifidn %%type, sse + pshufb %%tmp_simd, [%%tmp_gp + %%idx] + pand %%dest, [%%tmp_gp + len_tab_diff + %%idx] + por %%dest, %%tmp_simd +%else + vpshufb %%tmp_simd, [%%tmp_gp + %%idx] + vpand %%dest, [%%tmp_gp + len_tab_diff + %%idx] + vpor %%dest, %%tmp_simd +%endif +%ifidn %%scale_idx, scale_x16 + shr %%idx, 4 ; reset idx +%endif +%endmacro + +;;; Call SSE macro +%define XPINSRW PINSRW_COMMON sse, + +;;; Call AVX macro +%define XVPINSRW PINSRW_COMMON avx, + + +;;; VPINSRW_M256 insert word into 32 byte memory range +%macro VPINSRW_M256 8 + +%define %%mem_addr %1 ; 16 byte aligned memory address to insert word +%define %%tmp_simd1 %2 ; XMM reg to clobber +%define %%tmp_simd2 %3 ; XMM reg to clobber +%define %%tmp_gp %4 ; GP reg to clobber +%define %%offset %5 ; GP reg used to store offset +%define %%idx %6 ; word index to insert value +%define %%val %7 ; word value to insert into idx +%define %%scale_idx %8 ; flag to set if index is to be scaled x16 + + mov %%offset, %%idx + and %%offset, 0x8 ; set offset 0 or 8 + and %%idx, 0x7 ; remove offset from idx + vmovdqa %%tmp_simd1, [%%mem_addr 
+ %%offset*2] + XVPINSRW %%tmp_simd1, %%tmp_simd2, %%tmp_gp, %%idx, %%val, %%scale_idx + vmovdqa [%%mem_addr + %%offset*2], %%tmp_simd1 + or %%idx, %%offset ; reset offset +%endmacro + +;;; PSLB_COMMON shift bytes 128 bit SIMD register +%macro PSLB_COMMON 6 + +%define %%type %1 ; [in] instruction type - sse or avx +%define %%dir %2 ; [in] shift direction - left or right +%define %%reg %3 ; [in/out] XMM reg to shift bytes +%define %%num %4 ; [in] GP reg containing number of bytes to shift +%define %%shuf_tab %5 ; [out] XMM reg to store shuffle table +%define %%tmp_gp %6 ; [clobbered] GP reg to clobber + + ;; load shift table into %%shuf_tab + lea %%tmp_gp, [rel shift_tab_16 + 16] +%ifidn %%dir, left + sub %%tmp_gp, %%num +%else + add %%tmp_gp, %%num +%endif + +%ifidn %%type, sse + movdqu %%shuf_tab, [%%tmp_gp] + pshufb %%reg, %%shuf_tab +%else + vmovdqu %%shuf_tab, [%%tmp_gp] + vpshufb %%reg, %%shuf_tab +%endif +%endmacro + +;;; Call SSE left shift macro +%macro XPSLLB 4 + PSLB_COMMON sse, left, %1,%2,%3,%4 +%endm + +;;; Call SSE right shift macro +%macro XPSRLB 4 + PSLB_COMMON sse, right, %1,%2,%3,%4 +%endm + +;;; Call AVX left shift macro +%macro XVPSLLB 4 + PSLB_COMMON avx, left, %1,%2,%3,%4 +%endm + +;;; Call AVX right shift macro +%macro XVPSRLB 4 + PSLB_COMMON avx, right, %1,%2,%3,%4 +%endm + +%endif ; end ifndef _CONST_INC_ diff --git a/src/spdk/intel-ipsec-mb/include/constant_lookup.asm b/src/spdk/intel-ipsec-mb/include/constant_lookup.asm new file mode 100644 index 000000000..a3c81dc75 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/constant_lookup.asm @@ -0,0 +1,561 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "include/os.asm" +%include "include/reg_sizes.asm" + +section .data +default rel + +align 16 +idx_tab8: + db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + db 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + +align 16 +add_16: + db 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + db 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10 + +align 16 +idx_tab16: + dw 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + +align 16 +add_8: + dw 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8 + +align 16 +idx_tab32: + dd 0x0, 0x1, 0x2, 0x3 + +align 16 +add_4: + dd 0x4, 0x4, 0x4, 0x4 + +align 16 +idx_tab64: + dq 0x0, 0x1 + +add_2: + dq 0x2, 0x2 + +align 16 +bcast_mask: + db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01 + +section .text + +%ifdef LINUX + %define arg1 rdi + %define arg2 rsi + %define arg3 rdx +%else + %define arg1 rcx + %define arg2 rdx + %define arg3 r8 +%endif + +%define bcast_idx xmm0 +%define xadd xmm1 +%define accum_val xmm2 +%define xindices xmm3 +%define xtmp xmm4 +%define xtmp2 xmm5 +%define tmp r9 +%define offset r10 + +%define table arg1 +%define idx arg2 +%define size arg3 + +; uint8_t lookup_8bit_sse(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up (multiple of 16 bytes) +MKGLOBAL(lookup_8bit_sse,function,internal) +lookup_8bit_sse: + + ;; Number of loop iters = matrix size / 4 (number of values in XMM) + shr size, 4 + je exit8_sse + + xor offset, offset + + ;; Broadcast idx to look up + movd bcast_idx, DWORD(idx) + pxor xtmp, xtmp + pxor accum_val, accum_val + pshufb bcast_idx, xtmp + + movdqa xadd, [rel add_16] + movdqa xindices, [rel idx_tab8] + +loop8_sse: + movdqa xtmp, xindices + + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + pcmpeqb xtmp, bcast_idx + + ;; Load next 16 values + movdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + pand xtmp2, xtmp + + por accum_val, xtmp2 + + ;; Get next 16 indices + paddb xindices, xadd + + add offset, 16 + dec size + + jne loop8_sse + + ;; Extract value from XMM register + movdqa xtmp, accum_val + pslldq xtmp, 8 ; shift left by 64 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + pslldq xtmp, 4 ; shift left by 32 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + pslldq xtmp, 2 ; shift left by 16 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + pslldq xtmp, 1 ; shift left by 8 bits + por accum_val, xtmp + + pextrb rax, accum_val, 15 + +exit8_sse: + ret + +; uint8_t lookup_8bit_avx(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up (multiple of 16 bytes) +MKGLOBAL(lookup_8bit_avx,function,internal) +lookup_8bit_avx: + ;; Number of loop iters = matrix size / 4 (number of values in XMM) + shr size, 4 + je exit8_avx + + xor offset, offset + + ;; Broadcast idx to look up + vmovd bcast_idx, DWORD(idx) + vpxor xtmp, xtmp + vpxor accum_val, accum_val + vpshufb bcast_idx, xtmp + + vmovdqa xadd, [rel add_16] + vmovdqa xindices, [rel idx_tab8] + +loop8_avx: + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + vpcmpeqb xtmp, xindices, bcast_idx + + ;; Load next 16 values + vmovdqa xtmp2, [table + offset] + + ;; This generates data with all 0s 
except the value we are looking for in the index to look up + vpand xtmp2, xtmp + + vpor accum_val, xtmp2 + + ;; Get next 16 indices + vpaddb xindices, xadd + + add offset, 16 + dec size + + jne loop8_avx + + ;; Extract value from XMM register + vpslldq xtmp, accum_val, 8 ; shift left by 64 bits + vpor accum_val, xtmp + + vpslldq xtmp, accum_val, 4 ; shift left by 32 bits + vpor accum_val, xtmp + + vpslldq xtmp, accum_val, 2 ; shift left by 16 bits + vpor accum_val, xtmp + + vpslldq xtmp, accum_val, 1 ; shift left by 8 bits + vpor accum_val, xtmp + + vpextrb rax, accum_val, 15 + +exit8_avx: + + ret + +; uint8_t lookup_16bit_sse(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_16bit_sse,function,internal) +lookup_16bit_sse: + + ;; Number of loop iters = matrix size / 8 (number of values in XMM) + shr size, 3 + je exit16_sse + + xor offset, offset + + ;; Broadcast idx to look up + movd bcast_idx, DWORD(idx) + movdqa xtmp, [rel bcast_mask] + pxor accum_val, accum_val + pshufb bcast_idx, xtmp + + movdqa xadd, [rel add_8] + movdqa xindices, [rel idx_tab16] + +loop16_sse: + + movdqa xtmp, xindices + + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + pcmpeqw xtmp, bcast_idx + + ;; Load next 8 values + movdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + pand xtmp2, xtmp + + por accum_val, xtmp2 + + ;; Get next 8 indices + paddw xindices, xadd + add offset, 16 + dec size + + jne loop16_sse + + ;; Extract value from XMM register + movdqa xtmp, accum_val + pslldq xtmp, 8 ; shift left by 64 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + pslldq xtmp, 4 ; shift left by 32 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + pslldq xtmp, 2 ; shift left by 16 bits + por accum_val, xtmp + + pextrw rax, accum_val, 7 + +exit16_sse: + ret + +; uint8_t lookup_16bit_avx(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_16bit_avx,function,internal) +lookup_16bit_avx: + + ;; Number of loop iters = matrix size / 8 (number of values in XMM) + shr size, 3 + je exit16_avx + + xor offset, offset + + ;; Broadcast idx to look up + vmovd bcast_idx, DWORD(idx) + vmovdqa xtmp, [rel bcast_mask] + vpxor accum_val, accum_val + vpshufb bcast_idx, xtmp + + vmovdqa xadd, [rel add_8] + vmovdqa xindices, [rel idx_tab16] + +loop16_avx: + + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + vpcmpeqw xtmp, xindices, bcast_idx + + ;; Load next 16 values + vmovdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + vpand xtmp2, xtmp + + vpor accum_val, xtmp2 + + ;; Get next 8 indices + vpaddw xindices, xadd + add offset, 16 + dec size + + jne loop16_avx + + ;; Extract value from XMM register + vpslldq xtmp, accum_val, 8 ; shift left by 64 bits + vpor accum_val, xtmp + + vpslldq xtmp, accum_val, 4 ; shift left by 32 bits + vpor accum_val, xtmp + + vpslldq xtmp, accum_val, 2 ; shift left by 16 bits + vpor accum_val, xtmp + + vpextrw rax, accum_val, 7 + +exit16_avx: + ret + +; uint32_t lookup_32bit_sse(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 
: pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_32bit_sse,function,internal) +lookup_32bit_sse: + + ;; Number of loop iters = matrix size / 4 (number of values in XMM) + shr size, 2 + je exit32_sse + + xor offset, offset + + ;; Broadcast idx to look up + movd bcast_idx, DWORD(idx) + pxor accum_val, accum_val + pshufd bcast_idx, bcast_idx, 0 + + movdqa xadd, [rel add_4] + movdqa xindices, [rel idx_tab32] + +loop32_sse: + movdqa xtmp, xindices + + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + pcmpeqd xtmp, bcast_idx + + ;; Load next 4 values + movdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + pand xtmp2, xtmp + + por accum_val, xtmp2 + + ;; Get next 4 indices + paddd xindices, xadd + add offset, 16 + dec size + + jne loop32_sse + + ;; Extract value from XMM register + movdqa xtmp, accum_val + psrldq xtmp, 8 ; shift right by 64 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + psrldq xtmp, 4 ; shift right by 32 bits + por accum_val, xtmp + + movd eax, accum_val + +exit32_sse: + ret + + +; uint32_t lookup_32bit_avx(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_32bit_avx,function,internal) +lookup_32bit_avx: + ;; Number of loop iters = matrix size / 4 (number of values in XMM) + shr size, 2 + je exit32_avx + + xor offset, offset + + ;; Broadcast idx to look up + vmovd bcast_idx, DWORD(idx) + vpxor accum_val, accum_val + vpshufd bcast_idx, bcast_idx, 0 + + vmovdqa xadd, [rel add_4] + vmovdqa xindices, [rel idx_tab32] + +loop32_avx: + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + vpcmpeqd xtmp, xindices, bcast_idx + + ;; Load next 4 values + vmovdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + vpand xtmp2, xtmp + + vpor accum_val, xtmp2 + + ;; Get next 4 indices + vpaddd xindices, xadd + add offset, 16 + dec size + + jne loop32_avx + + ;; Extract value from XMM register + vpsrldq xtmp, accum_val, 8 ; shift right by 64 bits + vpor accum_val, xtmp + + vpsrldq xtmp, accum_val, 4 ; shift right by 32 bits + vpor accum_val, xtmp + + vmovd eax, accum_val + +exit32_avx: + ret + + +; uint64_t lookup_64bit_sse(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_64bit_sse,function,internal) +lookup_64bit_sse: + ;; Number of loop iters = matrix size / 2 (number of values in XMM) + shr size, 1 + je exit64_sse + + xor offset, offset + + ;; Broadcast idx to look up + movq bcast_idx, idx + pxor accum_val, accum_val + pinsrq bcast_idx, idx, 1 + + movdqa xadd, [rel add_2] + movdqa xindices, [rel idx_tab64] + +loop64_sse: + movdqa xtmp, xindices + + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + pcmpeqq xtmp, bcast_idx + + ;; Load next 2 values + movdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + pand xtmp2, xtmp + + por accum_val, xtmp2 + + ;; Get next 2 indices + paddq xindices, xadd + add offset, 16 + dec size + + 
jne loop64_sse + + ;; Extract value from XMM register + movdqa xtmp, accum_val + psrldq xtmp, 8 ; shift right by 64 bits + por accum_val, xtmp + + movq rax, accum_val + +exit64_sse: + ret + + +; uint64_t lookup_64bit_avx(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_64bit_avx,function,internal) +lookup_64bit_avx: + ;; Number of loop iters = matrix size / 2 (number of values in XMM) + shr size, 1 + je exit64_avx + + xor offset, offset + + vmovq bcast_idx, idx + vpxor accum_val, accum_val + vpinsrq bcast_idx, idx, 1 + + vmovdqa xadd, [rel add_2] + vmovdqa xindices, [rel idx_tab64] + +loop64_avx: + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + vpcmpeqq xtmp, xindices, bcast_idx + + ;; Load next 2 values + vmovdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + vpand xtmp2, xtmp + + vpor accum_val, xtmp2 + + ;; Get next 2 indices + vpaddq xindices, xadd + add offset, 16 + dec size + + jne loop64_avx + + ;; Extract value from XMM register + vpsrldq xtmp, accum_val, 8 ; shift right by 64 bits + vpor accum_val, xtmp + + vmovq rax, accum_val + +exit64_avx: + ret + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/include/constant_lookup.h b/src/spdk/intel-ipsec-mb/include/constant_lookup.h new file mode 100644 index 000000000..bd56a24d2 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/constant_lookup.h @@ -0,0 +1,173 @@ +/******************************************************************************* + Copyright (c) 2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#ifndef CONSTANT_LOOKUP_H +#define CONSTANT_LOOKUP_H + +#include "intel-ipsec-mb.h" + +#ifdef SAFE_LOOKUP +#define LOOKUP8_SSE(_table, _idx, _size) \ + lookup_8bit_sse(_table, _idx, _size) +#define LOOKUP8_AVX(_table, _idx, _size) \ + lookup_8bit_avx(_table, _idx, _size) +#define LOOKUP16_SSE(_table, _idx, _size) \ + lookup_16bit_sse(_table, _idx, _size) +#define LOOKUP16_AVX(_table, _idx, _size) \ + lookup_16bit_avx(_table, _idx, _size) +#define LOOKUP32_SSE(_table, _idx, _size) \ + lookup_32bit_sse(_table, _idx, _size) +#define LOOKUP32_AVX(_table, _idx, _size) \ + lookup_32bit_avx(_table, _idx, _size) +#define LOOKUP64_SSE(_table, _idx, _size) \ + lookup_64bit_sse(_table, _idx, _size) +#define LOOKUP64_AVX(_table, _idx, _size) \ + lookup_64bit_avx(_table, _idx, _size) +#else +#define LOOKUP8_SSE(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP8_AVX(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP16_SSE(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP16_AVX(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP32_SSE(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP32_AVX(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP64_SSE(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP64_AVX(_table, _idx, _size) \ + _table[_idx] +#endif + +/* + * @brief Constant time SSE lookup function on variable size table + * with 8-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 8 bit elements in the table (multiple of 16) + * + * @return value to lookup + */ +uint8_t +lookup_8bit_sse(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time AVX lookup function on variable size table + * with 8-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 8 bit elements in the table (multiple of 16) + * + * @return value to lookup + */ +uint8_t +lookup_8bit_avx(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time SSE lookup function on variable size table + * with 16-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 16 bit elements in the table (multiple of 8) + * + * @return value to lookup + */ +uint16_t +lookup_16bit_sse(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time AVX lookup function on variable size table + * with 16-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 16 bit elements in the table (multiple of 8) + * + * @return value to lookup + */ +uint16_t +lookup_16bit_avx(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time SSE lookup function on + * variable size table with 32-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 32 bit elements in the table (multiple of 4) + * + * @return value to lookup + */ +uint32_t +lookup_32bit_sse(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time AVX lookup function on + * variable size table with 32-bit values + * + * @param[in] table Pointer to the table to look up 
(16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 32 bit elements in the table (multiple of 4) + * + * @return value to lookup + */ +uint32_t +lookup_32bit_avx(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time SSE lookup function on + * variable size table with 64-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 64 bit elements in the table (multiple of 2) + * + * @return value to lookup + */ +uint64_t +lookup_64bit_sse(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time AVX lookup function on + * variable size table with 64-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 64 bit elements in the table (multiple of 2) + * + * @return value to lookup + */ +uint64_t +lookup_64bit_avx(const void *table, const uint32_t idx, const uint32_t size); + +#endif /* CONSTANT_LOOKUP_H */ diff --git a/src/spdk/intel-ipsec-mb/include/cpu_feature.h b/src/spdk/intel-ipsec-mb/include/cpu_feature.h new file mode 100644 index 000000000..1347094a7 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/cpu_feature.h @@ -0,0 +1,52 @@ +/******************************************************************************* + Copyright (c) 2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "intel-ipsec-mb.h" + +#ifndef CPU_FEATURE_H +#define CPU_FEATURE_H + +/** + * @brief Detects hardware features and returns their status + * + * @return Bitmask representing presence of CPU features/extensions, + * see intel-ipsec-mb.h IMB_FEATURE_xyz definitions for details. 
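 *
 * A minimal usage sketch, assuming IMB_FEATURE_AESNI (one of the
 * IMB_FEATURE_xyz bits mentioned above) and a hypothetical
 * use_aesni_paths() helper:
 *
 *     uint64_t feat = cpu_feature_detect();
 *
 *     if (feat & IMB_FEATURE_AESNI)
 *             use_aesni_paths(); // select AES-NI based code paths
 *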
+ */ +IMB_DLL_LOCAL uint64_t cpu_feature_detect(void); + +/** + * @brief Modifies CPU \a features mask based on requested \a flags + * + * @param flags bitmask describing CPU feature adjustments + * @param features bitmask describing present CPU features + * + * @return \a features with applied modifications on them via \a flags + */ +IMB_DLL_LOCAL uint64_t +cpu_feature_adjust(const uint64_t flags, uint64_t features); + +#endif /* CPU_FEATURE_H */ diff --git a/src/spdk/intel-ipsec-mb/include/datastruct.asm b/src/spdk/intel-ipsec-mb/include/datastruct.asm new file mode 100644 index 000000000..0ab1113ab --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/datastruct.asm @@ -0,0 +1,235 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; Macros for defining data structures + +; Usage example + +;START_FIELDS ; JOB_AES +;;; name size align +;FIELD _plaintext, 8, 8 ; pointer to plaintext +;FIELD _ciphertext, 8, 8 ; pointer to ciphertext +;FIELD _IV, 16, 8 ; IV +;FIELD _keys, 8, 8 ; pointer to keys +;FIELD _len, 4, 4 ; length in bytes +;FIELD _status, 4, 4 ; status enumeration +;FIELD _user_data, 8, 8 ; pointer to user data +;UNION _union, size1, align1, \ +; size2, align2, \ +; size3, align3, \ +; ... +;END_FIELDS +;%assign _JOB_AES_size _FIELD_OFFSET +;%assign _JOB_AES_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Alternate "struc-like" syntax: +; STRUCT job_aes2 +; RES_Q .plaintext, 1 +; RES_Q .ciphertext, 1 +; RES_DQ .IV, 1 +; RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN +; RES_U .union, size1, align1, \ +; size2, align2, \ +; ... +; ENDSTRUCT +; ; Following only needed if nesting +; %assign job_aes2_size _FIELD_OFFSET +; %assign job_aes2_align _STRUCT_ALIGN +; +; RES_* macros take a name, a count and an optional alignment. +; The count in in terms of the base size of the macro, and the +; default alignment is the base size. 
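; (For example, "RES_D .len, 4" reserves four dwords, i.e. 16 bytes, with the
; default 4-byte alignment.)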
+; The macros are: +; Macro Base size +; RES_B 1 +; RES_W 2 +; RES_D 4 +; RES_Q 8 +; RES_DQ 16 +; RES_Y 32 +; RES_Z 64 +; +; RES_U defines a union. It's arguments are a name and two or more +; pairs of "size, alignment" +; +; The two assigns are only needed if this structure is being nested +; within another. Even if the assigns are not done, one can still use +; STRUCT_NAME_size as the size of the structure. +; +; Note that for nesting, you still need to assign to STRUCT_NAME_size. +; +; The differences between this and using "struc" directly are that each +; type is implicitly aligned to its natural length (although this can be +; over-ridden with an explicit third parameter), and that the structure +; is padded at the end to its overall alignment. +; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _DATASTRUCT_ASM_ +%define _DATASTRUCT_ASM_ + +;; START_FIELDS +%macro START_FIELDS 0 +%assign _FIELD_OFFSET 0 +%assign _STRUCT_ALIGN 0 +%endm + +;; FIELD name size align +%macro FIELD 3 +%define %%name %1 +%define %%size %2 +%define %%align %3 + +%assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1)) +%%name equ _FIELD_OFFSET +%assign _FIELD_OFFSET _FIELD_OFFSET + (%%size) +%if (%%align > _STRUCT_ALIGN) +%assign _STRUCT_ALIGN %%align +%endif +%endm + +;; END_FIELDS +%macro END_FIELDS 0 +%assign _FIELD_OFFSET (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1)) +%endm + +%macro UNION 5-* +%if (0 == (%0 & 1)) + %error EVEN number of parameters to UNION Macro + %err +%endif +%rotate 1 + %assign _UNION_SIZE %1 + %assign _UNION_ALIGN %2 +%rep (%0 - 3)/2 + %rotate 2 + %if (%1 > _UNION_SIZE) + %assign _UNION_SIZE %1 + %endif + %if (%2 > _UNION_ALIGN) + %assign _UNION_ALIGN %2 + %endif +%endrep +%rotate 2 +FIELD %1, _UNION_SIZE, _UNION_ALIGN +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro STRUCT 1 +START_FIELDS +struc %1 +%endm + +%macro ENDSTRUCT 0 +%assign %%tmp _FIELD_OFFSET +END_FIELDS +%assign %%tmp (_FIELD_OFFSET - %%tmp) +%if (%%tmp > 0) + resb %%tmp +%endif +endstruc +%endm + +;; RES_int name size align +%macro RES_int 3 +%define %%name %1 +%define %%size %2 +%define %%align %3 + +%assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1)) +align %%align +%%name resb %%size +%assign _FIELD_OFFSET _FIELD_OFFSET + (%%size) +%if (%%align > _STRUCT_ALIGN) +%assign _STRUCT_ALIGN %%align +%endif +%endm + + + +; macro RES_B name, size [, align] +%macro RES_B 2-3 1 +RES_int %1, %2, %3 +%endm + +; macro RES_W name, size [, align] +%macro RES_W 2-3 2 +RES_int %1, 2*(%2), %3 +%endm + +; macro RES_D name, size [, align] +%macro RES_D 2-3 4 +RES_int %1, 4*(%2), %3 +%endm + +; macro RES_Q name, size [, align] +%macro RES_Q 2-3 8 +RES_int %1, 8*(%2), %3 +%endm + +; macro RES_DQ name, size [, align] +%macro RES_DQ 2-3 16 +RES_int %1, 16*(%2), %3 +%endm + +; macro RES_Y name, size [, align] +%macro RES_Y 2-3 32 +RES_int %1, 32*(%2), %3 +%endm + +; macro RES_Z name, size [, align] +%macro RES_Z 2-3 64 +RES_int %1, 64*(%2), %3 +%endm + + +%macro RES_U 5-* +%if (0 == (%0 & 1)) + %error EVEN number of parameters to RES_U Macro + %err +%endif +%rotate 1 + %assign _UNION_SIZE %1 + %assign _UNION_ALIGN %2 +%rep (%0 - 3)/2 + %rotate 2 + %if (%1 > _UNION_SIZE) + %assign _UNION_SIZE %1 + %endif + %if (%2 > _UNION_ALIGN) + %assign _UNION_ALIGN %2 + %endif +%endrep +%rotate 2 +RES_int %1, _UNION_SIZE, _UNION_ALIGN +%endm + +%endif ; end ifdef _DATASTRUCT_ASM_ diff --git 
a/src/spdk/intel-ipsec-mb/include/dbgprint.asm b/src/spdk/intel-ipsec-mb/include/dbgprint.asm new file mode 100644 index 000000000..d14eb0ebc --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/dbgprint.asm @@ -0,0 +1,413 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; Macros for "printing" for debug purposes from within asm code +; +; The basic macros are: +; DBGPRINT16, DBGPRINT32, DBGPRINT64, DBGPRINT_XMM, DBGPRINT_YMM, DBGPRINT_ZMM +; These are called with 1 or more arguments, all of which are of the +; size/type as specified in the name. E.g. +; DBGPRINT64 reg1, reg2, reg3, ... +; +; There is also a macro DEBUGPRINTL that takes one argument, a string. E.g. +; DBGPRINTL "hit this point in the code" +; +; There are also variations on these with the "DBGPRINT" suffixed with "L", e.g. +; DBGPRINTL64. These take two or more arguments, where the first is a string, +; and the rest are of the specified type, e.g. +; DBGPRINTL64 "Rindex", Rindex +; Essentially, this is the same as a DBGPRINTL followed by DBGPRINT64. +; +; If DO_DBGPRINT is defined, then the macros write the debug information into +; a buffer. If DO_DBGPRINT is *not* defined, then the macros expand to nothing. +; +; CAVEAT: The macros need a GPR. Currently, it uses R15. If the first register +; argument is R15, then it will use R14. This means that if you try +; DBGPRINTL64 "text", rax, r15 +; you will not get the proper value of r15. +; One way to avoid this issue is to not use multiple registers on the same line +; if the register types are GPR (i.e. this is not an issue for printing XMM +; registers). E.g the above could be done with: +; DBGPRINTL64 "test", rax +; DBGPRINT64 r15 +; +; Note also that the macros only check for r15. Thus is you tried something +; like (after token expansion): +; DBGPRINT32 r15d +; you won't get the right results. If you want to display r15d, you should +; print it as the 64-bit r15. +; +; To actually print the data, from your C code include the file +; "dbgprint.h". The default buffer size is 16kB. 
If you want to change +; that, #define DBG_BUFFER_SIZE before including "dbgprint.h". +; +; Then, (after your asm routine(s) have returned, call +; print_debug() or print_debug(file pointer) +; If you do not specify a file pointer, it defaults to stdout. +; +; Printing the debug data also resets the write pointer to the beginning, +; effectively "deleting" the previous messages. +; +%ifndef DBGPRINT_ASM_INCLUDED +%define DBGPRINT_ASM_INCLUDED + +;%define DO_DBGPRINT +%ifdef DO_DBGPRINT +extern pDebugBuffer +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; DBGPRINT_INT size, param, ... +%macro DBGPRINT_INT 2-* +%ifidni %2,r15 +%xdefine %%reg r14 +%else +%xdefine %%reg r15 +%endif +%xdefine %%size %1 +%rotate 1 + push %%reg + mov %%reg, [pDebugBuffer] +%rep %0 - 1 + mov byte [%%reg], %%size + %if (%%size == 2) + mov word [%%reg+1], %1 + %elif (%%size == 4) + mov dword [%%reg+1], %1 + %elif (%%size == 8) + mov qword [%%reg+1], %1 + %elif (%%size == 16) + movdqu oword [%%reg+1], %1 + %elif (%%size == 32) + vmovdqu [%%reg+1], %1 + %elif (%%size == 64) + vmovdqu32 [%%reg+1], %1 + %else + %error invalid size %%size + %endif + add %%reg, %%size+1 +%rotate 1 +%endrep + mov [pDebugBuffer], %%reg + pop %%reg +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; DBGPRINTL_INT size, label, param, ... +%macro DBGPRINTL_INT 3-* +%ifidni %3,r15 +%xdefine %%reg r14 +%else +%xdefine %%reg r15 +%endif +%xdefine %%size %1 +%rotate 1 + push %%reg + mov %%reg, [pDebugBuffer] + + mov byte [%%reg], 0x57 +section .data +%%lab: db %1, 0 +section .text + mov qword [%%reg+1], %%lab + add %%reg, 8+1 +%rotate 1 + +%rep %0 - 2 + mov byte [%%reg], %%size +%if (%%size == 2) + mov word [%%reg+1], %1 +%elif (%%size == 4) + mov dword [%%reg+1], %1 +%elif (%%size == 8) + mov qword [%%reg+1], %1 +%elif (%%size == 16) + movdqu oword [%%reg+1], %1 +%elif (%%size == 32) + vmovdqu [%%reg+1], %1 +%elif (%%size == 64) + vmovdqu32 [%%reg+1], %1 +%else +%error invalid size %%size +%endif + add %%reg, %%size+1 +%rotate 1 +%endrep + mov [pDebugBuffer], %%reg + pop %%reg +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; DBGPRINTL* data, ... +%macro DBGPRINT16 1+ + DBGPRINT_INT 2, %1 +%endmacro +%macro DBGPRINT32 1+ + DBGPRINT_INT 4, %1 +%endmacro +%macro DBGPRINT64 1+ + DBGPRINT_INT 8, %1 +%endmacro +%macro DBGPRINT_XMM 1+ + DBGPRINT_INT 16, %1 +%endmacro +%macro DBGPRINT_YMM 1+ + DBGPRINT_INT 32, %1 +%endmacro +%macro DBGPRINT_ZMM 1+ + DBGPRINT_INT 64, %1 +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; DBGPRINTL* label, data, ... 
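;; Each wrapper below forwards to the corresponding *_INT macro with the
;; element size in bytes, so a record in the debug buffer is a one-byte size
;; tag followed by the raw value; label records use the 0x57 tag followed by
;; a pointer to the string.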
+%macro DBGPRINTL16 2+ + DBGPRINTL_INT 2, %1, %2 +%endmacro +%macro DBGPRINTL32 2+ + DBGPRINTL_INT 4, %1, %2 +%endmacro +%macro DBGPRINTL64 2+ + DBGPRINTL_INT 8, %1, %2 +%endmacro +%macro DBGPRINTL_XMM 2+ + DBGPRINTL_INT 16, %1, %2 +%endmacro +%macro DBGPRINTL_YMM 2+ + DBGPRINTL_INT 32, %1, %2 +%endmacro +%macro DBGPRINTL_ZMM 2+ + DBGPRINTL_INT 64, %1, %2 +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINTL 1 + push r15 + mov r15, [pDebugBuffer] + + mov byte [r15], 0x57 +section .data +%%lab: db %1, 0 +section .text + mov qword [r15+1], %%lab + add r15, 8+1 + + mov [pDebugBuffer], r15 + pop r15 +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%else +%macro DBGPRINT16 1+ +%endmacro +%macro DBGPRINT32 1+ +%endmacro +%macro DBGPRINT64 1+ +%endmacro +%macro DBGPRINT_XMM 1+ +%endmacro +%macro DBGPRINT_YMM 1+ +%endmacro +%macro DBGPRINT_ZMM 1+ +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINTL16 2+ +%endmacro +%macro DBGPRINTL32 2+ +%endmacro +%macro DBGPRINTL64 2+ +%endmacro +%macro DBGPRINTL_XMM 2+ +%endmacro +%macro DBGPRINTL_YMM 2+ +%endmacro +%macro DBGPRINTL_ZMM 2+ +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINTL 1 +%endmacro +%endif + + + +%if 0 ; OLD +%macro DBGPRINTL_ZMM 2-* + push rax + mov rax, [pDebugBuffer] + + mov byte [rax], 0x57 +section .data +%%lab: db %1, 0 +section .text + mov qword [rax+1], %%lab + add rax, 8+1 +%rotate 1 + +%rep %0 - 1 + mov byte [rax], 64 + vmovdqu32 [rax+1], %1 +%rotate 1 + add rax, 64+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT_ZMM 1-* + push rax + mov rax, [pDebugBuffer] +%rep %0 + mov byte [rax], 64 + vmovdqu32 [rax+1], %1 +%rotate 1 + add rax, 64+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT_YMM 1-* + push rax + mov rax, [pDebugBuffer] +%rep %0 + mov byte [rax], 32 + vmovdqu [rax+1], %1 +%rotate 1 + add rax, 32+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT_XMM 1-* + push rax + mov rax, [pDebugBuffer] +%rep %0 + mov byte [rax], 16 + vmovdqu oword [rax+1], %1 +%rotate 1 + add rax, 16+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINTL64 2-* + push rax + mov rax, [pDebugBuffer] + + mov byte [rax], 0x57 +section .data +%%lab: db %1, 0 +section .text + mov qword [rax+1], %%lab + add rax, 8+1 +%rotate 1 + +%rep %0 - 1 + mov byte [rax], 8 + mov qword [rax+1], %1 +%rotate 1 + add rax, 8+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT64 1-* + push rax + mov rax, [pDebugBuffer] +%rep %0 + mov byte [rax], 8 + mov qword [rax+1], %1 +%rotate 1 + add rax, 8+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT32 1-* + push rax + mov rax, [pDebugBuffer] +%rep %0 + mov byte [rax], 4 + mov dword [rax+1], %1 +%rotate 1 + add rax, 4+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT16 1-* + push rax + mov rax, 
[pDebugBuffer] +%rep %0 + mov byte [rax], 2 + mov word [rax+1], %1 +%rotate 1 + add rax, 2+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT_LAB 1 + push rax + mov rax, [pDebugBuffer] + + mov byte [rax], 0x57 +section .data +%%lab: db %1, 0 +section .text + mov qword [rax+1], %%lab + add rax, 8+1 + + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGHIST 2 + inc dword [%1 + 4 * %2] +%endmacro +%macro DBGPRINT_ZMM 1-* +%endmacro +%macro DBGPRINT_YMM 1-* +%endmacro +%macro DBGPRINT_XMM 1-* +%endmacro +%macro DBGPRINT64 1-* +%endmacro +%macro DBGPRINT32 1-* +%endmacro +%macro DBGPRINT16 1-* +%endmacro +%macro DBGHIST 2 +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif ; ifdef 0 ; OLD + +%endif ; DBGPRINT_ASM_INCLUDED diff --git a/src/spdk/intel-ipsec-mb/include/des_utils.h b/src/spdk/intel-ipsec-mb/include/des_utils.h new file mode 100644 index 000000000..4358132d0 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/des_utils.h @@ -0,0 +1,134 @@ +/******************************************************************************* + Copyright (c) 2017-2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +/* DES utility functions and macros */ + +#ifndef DES_UTILS_H +#define DES_UTILS_H + +#include +#include "intel-ipsec-mb.h" + +/** + * @brief Gets selected bit value out of a 64-bit word + * + * @param val 64-bit word + * @param n bit number (0 to 63) to get value of + * + * @return n-th bit value (0 or 1 value only) + */ +__forceinline +uint64_t bit_get64b(const uint64_t val, const unsigned n) +{ + IMB_ASSERT(n < 64); + return (val >> n) & UINT64_C(1); +} + +/** + * @brief Sets selected bit in a 64-bit word + * + * @param val 64-bit word + * @param n bit number (0 to 63) to get value of + * @param b bit value (0 or 1) + * + * @return val with n-th bit set to value b + */ +__forceinline +uint64_t bit_set64b(const uint64_t val, const unsigned n, const uint64_t b) +{ + const uint64_t m = UINT64_C(1) << n; + + IMB_ASSERT(n < 64); + return (val & (~m)) | (b << n); +} + +/** + * @brief Permutes bits in a 64-bit word as described by pattern + * + * The function goes through pattern array from index 0 to 'size' (max 63). + * It sets output bit number 'index' to value of + * bit number 'pattern[index] - 1' from 'in'. + * + * @param in 64-bit word to be permuted + * @param pattern pointer to array defining the permutation + * @param size is size of the permutation pattern + * + * @return permuted in word as described by the pattern + */ +__forceinline +uint64_t permute_64b(const uint64_t in, const uint8_t *pattern, const int size) +{ + uint64_t out = 0; + int n = 0; + + IMB_ASSERT(size <= 64); + + for (n = 0; n < size; n++) { + /* '-1' is required as bit numbers in FIPS start with 1 not 0 */ + const int m = ((int) pattern[n]) - 1; + const uint64_t bit_val = bit_get64b(in, m); + + out = bit_set64b(out, n, bit_val); + } + + return out; +} + +static const uint8_t reflect_tab[16] = { + /* [ 0] 0000 => 0000 */ 0, /* [ 1] 0001 => 1000 */ 8, + /* [ 2] 0010 => 0100 */ 4, /* [ 3] 0011 => 1100 */ 12, + /* [ 4] 0100 => 0010 */ 2, /* [ 5] 0101 => 1010 */ 10, + /* [ 6] 0110 => 0110 */ 6, /* [ 7] 0111 => 1110 */ 14, + /* [ 8] 1000 => 0001 */ 1, /* [ 9] 1001 => 1001 */ 9, + /* [10] 1010 => 0101 */ 5, /* [11] 1011 => 1101 */ 13, + /* [12] 1100 => 0011 */ 3, /* [13] 1101 => 1011 */ 11, + /* [14] 1110 => 0111 */ 7, /* [15] 1111 => 1111 */ 15 +}; + +__forceinline +uint8_t reflect_8b(const uint8_t pb) +{ + return reflect_tab[pb >> 4] | (reflect_tab[pb & 15] << 4); +} + +__forceinline +uint64_t load64_reflect(const void *key) +{ + const uint8_t *kb = (const uint8_t *) key; + + return ((uint64_t) reflect_8b(kb[0])) | + ((uint64_t) reflect_8b(kb[1])) << 8 | + ((uint64_t) reflect_8b(kb[2])) << 16 | + ((uint64_t) reflect_8b(kb[3])) << 24 | + ((uint64_t) reflect_8b(kb[4])) << 32 | + ((uint64_t) reflect_8b(kb[5])) << 40 | + ((uint64_t) reflect_8b(kb[6])) << 48 | + ((uint64_t) reflect_8b(kb[7])) << 56; +} + + +#endif /* DES_UTILS_H */ diff --git a/src/spdk/intel-ipsec-mb/include/gcm.h b/src/spdk/intel-ipsec-mb/include/gcm.h new file mode 100644 index 000000000..bcc13cb3a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/gcm.h @@ -0,0 +1,428 @@ +/******************************************************************************* + Copyright (c) 2018-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following 
disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "intel-ipsec-mb.h" + +#ifndef NO_GCM + +#ifndef _GCM_H_ +#define _GCM_H_ + +/* + * AVX512+VAES+VPCLMULQDQ GCM API + * - intentionally this is not exposed in intel-ipsec-mb.h + * - available through IMB_GCM_xxx() macros from intel-ipsec-mb.h + */ +IMB_DLL_EXPORT void +aes_gcm_enc_128_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); + +IMB_DLL_EXPORT void +aes_gcm_init_128_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_192_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + 
const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_256_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_precomp_128_vaes_avx512(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_192_vaes_avx512(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_256_vaes_avx512(struct gcm_key_data *key_data); + +IMB_DLL_EXPORT void +aes_gcm_pre_128_vaes_avx512(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_192_vaes_avx512(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_256_vaes_avx512(const void *key, struct gcm_key_data *key_data); + +/* + * AVX512 GCM API + * - intentionally this is not exposed in intel-ipsec-mb.h + * - available through IMB_GCM_xxx() macros from intel-ipsec-mb.h + */ +IMB_DLL_EXPORT void +aes_gcm_enc_128_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_avx512(const struct gcm_key_data *key_data, + struct 
gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); + +IMB_DLL_EXPORT void +aes_gcm_init_128_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_192_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_256_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_finalize_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_finalize_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_finalize_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_finalize_avx512(const struct 
gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_finalize_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_finalize_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_precomp_128_avx512(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_192_avx512(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_256_avx512(struct gcm_key_data *key_data); + +IMB_DLL_EXPORT void +aes_gcm_pre_128_avx512(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_192_avx512(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_256_avx512(const void *key, struct gcm_key_data *key_data); + +/* + * AESNI emulation GCM API (based on SSE acrhitecture) + * - intentionally this is not exposed in intel-ipsec-mb.h + * - available through IMB_GCM_xxx() macros from intel-ipsec-mb.h + */ +IMB_DLL_EXPORT void +aes_gcm_enc_128_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len, uint8_t *auth_tag, + uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len, uint8_t *auth_tag, + uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len, uint8_t *auth_tag, + uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len, uint8_t *auth_tag, + uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len, uint8_t *auth_tag, + uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_init_128_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_192_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_256_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_update_sse_no_aesni(const struct gcm_key_data 
*key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_update_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_update_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_update_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_update_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_update_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_precomp_128_sse_no_aesni(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_192_sse_no_aesni(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_256_sse_no_aesni(struct gcm_key_data *key_data); + +IMB_DLL_EXPORT void +aes_gcm_pre_128_sse_no_aesni(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_192_sse_no_aesni(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_256_sse_no_aesni(const void *key, struct gcm_key_data *key_data); + +#endif /* _GCM_H_ */ +#endif /* NO_GCM */ diff --git a/src/spdk/intel-ipsec-mb/include/gcm_defines.asm b/src/spdk/intel-ipsec-mb/include/gcm_defines.asm new file mode 100644 index 000000000..31a961729 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/gcm_defines.asm @@ -0,0 +1,272 @@ +;; +;; Copyright (c) 2012-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef GCM_DEFINES_ASM_INCLUDED +%define GCM_DEFINES_ASM_INCLUDED + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford + +section .data +default rel + +align 16 +POLY: dq 0x0000000000000001, 0xC200000000000000 + +align 64 +POLY2: + dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 + +align 16 +TWOONE: dq 0x0000000000000001, 0x0000000100000000 + +;;; @note Order of these constants should not change. +;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F +align 64 +SHUF_MASK: + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + +align 16 +SHIFT_MASK: + dq 0x0706050403020100, 0x0f0e0d0c0b0a0908 + +ALL_F: + dq 0xffffffffffffffff, 0xffffffffffffffff + +ZERO: + dq 0x0000000000000000, 0x0000000000000000 + +align 16 +ONE: + dq 0x0000000000000001, 0x0000000000000000 + +align 16 +TWO: + dq 0x0000000000000002, 0x0000000000000000 + +align 16 +ONEf: + dq 0x0000000000000000, 0x0100000000000000 + +align 16 +TWOf: + dq 0x0000000000000000, 0x0200000000000000 + +align 64 +ddq_add_1234: + dq 0x0000000000000001, 0x0000000000000000 + dq 0x0000000000000002, 0x0000000000000000 + dq 0x0000000000000003, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + +align 64 +ddq_add_5678: + dq 0x0000000000000005, 0x0000000000000000 + dq 0x0000000000000006, 0x0000000000000000 + dq 0x0000000000000007, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + +align 64 +ddq_add_4444: + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + +align 64 +ddq_add_8888: + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + +align 64 +ddq_addbe_1234: + dq 0x0000000000000000, 0x0100000000000000 + dq 0x0000000000000000, 0x0200000000000000 + dq 0x0000000000000000, 0x0300000000000000 + dq 0x0000000000000000, 0x0400000000000000 + +align 64 +ddq_addbe_5678: + dq 0x0000000000000000, 
0x0500000000000000 + dq 0x0000000000000000, 0x0600000000000000 + dq 0x0000000000000000, 0x0700000000000000 + dq 0x0000000000000000, 0x0800000000000000 + +align 64 +ddq_addbe_4444: + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + +align 64 +ddq_addbe_8888: + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + +align 64 +byte_len_to_mask_table: + dw 0x0000, 0x0001, 0x0003, 0x0007, + dw 0x000f, 0x001f, 0x003f, 0x007f, + dw 0x00ff, 0x01ff, 0x03ff, 0x07ff, + dw 0x0fff, 0x1fff, 0x3fff, 0x7fff, + dw 0xffff + +align 64 +byte64_len_to_mask_table: + dq 0x0000000000000000, 0x0000000000000001 + dq 0x0000000000000003, 0x0000000000000007 + dq 0x000000000000000f, 0x000000000000001f + dq 0x000000000000003f, 0x000000000000007f + dq 0x00000000000000ff, 0x00000000000001ff + dq 0x00000000000003ff, 0x00000000000007ff + dq 0x0000000000000fff, 0x0000000000001fff + dq 0x0000000000003fff, 0x0000000000007fff + dq 0x000000000000ffff, 0x000000000001ffff + dq 0x000000000003ffff, 0x000000000007ffff + dq 0x00000000000fffff, 0x00000000001fffff + dq 0x00000000003fffff, 0x00000000007fffff + dq 0x0000000000ffffff, 0x0000000001ffffff + dq 0x0000000003ffffff, 0x0000000007ffffff + dq 0x000000000fffffff, 0x000000001fffffff + dq 0x000000003fffffff, 0x000000007fffffff + dq 0x00000000ffffffff, 0x00000001ffffffff + dq 0x00000003ffffffff, 0x00000007ffffffff + dq 0x0000000fffffffff, 0x0000001fffffffff + dq 0x0000003fffffffff, 0x0000007fffffffff + dq 0x000000ffffffffff, 0x000001ffffffffff + dq 0x000003ffffffffff, 0x000007ffffffffff + dq 0x00000fffffffffff, 0x00001fffffffffff + dq 0x00003fffffffffff, 0x00007fffffffffff + dq 0x0000ffffffffffff, 0x0001ffffffffffff + dq 0x0003ffffffffffff, 0x0007ffffffffffff + dq 0x000fffffffffffff, 0x001fffffffffffff + dq 0x003fffffffffffff, 0x007fffffffffffff + dq 0x00ffffffffffffff, 0x01ffffffffffffff + dq 0x03ffffffffffffff, 0x07ffffffffffffff + dq 0x0fffffffffffffff, 0x1fffffffffffffff + dq 0x3fffffffffffffff, 0x7fffffffffffffff + dq 0xffffffffffffffff + +align 64 +mask_out_top_block: + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0x0000000000000000, 0x0000000000000000 + +section .text + +;;define the fields of gcm_context_data struct +;; struct gcm_context_data { +;; // init, update and finalize context data +;; uint8_t aad_hash[GCM_BLOCK_LEN]; +;; uint64_t aad_length; +;; uint64_t in_length; +;; uint8_t partial_block_enc_key[GCM_BLOCK_LEN]; +;; uint8_t orig_IV[GCM_BLOCK_LEN]; +;; uint8_t current_counter[GCM_BLOCK_LEN]; +;; uint64_t partial_block_length; +;; }; + +%define AadHash (16*0) ; store current Hash of data which has been input +%define AadLen (16*1) ; store length of input data which will not be encrypted or decrypted +%define InLen ((16*1)+8); store length of input data which will be encrypted or decrypted +%define PBlockEncKey (16*2) ; encryption key for the partial block at the end of the previous update +%define OrigIV (16*3) ; input IV +%define CurCount (16*4) ; Current counter for generation of encryption key +%define PBlockLen (16*5) ; length of partial block at the end of the previous update + +%define reg(q) xmm %+ q +%define regy(q) ymm %+ q +%define regz(q) zmm %+ q + +%ifdef WIN_ABI + %xdefine arg1 rcx + %xdefine arg2 rdx + %xdefine arg3 r8 + 
%xdefine arg4 r9 + %xdefine arg5 qword [r14 + STACK_OFFSET + 8*5] + %xdefine arg6 qword [r14 + STACK_OFFSET + 8*6] + %xdefine arg7 qword [r14 + STACK_OFFSET + 8*7] + %xdefine arg8 qword [r14 + STACK_OFFSET + 8*8] + %xdefine arg9 qword [r14 + STACK_OFFSET + 8*9] + %xdefine arg10 qword [r14 + STACK_OFFSET + 8*10] +%else + %xdefine arg1 rdi + %xdefine arg2 rsi + %xdefine arg3 rdx + %xdefine arg4 rcx + %xdefine arg5 r8 + %xdefine arg6 r9 + %xdefine arg7 qword [r14 + STACK_OFFSET + 8*1] + %xdefine arg8 qword [r14 + STACK_OFFSET + 8*2] + %xdefine arg9 qword [r14 + STACK_OFFSET + 8*3] + %xdefine arg10 qword [r14 + STACK_OFFSET + 8*4] +%endif + +%ifdef NT_LDST + %define NT_LD + %define NT_ST +%endif + +;;; Use Non-temporal load/stor +%ifdef NT_LD + %define XLDR movntdqa + %define VXLDR vmovntdqa + %define VX512LDR vmovntdqa +%else + %define XLDR movdqu + %define VXLDR vmovdqu + %define VX512LDR vmovdqu8 +%endif + +;;; Use Non-temporal load/stor +%ifdef NT_ST + %define XSTR movntdq + %define VXSTR vmovntdq + %define VX512STR vmovntdq +%else + %define XSTR movdqu + %define VXSTR vmovdqu + %define VX512STR vmovdqu8 +%endif + +%endif ; GCM_DEFINES_ASM_INCLUDED diff --git a/src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm b/src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm new file mode 100644 index 000000000..d812e53bd --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm @@ -0,0 +1,52 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
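+;;
+;; Illustrative sketch, not from the original files: the gcm.h entry points
+;; declared earlier in this patch are chained as precompute -> init ->
+;; update (repeatable) -> finalize. A minimal C example for AES-GCM-128;
+;; the aes_gcm_init_128_sse_no_aesni() prototype and the IV/AAD arguments
+;; are assumed from the rest of gcm.h and are not shown in this hunk:
+;;
+;;     struct gcm_key_data key_data;
+;;     struct gcm_context_data ctx;
+;;     uint8_t tag[16];
+;;
+;;     aes_gcm_pre_128_sse_no_aesni(aes_key, &key_data);       /* expand key, derive hash keys */
+;;     aes_gcm_init_128_sse_no_aesni(&key_data, &ctx, iv, aad, aad_len);  /* assumed prototype */
+;;     aes_gcm_enc_128_update_sse_no_aesni(&key_data, &ctx, out, in, len);
+;;     aes_gcm_enc_128_finalize_sse_no_aesni(&key_data, &ctx, tag, sizeof(tag));
+;;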
+;; + +%ifndef GCM_KEYS_AVX2_AVX512_INCLUDED +%define GCM_KEYS_AVX2_AVX512_INCLUDED + +;; Define the fields of gcm_key_data struct: +;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; +;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly +;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly +;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly +;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly +;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly +;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly +;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly +;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly + +%define HashKey_8 (16*15) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*16) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*17) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*18) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*19) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*20) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*21) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*22) ; HashKey <<1 mod poly +%define HashKey (16*22) ; HashKey <<1 mod poly + +%endif ; GCM_KEYS_AVX2_AVX512_INCLUDED diff --git a/src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm b/src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm new file mode 100644 index 000000000..f7531e5a7 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm @@ -0,0 +1,73 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
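+;;
+;; Illustrative note on the layout defined in this header and the previous one
+;; (a sketch, not upstream code): the GHASH keys occupy fixed 16-byte slots of
+;; gcm_key_data immediately after the expanded AES round keys (16*15 bytes), so
+;; for the 8-key layouts the byte offsets follow a simple pattern. Hypothetical
+;; C helpers expressing it:
+;;
+;;     #define HASHKEY_OFFSET(n)   (16 * (23 - (n)))  /* HashKey_8 -> 16*15 ... HashKey_1 -> 16*22 */
+;;     #define HASHKEY_K_OFFSET(n) (16 * (22 + (n)))  /* Karatsuba keys below: HashKey_1_k -> 16*23 */
+;;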
+;; + +%ifndef GCM_KEYS_SSE_AVX_INCLUDED +%define GCM_KEYS_SSE_AVX_INCLUDED + +;; Define the fields of gcm_key_data struct: +;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; +;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly +;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly +;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly +;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly +;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly +;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly +;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly +;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly +;; uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^2 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^3 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^4 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^5 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^6 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^7 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^8 <<1 mod poly (Karatsuba) + +;; +;; Key structure holds up to 8 ghash keys +;; +%define HashKey_8 (16*15) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*16) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*17) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*18) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*19) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*20) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*21) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*22) ; HashKey <<1 mod poly +%define HashKey (16*22) ; HashKey <<1 mod poly +;; ghash keys for Karatsuba multiply +%define HashKey_k (16*23) ; XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly +%define HashKey_1_k (16*23) ; XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly +%define HashKey_2_k (16*24) ; XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly +%define HashKey_3_k (16*25) ; XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly +%define HashKey_4_k (16*26) ; XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly +%define HashKey_5_k (16*27) ; XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly +%define HashKey_6_k (16*28) ; XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly +%define HashKey_7_k (16*29) ; XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly +%define HashKey_8_k (16*30) ; XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly + +%endif ; GCM_KEYS_SSE_AVX_INCLUDED diff --git a/src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm new file mode 100644 index 000000000..4aea2f5c9 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm @@ -0,0 +1,231 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following 
conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef GCM_KEYS_VAES_AVX512_INCLUDED +%define GCM_KEYS_VAES_AVX512_INCLUDED + +;; Define the fields of gcm_key_data struct: +;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; +;; uint8_t shifted_hkey_9_128[GCM_ENC_KEY_LEN * (128 - 8)]; +;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly +;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly +;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly +;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly +;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly +;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly +;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly +;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly + +%ifdef GCM_BIG_DATA +;; +;; Key structure holds up to 128 ghash keys +;; +%define HashKey_128 (16*15) ; HashKey^128 <<1 mod poly +%define HashKey_127 (16*16) ; HashKey^127 <<1 mod poly +%define HashKey_126 (16*17) ; HashKey^126 <<1 mod poly +%define HashKey_125 (16*18) ; HashKey^125 <<1 mod poly +%define HashKey_124 (16*19) ; HashKey^124 <<1 mod poly +%define HashKey_123 (16*20) ; HashKey^123 <<1 mod poly +%define HashKey_122 (16*21) ; HashKey^122 <<1 mod poly +%define HashKey_121 (16*22) ; HashKey^121 <<1 mod poly +%define HashKey_120 (16*23) ; HashKey^120 <<1 mod poly +%define HashKey_119 (16*24) ; HashKey^119 <<1 mod poly +%define HashKey_118 (16*25) ; HashKey^118 <<1 mod poly +%define HashKey_117 (16*26) ; HashKey^117 <<1 mod poly +%define HashKey_116 (16*27) ; HashKey^116 <<1 mod poly +%define HashKey_115 (16*28) ; HashKey^115 <<1 mod poly +%define HashKey_114 (16*29) ; HashKey^114 <<1 mod poly +%define HashKey_113 (16*30) ; HashKey^113 <<1 mod poly +%define HashKey_112 (16*31) ; HashKey^112 <<1 mod poly +%define HashKey_111 (16*32) ; HashKey^111 <<1 mod poly +%define HashKey_110 (16*33) ; HashKey^110 <<1 mod poly +%define HashKey_109 (16*34) ; HashKey^109 <<1 mod poly +%define HashKey_108 (16*35) ; HashKey^108 <<1 mod poly +%define HashKey_107 (16*36) ; HashKey^107 <<1 mod poly +%define HashKey_106 (16*37) ; HashKey^106 <<1 mod 
poly +%define HashKey_105 (16*38) ; HashKey^105 <<1 mod poly +%define HashKey_104 (16*39) ; HashKey^104 <<1 mod poly +%define HashKey_103 (16*40) ; HashKey^103 <<1 mod poly +%define HashKey_102 (16*41) ; HashKey^102 <<1 mod poly +%define HashKey_101 (16*42) ; HashKey^101 <<1 mod poly +%define HashKey_100 (16*43) ; HashKey^100 <<1 mod poly +%define HashKey_99 (16*44) ; HashKey^99 <<1 mod poly +%define HashKey_98 (16*45) ; HashKey^98 <<1 mod poly +%define HashKey_97 (16*46) ; HashKey^97 <<1 mod poly +%define HashKey_96 (16*47) ; HashKey^96 <<1 mod poly +%define HashKey_95 (16*48) ; HashKey^95 <<1 mod poly +%define HashKey_94 (16*49) ; HashKey^94 <<1 mod poly +%define HashKey_93 (16*50) ; HashKey^93 <<1 mod poly +%define HashKey_92 (16*51) ; HashKey^92 <<1 mod poly +%define HashKey_91 (16*52) ; HashKey^91 <<1 mod poly +%define HashKey_90 (16*53) ; HashKey^90 <<1 mod poly +%define HashKey_89 (16*54) ; HashKey^89 <<1 mod poly +%define HashKey_88 (16*55) ; HashKey^88 <<1 mod poly +%define HashKey_87 (16*56) ; HashKey^87 <<1 mod poly +%define HashKey_86 (16*57) ; HashKey^86 <<1 mod poly +%define HashKey_85 (16*58) ; HashKey^85 <<1 mod poly +%define HashKey_84 (16*59) ; HashKey^84 <<1 mod poly +%define HashKey_83 (16*60) ; HashKey^83 <<1 mod poly +%define HashKey_82 (16*61) ; HashKey^82 <<1 mod poly +%define HashKey_81 (16*62) ; HashKey^81 <<1 mod poly +%define HashKey_80 (16*63) ; HashKey^80 <<1 mod poly +%define HashKey_79 (16*64) ; HashKey^79 <<1 mod poly +%define HashKey_78 (16*65) ; HashKey^78 <<1 mod poly +%define HashKey_77 (16*66) ; HashKey^77 <<1 mod poly +%define HashKey_76 (16*67) ; HashKey^76 <<1 mod poly +%define HashKey_75 (16*68) ; HashKey^75 <<1 mod poly +%define HashKey_74 (16*69) ; HashKey^74 <<1 mod poly +%define HashKey_73 (16*70) ; HashKey^73 <<1 mod poly +%define HashKey_72 (16*71) ; HashKey^72 <<1 mod poly +%define HashKey_71 (16*72) ; HashKey^71 <<1 mod poly +%define HashKey_70 (16*73) ; HashKey^70 <<1 mod poly +%define HashKey_69 (16*74) ; HashKey^69 <<1 mod poly +%define HashKey_68 (16*75) ; HashKey^68 <<1 mod poly +%define HashKey_67 (16*76) ; HashKey^67 <<1 mod poly +%define HashKey_66 (16*77) ; HashKey^66 <<1 mod poly +%define HashKey_65 (16*78) ; HashKey^65 <<1 mod poly +%define HashKey_64 (16*79) ; HashKey^64 <<1 mod poly +%define HashKey_63 (16*80) ; HashKey^63 <<1 mod poly +%define HashKey_62 (16*81) ; HashKey^62 <<1 mod poly +%define HashKey_61 (16*82) ; HashKey^61 <<1 mod poly +%define HashKey_60 (16*83) ; HashKey^60 <<1 mod poly +%define HashKey_59 (16*84) ; HashKey^59 <<1 mod poly +%define HashKey_58 (16*85) ; HashKey^58 <<1 mod poly +%define HashKey_57 (16*86) ; HashKey^57 <<1 mod poly +%define HashKey_56 (16*87) ; HashKey^56 <<1 mod poly +%define HashKey_55 (16*88) ; HashKey^55 <<1 mod poly +%define HashKey_54 (16*89) ; HashKey^54 <<1 mod poly +%define HashKey_53 (16*90) ; HashKey^53 <<1 mod poly +%define HashKey_52 (16*91) ; HashKey^52 <<1 mod poly +%define HashKey_51 (16*92) ; HashKey^51 <<1 mod poly +%define HashKey_50 (16*93) ; HashKey^50 <<1 mod poly +%define HashKey_49 (16*94) ; HashKey^49 <<1 mod poly +%define HashKey_48 (16*95) ; HashKey^48 <<1 mod poly +%define HashKey_47 (16*96) ; HashKey^47 <<1 mod poly +%define HashKey_46 (16*97) ; HashKey^46 <<1 mod poly +%define HashKey_45 (16*98) ; HashKey^45 <<1 mod poly +%define HashKey_44 (16*99) ; HashKey^44 <<1 mod poly +%define HashKey_43 (16*100) ; HashKey^43 <<1 mod poly +%define HashKey_42 (16*101) ; HashKey^42 <<1 mod poly +%define HashKey_41 (16*102) ; HashKey^41 <<1 mod poly +%define HashKey_40 
(16*103) ; HashKey^40 <<1 mod poly +%define HashKey_39 (16*104) ; HashKey^39 <<1 mod poly +%define HashKey_38 (16*105) ; HashKey^38 <<1 mod poly +%define HashKey_37 (16*106) ; HashKey^37 <<1 mod poly +%define HashKey_36 (16*107) ; HashKey^36 <<1 mod poly +%define HashKey_35 (16*108) ; HashKey^35 <<1 mod poly +%define HashKey_34 (16*109) ; HashKey^34 <<1 mod poly +%define HashKey_33 (16*110) ; HashKey^33 <<1 mod poly +%define HashKey_32 (16*111) ; HashKey^32 <<1 mod poly +%define HashKey_31 (16*112) ; HashKey^31 <<1 mod poly +%define HashKey_30 (16*113) ; HashKey^30 <<1 mod poly +%define HashKey_29 (16*114) ; HashKey^29 <<1 mod poly +%define HashKey_28 (16*115) ; HashKey^28 <<1 mod poly +%define HashKey_27 (16*116) ; HashKey^27 <<1 mod poly +%define HashKey_26 (16*117) ; HashKey^26 <<1 mod poly +%define HashKey_25 (16*118) ; HashKey^25 <<1 mod poly +%define HashKey_24 (16*119) ; HashKey^24 <<1 mod poly +%define HashKey_23 (16*120) ; HashKey^23 <<1 mod poly +%define HashKey_22 (16*121) ; HashKey^22 <<1 mod poly +%define HashKey_21 (16*122) ; HashKey^21 <<1 mod poly +%define HashKey_20 (16*123) ; HashKey^20 <<1 mod poly +%define HashKey_19 (16*124) ; HashKey^19 <<1 mod poly +%define HashKey_18 (16*125) ; HashKey^18 <<1 mod poly +%define HashKey_17 (16*126) ; HashKey^17 <<1 mod poly +%define HashKey_16 (16*127) ; HashKey^16 <<1 mod poly +%define HashKey_15 (16*128) ; HashKey^15 <<1 mod poly +%define HashKey_14 (16*129) ; HashKey^14 <<1 mod poly +%define HashKey_13 (16*130) ; HashKey^13 <<1 mod poly +%define HashKey_12 (16*131) ; HashKey^12 <<1 mod poly +%define HashKey_11 (16*132) ; HashKey^11 <<1 mod poly +%define HashKey_10 (16*133) ; HashKey^10 <<1 mod poly +%define HashKey_9 (16*134) ; HashKey^9 <<1 mod poly +%define HashKey_8 (16*135) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*136) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*137) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*138) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*139) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*140) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*141) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*142) ; HashKey <<1 mod poly +%define HashKey (16*142) ; HashKey <<1 mod poly +%else +;; +;; Key structure holds up to 48 ghash keys +;; +%define HashKey_48 (16*15) ; HashKey^48 <<1 mod poly +%define HashKey_47 (16*16) ; HashKey^47 <<1 mod poly +%define HashKey_46 (16*17) ; HashKey^46 <<1 mod poly +%define HashKey_45 (16*18) ; HashKey^45 <<1 mod poly +%define HashKey_44 (16*19) ; HashKey^44 <<1 mod poly +%define HashKey_43 (16*20) ; HashKey^43 <<1 mod poly +%define HashKey_42 (16*21) ; HashKey^42 <<1 mod poly +%define HashKey_41 (16*22) ; HashKey^41 <<1 mod poly +%define HashKey_40 (16*23) ; HashKey^40 <<1 mod poly +%define HashKey_39 (16*24) ; HashKey^39 <<1 mod poly +%define HashKey_38 (16*25) ; HashKey^38 <<1 mod poly +%define HashKey_37 (16*26) ; HashKey^37 <<1 mod poly +%define HashKey_36 (16*27) ; HashKey^36 <<1 mod poly +%define HashKey_35 (16*28) ; HashKey^35 <<1 mod poly +%define HashKey_34 (16*29) ; HashKey^34 <<1 mod poly +%define HashKey_33 (16*30) ; HashKey^33 <<1 mod poly +%define HashKey_32 (16*31) ; HashKey^32 <<1 mod poly +%define HashKey_31 (16*32) ; HashKey^31 <<1 mod poly +%define HashKey_30 (16*33) ; HashKey^30 <<1 mod poly +%define HashKey_29 (16*34) ; HashKey^29 <<1 mod poly +%define HashKey_28 (16*35) ; HashKey^28 <<1 mod poly +%define HashKey_27 (16*36) ; HashKey^27 <<1 mod poly +%define HashKey_26 (16*37) ; HashKey^26 <<1 mod poly +%define HashKey_25 (16*38) ; HashKey^25 <<1 
mod poly +%define HashKey_24 (16*39) ; HashKey^24 <<1 mod poly +%define HashKey_23 (16*40) ; HashKey^23 <<1 mod poly +%define HashKey_22 (16*41) ; HashKey^22 <<1 mod poly +%define HashKey_21 (16*42) ; HashKey^21 <<1 mod poly +%define HashKey_20 (16*43) ; HashKey^20 <<1 mod poly +%define HashKey_19 (16*44) ; HashKey^19 <<1 mod poly +%define HashKey_18 (16*45) ; HashKey^18 <<1 mod poly +%define HashKey_17 (16*46) ; HashKey^17 <<1 mod poly +%define HashKey_16 (16*47) ; HashKey^16 <<1 mod poly +%define HashKey_15 (16*48) ; HashKey^15 <<1 mod poly +%define HashKey_14 (16*49) ; HashKey^14 <<1 mod poly +%define HashKey_13 (16*50) ; HashKey^13 <<1 mod poly +%define HashKey_12 (16*51) ; HashKey^12 <<1 mod poly +%define HashKey_11 (16*52) ; HashKey^11 <<1 mod poly +%define HashKey_10 (16*53) ; HashKey^10 <<1 mod poly +%define HashKey_9 (16*54) ; HashKey^9 <<1 mod poly +%define HashKey_8 (16*55) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*56) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*57) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*58) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*59) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*60) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*61) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*62) ; HashKey <<1 mod poly +%define HashKey (16*62) ; HashKey <<1 mod poly +%endif ; !GCM_BIG_DATA + +%endif ; GCM_KEYS_VAES_AVX512_INCLUDED diff --git a/src/spdk/intel-ipsec-mb/include/kasumi_internal.h b/src/spdk/intel-ipsec-mb/include/kasumi_internal.h new file mode 100755 index 000000000..87b114d88 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/kasumi_internal.h @@ -0,0 +1,1853 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + + +/*--------------------------------------------------------- +* Kasumi_internal.h +*---------------------------------------------------------*/ + +#ifndef _KASUMI_INTERNAL_H_ +#define _KASUMI_INTERNAL_H_ + +#include +#include +#include +#include + +#include "intel-ipsec-mb.h" +#include "wireless_common.h" +#include "include/clear_regs_mem.h" +#include "include/constant_lookup.h" + +/*--------------------------------------------------------------------- +* Kasumi Inner S-Boxes +*---------------------------------------------------------------------*/ + +/* Table version based on a small table, no cache trash */ +static const uint16_t sso_kasumi_S7e[] = { + 0x6c00, 0x6601, 0x7802, 0x7603, 0x2404, 0x4e05, 0xb006, 0xce07, + 0x5c08, 0x1e09, 0x6a0a, 0xac0b, 0x1c0c, 0x3e0d, 0xea0e, 0x5c0f, + 0x4e10, 0xc011, 0x6a12, 0xc213, 0x0214, 0xac15, 0xae16, 0x3617, + 0x6e18, 0xa019, 0x681a, 0x001b, 0x0a1c, 0xe41d, 0xc41e, 0x9c1f, + 0x2a20, 0x5021, 0xb622, 0xd823, 0x2024, 0x3225, 0x3826, 0x2e27, + 0x9a28, 0xac29, 0x042a, 0xa62b, 0x882c, 0xd62d, 0xd22e, 0x082f, + 0x4830, 0x9631, 0xf432, 0x1c33, 0x4634, 0xb035, 0x7636, 0xa637, + 0xea38, 0x7039, 0x543a, 0x783b, 0xdc3c, 0x6e3d, 0xae3e, 0xba3f, + 0x6a40, 0x6a41, 0x1c42, 0x9043, 0x3a44, 0x5e45, 0x8c46, 0x7447, + 0x7c48, 0x5449, 0x384a, 0x1c4b, 0xa44c, 0xe84d, 0x604e, 0x304f, + 0x4050, 0xc451, 0x8652, 0xac53, 0x1654, 0xb655, 0x1856, 0x0657, + 0x0658, 0xa259, 0xf25a, 0x785b, 0xf85c, 0x785d, 0x845e, 0x3a5f, + 0x0c60, 0xfc61, 0xf062, 0x9c63, 0x5e64, 0xc265, 0x6666, 0x7667, + 0x9a68, 0x4669, 0x746a, 0xb46b, 0x506c, 0xe06d, 0x3a6e, 0x866f, + 0x6070, 0x3471, 0x3c72, 0xd673, 0x3474, 0x4c75, 0xa476, 0x7277, + 0xa478, 0xd479, 0xea7a, 0xa47b, 0x487c, 0x147d, 0x8a7e, 0xf87f, + 0x6c00, 0x6601, 0x7802, 0x7603, 0x2404, 0x4e05, 0xb006, 0xce07, + 0x5c08, 0x1e09, 0x6a0a, 0xac0b, 0x1c0c, 0x3e0d, 0xea0e, 0x5c0f, + 0x4e10, 0xc011, 0x6a12, 0xc213, 0x0214, 0xac15, 0xae16, 0x3617, + 0x6e18, 0xa019, 0x681a, 0x001b, 0x0a1c, 0xe41d, 0xc41e, 0x9c1f, + 0x2a20, 0x5021, 0xb622, 0xd823, 0x2024, 0x3225, 0x3826, 0x2e27, + 0x9a28, 0xac29, 0x042a, 0xa62b, 0x882c, 0xd62d, 0xd22e, 0x082f, + 0x4830, 0x9631, 0xf432, 0x1c33, 0x4634, 0xb035, 0x7636, 0xa637, + 0xea38, 0x7039, 0x543a, 0x783b, 0xdc3c, 0x6e3d, 0xae3e, 0xba3f, + 0x6a40, 0x6a41, 0x1c42, 0x9043, 0x3a44, 0x5e45, 0x8c46, 0x7447, + 0x7c48, 0x5449, 0x384a, 0x1c4b, 0xa44c, 0xe84d, 0x604e, 0x304f, + 0x4050, 0xc451, 0x8652, 0xac53, 0x1654, 0xb655, 0x1856, 0x0657, + 0x0658, 0xa259, 0xf25a, 0x785b, 0xf85c, 0x785d, 0x845e, 0x3a5f, + 0x0c60, 0xfc61, 0xf062, 0x9c63, 0x5e64, 0xc265, 0x6666, 0x7667, + 0x9a68, 0x4669, 0x746a, 0xb46b, 0x506c, 0xe06d, 0x3a6e, 0x866f, + 0x6070, 0x3471, 0x3c72, 0xd673, 0x3474, 0x4c75, 0xa476, 0x7277, + 0xa478, 0xd479, 0xea7a, 0xa47b, 0x487c, 0x147d, 0x8a7e, 0xf87f +}; + +static const uint16_t sso_kasumi_S9e[] = { + 0x4ea7, 0xdeef, 0x42a1, 0xf77b, 0x0f87, 0x9d4e, 0x1209, 0xa552, + 0x4c26, 0xc4e2, 0x6030, 0xcd66, 0x89c4, 0x0381, 0xb45a, 0x1b8d, + 0x6eb7, 0xfafd, 0x2693, 0x974b, 0x3f9f, 0xa954, 0x6633, 0xd56a, + 0x6532, 0xe9f4, 0x0d06, 0xa452, 0xb0d8, 0x3e9f, 0xc964, 0x62b1, + 0x5eaf, 0xe2f1, 0xd3e9, 0x4a25, 0x9cce, 0x2211, 0x0000, 0x9b4d, + 0x582c, 0xfcfe, 0xf57a, 0x743a, 0x1e8f, 0xb8dc, 0xa251, 0x2190, + 0xbe5f, 0x0603, 0x773b, 0xeaf5, 0x6c36, 0xd6eb, 0xb4da, 0x2b95, + 0xb1d8, 0x1108, 0x58ac, 0xddee, 0xe773, 0x4522, 0x1f8f, 0x984c, + 0x4aa5, 0x8ac5, 0x178b, 0xf279, 0x0301, 0xc1e0, 0x4fa7, 0xa8d4, + 0xe0f0, 0x381c, 0x9dce, 0x60b0, 0x2d96, 0xf7fb, 
0x4120, 0xbedf, + 0xebf5, 0x2f97, 0xf2f9, 0x1309, 0xb259, 0x74ba, 0xbadd, 0x59ac, + 0x48a4, 0x944a, 0x71b8, 0x88c4, 0x95ca, 0x4ba5, 0xbd5e, 0x46a3, + 0xd0e8, 0x3c9e, 0x0c86, 0xc562, 0x1a0d, 0xf4fa, 0xd7eb, 0x1c8e, + 0x7ebf, 0x8a45, 0x82c1, 0x53a9, 0x3098, 0xc6e3, 0xdd6e, 0x0e87, + 0xb158, 0x592c, 0x2914, 0xe4f2, 0x6bb5, 0x8140, 0xe271, 0x2d16, + 0x160b, 0xe6f3, 0xae57, 0x7b3d, 0x4824, 0xba5d, 0xe1f0, 0x361b, + 0xcfe7, 0x7dbe, 0xc5e2, 0x5229, 0x8844, 0x389c, 0x93c9, 0x0683, + 0x8d46, 0x2793, 0xa753, 0x2814, 0x4e27, 0xe673, 0x75ba, 0xf87c, + 0xb7db, 0x0180, 0xf9fc, 0x6a35, 0xe070, 0x54aa, 0xbfdf, 0x2e97, + 0xfc7e, 0x52a9, 0x9249, 0x190c, 0x2f17, 0x8341, 0x50a8, 0xd96c, + 0xd76b, 0x4924, 0x5c2e, 0xe7f3, 0x1389, 0x8f47, 0x8944, 0x3018, + 0x91c8, 0x170b, 0x3a9d, 0x99cc, 0xd1e8, 0x55aa, 0x6b35, 0xcae5, + 0x6fb7, 0xf5fa, 0xa0d0, 0x1f0f, 0xbb5d, 0x2391, 0x65b2, 0xd8ec, + 0x2010, 0xa2d1, 0xcf67, 0x6834, 0x7038, 0xf078, 0x8ec7, 0x2b15, + 0xa3d1, 0x41a0, 0xf8fc, 0x3f1f, 0xecf6, 0x0c06, 0xa653, 0x6331, + 0x49a4, 0xb359, 0x3299, 0xedf6, 0x8241, 0x7a3d, 0xe8f4, 0x351a, + 0x5aad, 0xbcde, 0x45a2, 0x8643, 0x0582, 0xe170, 0x0b05, 0xca65, + 0xb9dc, 0x4723, 0x86c3, 0x5dae, 0x6231, 0x9e4f, 0x4ca6, 0x954a, + 0x3118, 0xff7f, 0xeb75, 0x0080, 0xfd7e, 0x3198, 0x369b, 0xdfef, + 0xdf6f, 0x0984, 0x2512, 0xd66b, 0x97cb, 0x43a1, 0x7c3e, 0x8dc6, + 0x0884, 0xc2e1, 0x96cb, 0x793c, 0xd4ea, 0x1c0e, 0x5b2d, 0xb65b, + 0xeff7, 0x3d1e, 0x51a8, 0xa6d3, 0xb75b, 0x6733, 0x188c, 0xed76, + 0x4623, 0xce67, 0xfa7d, 0x57ab, 0x2613, 0xacd6, 0x8bc5, 0x2492, + 0xe5f2, 0x753a, 0x79bc, 0xcce6, 0x0100, 0x9349, 0x8cc6, 0x3b1d, + 0x6432, 0xe874, 0x9c4e, 0x359a, 0x140a, 0x9acd, 0xfdfe, 0x56ab, + 0xcee7, 0x5a2d, 0x168b, 0xa7d3, 0x3a1d, 0xac56, 0xf3f9, 0x4020, + 0x9048, 0x341a, 0xad56, 0x2c96, 0x7339, 0xd5ea, 0x5faf, 0xdcee, + 0x379b, 0x8b45, 0x2a95, 0xb3d9, 0x5028, 0xee77, 0x5cae, 0xc763, + 0x72b9, 0xd2e9, 0x0b85, 0x8e47, 0x81c0, 0x2311, 0xe974, 0x6e37, + 0xdc6e, 0x64b2, 0x8542, 0x180c, 0xabd5, 0x1188, 0xe371, 0x7cbe, + 0x0201, 0xda6d, 0xef77, 0x1289, 0x6ab5, 0xb058, 0x964b, 0x6934, + 0x0904, 0xc9e4, 0xc462, 0x2110, 0xe572, 0x2713, 0x399c, 0xde6f, + 0xa150, 0x7d3e, 0x0804, 0xf1f8, 0xd9ec, 0x0703, 0x6130, 0x9a4d, + 0xa351, 0x67b3, 0x2a15, 0xcb65, 0x5f2f, 0x994c, 0xc7e3, 0x2412, + 0x5e2f, 0xaa55, 0x3219, 0xe3f1, 0xb5da, 0x4321, 0xc864, 0x1b0d, + 0x5128, 0xbdde, 0x1d0e, 0xd46a, 0x3e1f, 0xd068, 0x63b1, 0xa854, + 0x3d9e, 0xcde6, 0x158a, 0xc060, 0xc663, 0x349a, 0xffff, 0x2894, + 0x3b9d, 0xd369, 0x3399, 0xfeff, 0x44a2, 0xaed7, 0x5d2e, 0x92c9, + 0x150a, 0xbf5f, 0xaf57, 0x2090, 0x73b9, 0xdb6d, 0xd86c, 0x552a, + 0xf6fb, 0x4422, 0x6cb6, 0xfbfd, 0x148a, 0xa4d2, 0x9f4f, 0x0a85, + 0x6f37, 0xc160, 0x9148, 0x1a8d, 0x198c, 0xb55a, 0xf67b, 0x7f3f, + 0x85c2, 0x3319, 0x5bad, 0xc8e4, 0x77bb, 0xc3e1, 0xb85c, 0x2994, + 0xcbe5, 0x4da6, 0xf0f8, 0x5329, 0x2e17, 0xaad5, 0x0482, 0xa5d2, + 0x2c16, 0xb2d9, 0x371b, 0x8c46, 0x4d26, 0xd168, 0x47a3, 0xfe7f, + 0x7138, 0xf379, 0x0e07, 0xa9d4, 0x84c2, 0x0402, 0xea75, 0x4f27, + 0x9fcf, 0x0502, 0xc0e0, 0x7fbf, 0xeef7, 0x76bb, 0xa050, 0x1d8e, + 0x391c, 0xc361, 0xd269, 0x0d86, 0x572b, 0xafd7, 0xadd6, 0x70b8, + 0x7239, 0x90c8, 0xb95c, 0x7e3f, 0x98cc, 0x78bc, 0x4221, 0x87c3, + 0xc261, 0x3c1e, 0x6d36, 0xb6db, 0xbc5e, 0x40a0, 0x0281, 0xdbed, + 0x8040, 0x66b3, 0x0f07, 0xcc66, 0x7abd, 0x9ecf, 0xe472, 0x2592, + 0x6db6, 0xbbdd, 0x0783, 0xf47a, 0x80c0, 0x542a, 0xfb7d, 0x0a05, + 0x2291, 0xec76, 0x68b4, 0x83c1, 0x4b25, 0x8743, 0x1088, 0xf97c, + 0x562b, 0x8442, 0x783c, 0x8fc7, 0xab55, 0x7bbd, 0x94ca, 0x61b0, + 0x1008, 0xdaed, 0x1e0f, 0xf178, 
0x69b4, 0xa1d0, 0x763b, 0x9bcd +}; + +/* Range of input data for KASUMI is from 1 to 20000 bits */ +#define KASUMI_MIN_LEN 1 +#define KASUMI_MAX_LEN 20000 + +/* KASUMI cipher definitions */ +#define NUM_KASUMI_ROUNDS (8) /* 8 rounds in the kasumi spec */ +#define QWORDSIZEINBITS (64) +#define QWORDSIZEINBYTES (8) +#define LAST_PADDING_BIT (1) + +#define BYTESIZE (8) +#define BITSIZE(x) ((int)(sizeof(x)*BYTESIZE)) + +/*--------- 16 bit rotate left ------------------------------------------*/ +#define ROL16(a,b) (uint16_t)((a<<b)|(a>>(16-b))) + +/*----- a 64-bit structure to help with kasumi endian issues -----*/ +typedef union _ku64 { + uint64_t b64[1]; + uint32_t b32[2]; + uint16_t b16[4]; + uint8_t b8[8]; +} kasumi_union_t; + +typedef union SafeBuffer { + uint64_t b64; + uint32_t b32[2]; + uint8_t b8[KASUMI_BLOCK_SIZE]; +} SafeBuf; + +/*--------------------------------------------------------------------- +* Inline 16-bit left rotation +*---------------------------------------------------------------------*/ + +#define ROL16(a,b) (uint16_t)((a<<b)|(a>>(16-b))) + +#define FIp1(data, key1, key2, key3) \ + do { \ + uint16_t datal, datah; \ + \ + (data) ^= (key1); \ + datal = LOOKUP16_SSE(sso_kasumi_S7e, (uint8_t)(data), 256); \ + datah = LOOKUP16_SSE(sso_kasumi_S9e, (data) >> 7, 512); \ + (data) = datal ^ datah; \ + (data) ^= (key2); \ + datal = LOOKUP16_SSE(sso_kasumi_S7e, (data) >> 9, 256); \ + datah = LOOKUP16_SSE(sso_kasumi_S9e, (data) & 0x1FF, 512); \ + (data) = datal ^ datah; \ + (data) ^= (key3); \ + } while (0) + +#define FIp2(data1, data2, key1, key2, key3, key4) \ + do { \ + FIp1(data1, key1, key2, key3); \ + FIp1(data2, key1, key2, key4); \ + } while (0) + +#define FLpi(key1, key2, res_h, res_l) \ + do { \ + uint16_t l, r; \ + r = (res_l) & (key1); \ + r = (res_h) ^ ROL16(r, 1); \ + l = r | (key2); \ + (res_h) = (res_l) ^ ROL16(l, 1); \ + (res_l) = r; \ + } while (0) + +#define FLp1(index, h, l) \ + do { \ + uint16_t ka = *(index + 0); \ + uint16_t kb = *(index + 1); \ + FLpi(ka, kb, h, l); \ + } while (0) + +#define FLp2(index, h1, l1, h2, l2) \ + do { \ + uint16_t ka = *(index + 0); \ + uint16_t kb = *(index + 1); \ + FLpi(ka, kb, h1, l1); \ + FLpi(ka, kb, h2, l2); \ + } while (0) + +#define FLp3(index, h1, l1, h2, l2, h3, l3) \ + do { \ + uint16_t ka = *(index + 0); \ + uint16_t kb = *(index + 1); \ + FLpi(ka, kb, h1, l1); \ + FLpi(ka, kb, h2, l2); \ + FLpi(ka, kb, h3, l3); \ + } while (0) + +#define FLp4(index, h1, l1, h2, l2, h3, l3, h4, l4) \ + do { \ + FLp2(index, h1, l1, h2, l2); \ + FLp2(index, h3, l3, h4, l4); \ + } while (0) + +#define FOp1(index, h, l) \ + do { \ + FIp1(h, *(index + 2), *(index + 3), l); \ + FIp1(l, *(index + 4), *(index + 5), h); \ + FIp1(h, *(index + 6), *(index + 7), l); \ + } while (0) + +#define FOp2(index, h1, l1, h2, l2) \ + do { \ + uint16_t ka = *(index + 2); \ + uint16_t kb = *(index + 3); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + ka = *(index + 4); \ + kb = *(index + 5); \ + FIp2(l1, l2, ka, kb, h1, h2); \ + ka = *(index + 6); \ + kb = *(index + 7); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + } while (0) + +#define FOp3(index, h1, l1, h2, l2, h3, l3) \ + do { \ + uint16_t ka = *(index + 2); \ + uint16_t kb = *(index + 3); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + FIp1(h3, ka, kb, l3); \ + ka = *(index + 4); \ + kb = *(index + 5); \ + FIp2(l1, l2, ka, kb, h1, h2); \ + FIp1(l3, ka, kb, h3); \ + ka = *(index + 6); \ + kb = *(index + 7); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + FIp1(h3, ka, kb, l3); \ + } while (0) + +#define FOp4(index, h1, l1, h2, l2, h3, l3, h4, l4) \ + do
{ \ + uint16_t ka = *(index + 2); \ + uint16_t kb = *(index + 3); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + FIp2(h3, h4, ka, kb, l3, l4); \ + ka = *(index + 4); \ + kb = *(index + 5); \ + FIp2(l1, l2, ka, kb, h1, h2); \ + FIp2(l3, l4, ka, kb, h3, h4); \ + ka = *(index + 6); \ + kb = *(index + 7); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + FIp2(h3, h4, ka, kb, l3, l4); \ + } while (0) + +/** + ******************************************************************************* + * @description + * This function performs the Kasumi operation on the given block using the key + * that is already scheduled in the context + * + * @param[in] pContext Context where the scheduled keys are stored + * @param[in/out] pData Block to be enc/dec + * + ******************************************************************************/ +static void kasumi_1_block(const uint16_t *context, uint16_t *data) +{ + const uint16_t *end = context + KASUMI_KEY_SCHEDULE_SIZE; + uint16_t temp_l, temp_h; + + /* 4 iterations odd/even */ + do { + temp_l = data[3]; + temp_h = data[2]; + FLp1(context, temp_h, temp_l); + FOp1(context, temp_h, temp_l); + context += 8; + data[1] ^= temp_l; + data[0] ^= temp_h; + + temp_h = data[1]; + temp_l = data[0]; + FOp1(context, temp_h, temp_l); + FLp1(context, temp_h, temp_l); + context += 8; + data[3] ^= temp_h; + data[2] ^= temp_l; + } while (context < end); +} + +/** + ****************************************************************************** + * @description + * This function performs the Kasumi operation on the given blocks using the key + * that is already scheduled in the context + * + * @param[in] pContext Context where the scheduled keys are stored + * @param[in/out] pData1 First block to be enc/dec + * @param[in/out] pData2 Second block to be enc/dec + * + ******************************************************************************/ +static void +kasumi_2_blocks(const uint16_t *context, uint16_t *data1, uint16_t *data2) +{ + const uint16_t *end = context + KASUMI_KEY_SCHEDULE_SIZE; + uint16_t temp1_l, temp1_h; + uint16_t temp2_l, temp2_h; + + /* 4 iterations odd/even , with fine grain interleave */ + do { + /* even */ + temp1_l = data1[3]; + temp1_h = data1[2]; + temp2_l = data2[3]; + temp2_h = data2[2]; + FLp2(context, temp1_h, temp1_l, temp2_h, temp2_l); + FOp2(context, temp1_h, temp1_l, temp2_h, temp2_l); + context += 8; + data1[1] ^= temp1_l; + data1[0] ^= temp1_h; + data2[1] ^= temp2_l; + data2[0] ^= temp2_h; + + /* odd */ + temp1_h = data1[1]; + temp1_l = data1[0]; + temp2_h = data2[1]; + temp2_l = data2[0]; + FOp2(context, temp1_h, temp1_l, temp2_h, temp2_l); + FLp2(context, temp1_h, temp1_l, temp2_h, temp2_l); + context += 8; + data1[3] ^= temp1_h; + data1[2] ^= temp1_l; + data2[3] ^= temp2_h; + data2[2] ^= temp2_l; + } while (context < end); +} + + +/** + ******************************************************************************* + * @description + * This function performs the Kasumi operation on the given blocks using the key + * that is already scheduled in the context + * + * @param[in] pContext Context where the scheduled keys are stored + * @param[in/out] pData1 First block to be enc/dec + * @param[in/out] pData2 Second block to be enc/dec + * @param[in/out] pData3 Third block to be enc/dec + * + ******************************************************************************/ +static void +kasumi_3_blocks(const uint16_t *context, uint16_t *data1, + uint16_t *data2, uint16_t *data3) +{ + /* Case when the conmpiler is able to interleave efficiently */ + const uint16_t *end 
= context + KASUMI_KEY_SCHEDULE_SIZE; + uint16_t temp1_l, temp1_h; + uint16_t temp2_l, temp2_h; + uint16_t temp3_l, temp3_h; + + /* 4 iterations odd/even , with fine grain interleave */ + do { + temp1_l = data1[3]; + temp1_h = data1[2]; + temp2_l = data2[3]; + temp2_h = data2[2]; + temp3_l = data3[3]; + temp3_h = data3[2]; + FLp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h, + temp3_l); + FOp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h, + temp3_l); + context += 8; + data1[1] ^= temp1_l; + data1[0] ^= temp1_h; + data2[1] ^= temp2_l; + data2[0] ^= temp2_h; + data3[1] ^= temp3_l; + data3[0] ^= temp3_h; + + temp1_h = data1[1]; + temp1_l = data1[0]; + temp2_h = data2[1]; + temp2_l = data2[0]; + temp3_h = data3[1]; + temp3_l = data3[0]; + FOp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h, + temp3_l); + FLp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h, + temp3_l); + context += 8; + data1[3] ^= temp1_h; + data1[2] ^= temp1_l; + data2[3] ^= temp2_h; + data2[2] ^= temp2_l; + data3[3] ^= temp3_h; + data3[2] ^= temp3_l; + } while (context < end); +} + +/** + ******************************************************************************* + * @description + * This function performs the Kasumi operation on the given blocks using the key + * that is already scheduled in the context + * + * @param[in] pContext Context where the scheduled keys are stored + * @param[in] ppData Pointer to an array of addresses of blocks + * + ******************************************************************************/ +static void +kasumi_4_blocks(const uint16_t *context, uint16_t **ppData) +{ + /* Case when the conmpiler is unable to interleave efficiently */ + kasumi_2_blocks (context, ppData[0], ppData[1]); + kasumi_2_blocks (context, ppData[2], ppData[3]); +} + +/** + ****************************************************************************** + * @description + * This function performs the Kasumi operation on the given blocks using the key + * that is already scheduled in the context + * + * @param[in] pContext Context where the scheduled keys are stored + * @param[in] ppData Pointer to an array of addresses of blocks + * + ******************************************************************************/ +static void +kasumi_8_blocks(const uint16_t *context, uint16_t **ppData) +{ + kasumi_4_blocks (context, &ppData[0]); + kasumi_4_blocks (context, &ppData[4]); +} + +/****************************************************************************** +* @description +* Multiple wrappers for the Kasumi rounds on up to 16 blocks of 64 bits at a +*time. +* +* Depending on the variable packet lengths, different wrappers get called. +* It has been measured that 1 packet is faster than 2, 2 packets is faster +*than 3 +* 3 packets is faster than 4, and so on ... +* It has also been measured that 6 = 4+2 packets is faster than 8 +* It has also been measured that 7 packets are processed faster as 8 packets, +* +* If the assumptions are not verified, it is easy to implmement +* the right function and reference it in wrapperArray. 
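+*
+* Illustrative dispatch sketch (not upstream code): callers index the
+* kasumiWrapperArray[] table defined below directly by the number of 64-bit
+* blocks to process in one pass (1..16, entry 0 is NULL), e.g.:
+*
+*   uint16_t *blocks[16];  /* pointers to up to 16 eight-byte blocks */
+*   unsigned n = 5;        /* this pass processes 5 blocks           */
+*   kasumiWrapperArray[n](pCtx->sk16, blocks);  /* n == 5 runs 4 + 1 blocks */
+*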
+* +*******************************************************************************/ +static void +kasumi_f8_1_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_1_block(context, data[0]); +} + +static void +kasumi_f8_2_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_2_blocks(context, data[0], data[1]); +} + +static void +kasumi_f8_3_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_3_blocks(context, data[0], data[1], data[2]); +} + +static void +kasumi_f8_5_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_4_blocks(context, &data[0]); + kasumi_1_block(context, data[4]); +} + +static void +kasumi_f8_6_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + /* It is also assumed 6 = 4+2 packets is faster than 8 */ + kasumi_4_blocks(context, &data[0]); + kasumi_2_blocks(context, data[4], data[5]); +} + +static void +kasumi_f8_7_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_4_blocks(context, &data[0]); + kasumi_3_blocks(context, data[4], data[5], data[6]); +} + +static void +kasumi_f8_9_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + + kasumi_8_blocks(context, &data[0]); + kasumi_1_block(context, data[8]); +} + +static void +kasumi_f8_10_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_2_blocks(context, data[8], data[9]); +} + +static void +kasumi_f8_11_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_3_blocks(context, data[8], data[9], data[10]); +} + +static void +kasumi_f8_12_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_4_blocks(context, &data[8]); +} + +static void +kasumi_f8_13_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + + kasumi_8_blocks(context, &data[0]); + kasumi_4_blocks(context, &data[8]); + kasumi_1_block(context, data[12]); +} + +static void +kasumi_f8_14_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_4_blocks(context, &data[8]); + kasumi_2_blocks(context, data[12], data[13]); +} + +static void +kasumi_f8_15_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_4_blocks(context, &data[8]); + kasumi_3_blocks(context, data[12], data[13], data[14]); +} + +static void +kasumi_f8_16_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_8_blocks(context, &data[8]); +} + +typedef void (*kasumi_wrapper_t)(const uint16_t *, uint16_t **); + +static kasumi_wrapper_t kasumiWrapperArray[] = { + NULL, + kasumi_f8_1_buffer_wrapper, + kasumi_f8_2_buffer_wrapper, + kasumi_f8_3_buffer_wrapper, + kasumi_4_blocks, + kasumi_f8_5_buffer_wrapper, + kasumi_f8_6_buffer_wrapper, + kasumi_f8_7_buffer_wrapper, + kasumi_8_blocks, + kasumi_f8_9_buffer_wrapper, + kasumi_f8_10_buffer_wrapper, + kasumi_f8_11_buffer_wrapper, + kasumi_f8_12_buffer_wrapper, + kasumi_f8_13_buffer_wrapper, + kasumi_f8_14_buffer_wrapper, + kasumi_f8_15_buffer_wrapper, + kasumi_f8_16_buffer_wrapper}; + +/*--------------------------------------------------------------------- +* kasumi_key_schedule_sk() +* Build the key schedule. Most "key" operations use 16-bit +* +* Context is a flat array of 64 uint16. The context is built in the same order +* it will be used. 
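+*
+* Layout sketch (derived from the FLp*/FOp* macros above, illustrative only):
+* for round n the eight uint16_t slots context[8*n + 0..7] are consumed as
+*
+*   context[8n+0..8n+1] -> FL key pair (FLp1 reads index+0 and index+1)
+*   context[8n+2..8n+7] -> the three FI key pairs of FO (FOp1 reads index+2..+7)
+*
+* giving 8 rounds * 8 = 64 uint16_t values, which is what kasumi_1_block()
+* walks through via KASUMI_KEY_SCHEDULE_SIZE.
+*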
+*---------------------------------------------------------------------*/ +static inline void +kasumi_key_schedule_sk(uint16_t *context, const void *pKey) +{ + + /* Kasumi constants*/ + static const uint16_t C[] = {0x0123, 0x4567, 0x89AB, 0xCDEF, + 0xFEDC, 0xBA98, 0x7654, 0x3210}; + + uint16_t k[8], kprime[8], n; + const uint8_t *pk = (const uint8_t *) pKey; + + /* Build K[] and K'[] keys */ + for (n = 0; n < 8; n++, pk += 2) { + k[n] = (pk[0] << 8) + pk[1]; + kprime[n] = k[n] ^ C[n]; + } + + /* + * Finally construct the various sub keys [Kli1, KlO ...) in the right + * order for easy usage at run-time + */ + for (n = 0; n < 8; n++) { + context[0] = ROL16(k[n], 1); + context[1] = kprime[(n + 2) & 0x7]; + context[2] = ROL16(k[(n + 1) & 0x7], 5); + context[3] = kprime[(n + 4) & 0x7]; + context[4] = ROL16(k[(n + 5) & 0x7], 8); + context[5] = kprime[(n + 3) & 0x7]; + context[6] = ROL16(k[(n + 6) & 0x7], 13); + context[7] = kprime[(n + 7) & 0x7]; + context += 8; + } +#ifdef SAFE_DATA + clear_mem(k, sizeof(k)); + clear_mem(kprime, sizeof(kprime)); +#endif +} + +/*--------------------------------------------------------------------- +* kasumi_compute_sched() +* Generic ksaumi key sched init function. +* +*---------------------------------------------------------------------*/ +static inline int +kasumi_compute_sched(const uint8_t modifier, + const void *const pKey, void *pCtx) +{ +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pKey == NULL || pCtx == NULL) + return -1; +#endif + uint32_t i = 0; + const uint8_t *const key = (const uint8_t * const)pKey; + uint8_t ModKey[KASUMI_KEY_SIZE] = {0}; /* Modified key */ + kasumi_key_sched_t *pLocalCtx = (kasumi_key_sched_t *)pCtx; + + /* Construct the modified key*/ + for (i = 0; i < KASUMI_KEY_SIZE; i++) + ModKey[i] = (uint8_t)key[i] ^ modifier; + + kasumi_key_schedule_sk(pLocalCtx->sk16, pKey); + kasumi_key_schedule_sk(pLocalCtx->msk16, ModKey); + +#ifdef SAFE_DATA + clear_mem(ModKey, sizeof(ModKey)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif + return 0; +} + +/*--------------------------------------------------------------------- +* kasumi_key_sched_size() +* Get the size of a kasumi key sched context. +* +*---------------------------------------------------------------------*/ +static inline size_t +kasumi_key_sched_size(void) +{ + /* + * There are two keys that need to be scheduled: the original one and + * the modified one (xored with the relevant modifier) + */ + return sizeof(kasumi_key_sched_t); +} + +/*--------------------------------------------------------------------- +* kasumi_init_f8_key_sched() +* Compute the kasumi f8 key schedule. +* +*---------------------------------------------------------------------*/ + +static inline int +kasumi_init_f8_key_sched(const void *const pKey, + kasumi_key_sched_t *pCtx) +{ + return kasumi_compute_sched(0x55, pKey, pCtx); +} + +/*--------------------------------------------------------------------- +* kasumi_init_f9_key_sched() +* Compute the kasumi f9 key schedule. 
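+*
+* Usage sketch (illustrative only): each kasumi_key_sched_t holds two
+* schedules, one built from the key as given (sk16) and one from the key
+* XORed with a modifier (msk16) - 0x55 for f8 above, 0xAA for f9 here.
+* A typical caller, using the buffer routines defined below:
+*
+*   kasumi_key_sched_t sched;
+*   if (kasumi_init_f8_key_sched(key, &sched) == 0)
+*       kasumi_f8_1_buffer(&sched, iv, in, out, len_in_bytes);
+*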
+* +*---------------------------------------------------------------------*/ + +static inline int +kasumi_init_f9_key_sched(const void *const pKey, + kasumi_key_sched_t *pCtx) +{ + return kasumi_compute_sched(0xAA, pKey, pCtx); +} + +size_t +kasumi_key_sched_size_sse(void); + +int +kasumi_init_f8_key_sched_sse(const void *pKey, kasumi_key_sched_t *pCtx); + +int +kasumi_init_f9_key_sched_sse(const void *pKey, kasumi_key_sched_t *pCtx); + +size_t +kasumi_key_sched_size_avx(void); + +int +kasumi_init_f8_key_sched_avx(const void *pKey, kasumi_key_sched_t *pCtx); + +int +kasumi_init_f9_key_sched_avx(const void *pKey, kasumi_key_sched_t *pCtx); + + +static inline void +kasumi_f8_1_buffer(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pIn, void *pOut, + const uint32_t length) +{ + uint32_t blkcnt; + kasumi_union_t a, b; /* the modifier */ + SafeBuf safeInBuf; + const uint8_t *pBufferIn = (const uint8_t *) pIn; + uint8_t *pBufferOut = (uint8_t *) pOut; + uint32_t lengthInBytes = length; + + /* IV Endianity */ + a.b64[0] = BSWAP64(IV); + + /* First encryption to create modifier */ + kasumi_1_block(pCtx->msk16, a.b16 ); + + /* Final initialisation steps */ + blkcnt = 0; + b.b64[0] = a.b64[0]; + + /* Now run the block cipher */ + while (lengthInBytes) { + /* KASUMI it to produce the next block of keystream */ + kasumi_1_block(pCtx->sk16, b.b16 ); + + if (lengthInBytes > KASUMI_BLOCK_SIZE) { + pBufferIn = xor_keystrm_rev(pBufferOut, pBufferIn, + b.b64[0]); + pBufferOut += KASUMI_BLOCK_SIZE; + /* loop variant */ + /* done another 64 bits */ + lengthInBytes -= KASUMI_BLOCK_SIZE; + + /* apply the modifier and update the block count */ + b.b64[0] ^= a.b64[0]; + b.b16[0] ^= (uint16_t)++blkcnt; + } else if (lengthInBytes < KASUMI_BLOCK_SIZE) { + /* end of the loop, handle the last bytes */ + memcpy_keystrm(safeInBuf.b8, pBufferIn, + lengthInBytes); + xor_keystrm_rev(b.b8, safeInBuf.b8, b.b64[0]); + memcpy_keystrm(pBufferOut, b.b8, lengthInBytes); + lengthInBytes = 0; + /* lengthInBytes == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut, pBufferIn, b.b64[0]); + lengthInBytes = 0; + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a, sizeof(a)); + clear_mem(&b, sizeof(b)); + clear_mem(&safeInBuf, sizeof(safeInBuf)); +#endif +} + +static inline void +preserve_bits(kasumi_union_t *c, + const uint8_t *pcBufferOut, const uint8_t *pcBufferIn, + SafeBuf *safeOutBuf, SafeBuf *safeInBuf, + const uint8_t bit_len, const uint8_t byte_len) +{ + const uint64_t mask = UINT64_MAX << (KASUMI_BLOCK_SIZE * 8 - bit_len); + + /* Clear the last bits of the keystream and the input + * (input only in out-of-place case) */ + c->b64[0] &= mask; + if (pcBufferIn != pcBufferOut) { + const uint64_t swapMask = BSWAP64(mask); + + safeInBuf->b64 &= swapMask; + + /* + * Merge the last bits from the output, to be preserved, + * in the keystream, to be XOR'd with the input + * (which last bits are 0, maintaining the output bits) + */ + memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len); + c->b64[0] |= BSWAP64(safeOutBuf->b64 & ~swapMask); + } +} + +static inline void +kasumi_f8_1_buffer_bit(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pIn, void *pOut, + const uint32_t lengthInBits, + const uint32_t offsetInBits) +{ + const uint8_t *pBufferIn = (const uint8_t *) pIn; + uint8_t *pBufferOut = (uint8_t *) pOut; + uint32_t cipherLengthInBits = lengthInBits; + uint32_t blkcnt; + uint64_t shiftrem = 0; + kasumi_union_t a, b, c; /* the modifier */ + const uint8_t 
*pcBufferIn = pBufferIn + (offsetInBits / 8); + uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8); + /* Offset into the first byte (0 - 7 bits) */ + uint32_t remainOffset = offsetInBits % 8; + uint32_t byteLength = (cipherLengthInBits + 7) / 8; + SafeBuf safeOutBuf; + SafeBuf safeInBuf; + + /* IV Endianity */ + a.b64[0] = BSWAP64(IV); + + /* First encryption to create modifier */ + kasumi_1_block(pCtx->msk16, a.b16); + + /* Final initialisation steps */ + blkcnt = 0; + b.b64[0] = a.b64[0]; + /* Now run the block cipher */ + + /* Start with potential partial block (due to offset and length) */ + kasumi_1_block(pCtx->sk16, b.b16); + c.b64[0] = b.b64[0] >> remainOffset; + /* Only one block to encrypt */ + if (cipherLengthInBits < (64 - remainOffset)) { + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength); + /* + * If operation is Out-of-place and there is offset + * to be applied, "remainOffset" bits from the output buffer + * need to be preserved (only applicable to first byte, + * since remainOffset is up to 7 bits) + */ + if ((pIn != pOut) && remainOffset) { + const uint8_t mask8 = + (const uint8_t)(1 << (8 - remainOffset)) - 1; + + safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) | + (pcBufferOut[0] & ~mask8); + } + + /* If last byte is a partial byte, the last bits of the output + * need to be preserved */ + const uint8_t bitlen_with_off = remainOffset + + cipherLengthInBits; + + if ((bitlen_with_off & 0x7) != 0) { + preserve_bits(&c, pcBufferOut, pcBufferIn, &safeOutBuf, + &safeInBuf, bitlen_with_off, byteLength); + } + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, c.b64[0]); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + return; + } + + /* + * If operation is Out-of-place and there is offset + * to be applied, "remainOffset" bits from the output buffer + * need to be preserved (only applicable to first byte, + * since remainOffset is up to 7 bits) + */ + if ((pIn != pOut) && remainOffset) { + const uint8_t mask8 = + (const uint8_t)(1 << (8 - remainOffset)) - 1; + + memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8); + safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) | + (pcBufferOut[0] & ~mask8); + xor_keystrm_rev(pcBufferOut, safeInBuf.b8, c.b64[0]); + pcBufferIn += KASUMI_BLOCK_SIZE; + } else { + /* At least 64 bits to produce (including offset) */ + pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, c.b64[0]); + } + + if (remainOffset != 0) + shiftrem = b.b64[0] << (64 - remainOffset); + cipherLengthInBits -= KASUMI_BLOCK_SIZE * 8 - remainOffset; + pcBufferOut += KASUMI_BLOCK_SIZE; + /* apply the modifier and update the block count */ + b.b64[0] ^= a.b64[0]; + b.b16[0] ^= (uint16_t)++blkcnt; + + while (cipherLengthInBits) { + /* KASUMI it to produce the next block of keystream */ + kasumi_1_block(pCtx->sk16, b.b16); + c.b64[0] = (b.b64[0] >> remainOffset) | shiftrem; + if (remainOffset != 0) + shiftrem = b.b64[0] << (64 - remainOffset); + if (cipherLengthInBits >= KASUMI_BLOCK_SIZE * 8) { + pcBufferIn = xor_keystrm_rev(pcBufferOut, + pcBufferIn, c.b64[0]); + cipherLengthInBits -= KASUMI_BLOCK_SIZE * 8; + pcBufferOut += KASUMI_BLOCK_SIZE; + /* loop variant */ + + /* apply the modifier and update the block count */ + b.b64[0] ^= a.b64[0]; + b.b16[0] ^= (uint16_t)++blkcnt; + } else { + /* end of the loop, handle the last bytes */ + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, + byteLength); + + /* If last byte is a partial byte, the last bits + * of the output need to be preserved */ + if 
((cipherLengthInBits & 0x7) != 0) + preserve_bits(&c, pcBufferOut, pcBufferIn, + &safeOutBuf, &safeInBuf, + cipherLengthInBits, byteLength); + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, c.b64[0]); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + cipherLengthInBits = 0; + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a, sizeof(a)); + clear_mem(&b, sizeof(b)); + clear_mem(&c, sizeof(c)); + clear_mem(&safeInBuf, sizeof(safeInBuf)); + clear_mem(&safeOutBuf, sizeof(safeOutBuf)); +#endif +} + +static inline void +kasumi_f8_2_buffer(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const void *pIn1, void *pOut1, + const uint32_t length1, + const void *pIn2, void *pOut2, + const uint32_t length2) +{ + const uint8_t *pBufferIn1 = (const uint8_t *) pIn1; + uint8_t *pBufferOut1 = (uint8_t *) pOut1; + uint32_t lengthInBytes1 = length1; + const uint8_t *pBufferIn2 = (const uint8_t *) pIn2; + uint8_t *pBufferOut2 = (uint8_t *) pOut2; + uint32_t lengthInBytes2 = length2; + uint32_t blkcnt, length; + kasumi_union_t a1, b1; /* the modifier */ + kasumi_union_t a2, b2; /* the modifier */ + SafeBuf safeInBuf; + + kasumi_union_t temp; + + /* IV Endianity */ + a1.b64[0] = BSWAP64(IV1); + a2.b64[0] = BSWAP64(IV2); + + kasumi_2_blocks(pCtx->msk16, a1.b16, a2.b16); + + /* Final initialisation steps */ + blkcnt = 0; + b1.b64[0] = a1.b64[0]; + b2.b64[0] = a2.b64[0]; + + /* check which packet is longer and save "common" shortest length */ + if (lengthInBytes1 > lengthInBytes2) + length = lengthInBytes2; + else + length = lengthInBytes1; + + /* Round down to to a whole number of qwords. (QWORDLENGTHINBYTES-1 */ + length &= ~7; + lengthInBytes1 -= length; + lengthInBytes2 -= length; + + /* Now run the block cipher for common packet length, a whole number of + * blocks */ + while (length) { + /* KASUMI it to produce the next block of keystream for both + * packets */ + kasumi_2_blocks(pCtx->sk16, b1.b16, b2.b16); + + /* xor and write keystream */ + pBufferIn1 = + xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]); + pBufferOut1 += KASUMI_BLOCK_SIZE; + pBufferIn2 = + xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]); + pBufferOut2 += KASUMI_BLOCK_SIZE; + /* loop variant */ + length -= KASUMI_BLOCK_SIZE; /* done another 64 bits */ + + /* apply the modifier and update the block count */ + b1.b64[0] ^= a1.b64[0]; + b1.b16[0] ^= (uint16_t)++blkcnt; + b2.b64[0] ^= a2.b64[0]; + b2.b16[0] ^= (uint16_t)blkcnt; + } + + /* + * Process common part at end of first packet and second packet. + * One of the packets has a length less than 8 bytes. 
+ */ + if (lengthInBytes1 > 0 && lengthInBytes2 > 0) { + /* final round for 1 of the packets */ + kasumi_2_blocks(pCtx->sk16, b1.b16, b2.b16); + if (lengthInBytes1 > KASUMI_BLOCK_SIZE) { + pBufferIn1 = xor_keystrm_rev(pBufferOut1, + pBufferIn1, b1.b64[0]); + pBufferOut1 += KASUMI_BLOCK_SIZE; + b1.b64[0] ^= a1.b64[0]; + b1.b16[0] ^= (uint16_t)++blkcnt; + lengthInBytes1 -= KASUMI_BLOCK_SIZE; + } else if (lengthInBytes1 < KASUMI_BLOCK_SIZE) { + memcpy_keystrm(safeInBuf.b8, pBufferIn1, + lengthInBytes1); + xor_keystrm_rev(temp.b8, safeInBuf.b8, b1.b64[0]); + memcpy_keystrm(pBufferOut1, temp.b8, + lengthInBytes1); + lengthInBytes1 = 0; + /* lengthInBytes1 == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]); + lengthInBytes1 = 0; + } + if (lengthInBytes2 > KASUMI_BLOCK_SIZE) { + pBufferIn2 = xor_keystrm_rev(pBufferOut2, + pBufferIn2, b2.b64[0]); + pBufferOut2 += KASUMI_BLOCK_SIZE; + b2.b64[0] ^= a2.b64[0]; + b2.b16[0] ^= (uint16_t)++blkcnt; + lengthInBytes2 -= KASUMI_BLOCK_SIZE; + } else if (lengthInBytes2 < KASUMI_BLOCK_SIZE) { + memcpy_keystrm(safeInBuf.b8, pBufferIn2, + lengthInBytes2); + xor_keystrm_rev(temp.b8, safeInBuf.b8, b2.b64[0]); + memcpy_keystrm(pBufferOut2, temp.b8, + lengthInBytes2); + lengthInBytes2 = 0; + /* lengthInBytes2 == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]); + lengthInBytes2 = 0; + } + } + + if (lengthInBytes1 < lengthInBytes2) { + /* packet 2 is not completed since lengthInBytes2 > 0 + * packet 1 has less than 8 bytes. + */ + if (lengthInBytes1) { + kasumi_1_block(pCtx->sk16, b1.b16); + xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]); + } + /* move pointers to right variables for packet 1 */ + lengthInBytes1 = lengthInBytes2; + b1.b64[0] = b2.b64[0]; + a1.b64[0] = a2.b64[0]; + pBufferIn1 = pBufferIn2; + pBufferOut1 = pBufferOut2; + } else { /* lengthInBytes1 >= lengthInBytes2 */ + if (!lengthInBytes1) + /* both packets are completed */ + return; + /* process the remaining of packet 2 */ + if (lengthInBytes2) { + kasumi_1_block(pCtx->sk16, b2.b16); + xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]); + } + /* packet 1 is not completed */ + } + + /* process the length difference from ipkt1 and pkt2 */ + while (lengthInBytes1) { + /* KASUMI it to produce the next block of keystream */ + kasumi_1_block(pCtx->sk16, b1.b16); + + if (lengthInBytes1 > KASUMI_BLOCK_SIZE) { + pBufferIn1 = xor_keystrm_rev(pBufferOut1, + pBufferIn1, b1.b64[0]); + pBufferOut1 += KASUMI_BLOCK_SIZE; + /* loop variant */ + lengthInBytes1 -= KASUMI_BLOCK_SIZE; + + /* apply the modifier and update the block count */ + b1.b64[0] ^= a1.b64[0]; + b1.b16[0] ^= (uint16_t)++blkcnt; + } else if (lengthInBytes1 < KASUMI_BLOCK_SIZE) { + /* end of the loop, handle the last bytes */ + memcpy_keystrm(safeInBuf.b8, pBufferIn1, + lengthInBytes1); + xor_keystrm_rev(temp.b8, safeInBuf.b8, b1.b64[0]); + memcpy_keystrm(pBufferOut1, temp.b8, + lengthInBytes1); + lengthInBytes1 = 0; + /* lengthInBytes1 == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]); + lengthInBytes1 = 0; + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a1, sizeof(a1)); + clear_mem(&b1, sizeof(b1)); + clear_mem(&a2, sizeof(a2)); + clear_mem(&b2, sizeof(b2)); + clear_mem(&temp, sizeof(temp)); + clear_mem(&safeInBuf, sizeof(safeInBuf)); +#endif +} + +static inline void +kasumi_f8_3_buffer(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, const uint64_t IV3, + const void *pIn1, 
void *pOut1, + const void *pIn2, void *pOut2, + const void *pIn3, void *pOut3, + const uint32_t length) +{ + const uint8_t *pBufferIn1 = (const uint8_t *) pIn1; + uint8_t *pBufferOut1 = (uint8_t *) pOut1; + const uint8_t *pBufferIn2 = (const uint8_t *) pIn2; + uint8_t *pBufferOut2 = (uint8_t *) pOut2; + const uint8_t *pBufferIn3 = (const uint8_t *) pIn3; + uint8_t *pBufferOut3 = (uint8_t *) pOut3; + uint32_t lengthInBytes = length; + uint32_t blkcnt; + kasumi_union_t a1, b1; /* the modifier */ + kasumi_union_t a2, b2; /* the modifier */ + kasumi_union_t a3, b3; /* the modifier */ + SafeBuf safeInBuf1, safeInBuf2, safeInBuf3; + + /* IV Endianity */ + a1.b64[0] = BSWAP64(IV1); + a2.b64[0] = BSWAP64(IV2); + a3.b64[0] = BSWAP64(IV3); + + kasumi_3_blocks(pCtx->msk16, a1.b16, a2.b16, a3.b16); + + /* Final initialisation steps */ + blkcnt = 0; + b1.b64[0] = a1.b64[0]; + b2.b64[0] = a2.b64[0]; + b3.b64[0] = a3.b64[0]; + + /* Now run the block cipher for common packet lengthInBytes, a whole + * number of blocks */ + while (lengthInBytes) { + /* KASUMI it to produce the next block of keystream for all the + * packets */ + kasumi_3_blocks(pCtx->sk16, b1.b16, b2.b16, b3.b16); + + if (lengthInBytes > KASUMI_BLOCK_SIZE) { + /* xor and write keystream */ + pBufferIn1 = xor_keystrm_rev(pBufferOut1, + pBufferIn1, b1.b64[0]); + pBufferOut1 += KASUMI_BLOCK_SIZE; + pBufferIn2 = xor_keystrm_rev(pBufferOut2, + pBufferIn2, b2.b64[0]); + pBufferOut2 += KASUMI_BLOCK_SIZE; + pBufferIn3 = xor_keystrm_rev(pBufferOut3, + pBufferIn3, b3.b64[0]); + pBufferOut3 += KASUMI_BLOCK_SIZE; + /* loop variant */ + lengthInBytes -= KASUMI_BLOCK_SIZE; + + /* apply the modifier and update the block count */ + b1.b64[0] ^= a1.b64[0]; + b1.b16[0] ^= (uint16_t)++blkcnt; + b2.b64[0] ^= a2.b64[0]; + b2.b16[0] ^= (uint16_t)blkcnt; + b3.b64[0] ^= a3.b64[0]; + b3.b16[0] ^= (uint16_t)blkcnt; + } else if (lengthInBytes < KASUMI_BLOCK_SIZE) { + /* end of the loop, handle the last bytes */ + memcpy_keystrm(safeInBuf1.b8, pBufferIn1, + lengthInBytes); + xor_keystrm_rev(b1.b8, safeInBuf1.b8, b1.b64[0]); + memcpy_keystrm(pBufferOut1, b1.b8, lengthInBytes); + + memcpy_keystrm(safeInBuf2.b8, pBufferIn2, + lengthInBytes); + xor_keystrm_rev(b2.b8, safeInBuf2.b8, b2.b64[0]); + memcpy_keystrm(pBufferOut2, b2.b8, lengthInBytes); + + memcpy_keystrm(safeInBuf3.b8, pBufferIn3, + lengthInBytes); + xor_keystrm_rev(b3.b8, safeInBuf3.b8, b3.b64[0]); + memcpy_keystrm(pBufferOut3, b3.b8, lengthInBytes); + lengthInBytes = 0; + /* lengthInBytes == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]); + xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]); + xor_keystrm_rev(pBufferOut3, pBufferIn3, b3.b64[0]); + lengthInBytes = 0; + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a1, sizeof(a1)); + clear_mem(&b1, sizeof(b1)); + clear_mem(&a2, sizeof(a2)); + clear_mem(&b2, sizeof(b2)); + clear_mem(&a3, sizeof(a3)); + clear_mem(&b3, sizeof(b3)); + clear_mem(&safeInBuf1, sizeof(safeInBuf1)); + clear_mem(&safeInBuf2, sizeof(safeInBuf2)); + clear_mem(&safeInBuf3, sizeof(safeInBuf3)); +#endif +} + +/*--------------------------------------------------------- +* @description +* Kasumi F8 4 packet: +* Four packets enc/dec with the same key schedule. 
+* The 4 Ivs are independent and are passed as an array of values +* The packets are separate, the datalength is common +*---------------------------------------------------------*/ + +static inline void +kasumi_f8_4_buffer(const kasumi_key_sched_t *pCtx, const uint64_t IV1, + const uint64_t IV2, const uint64_t IV3, const uint64_t IV4, + const void *pIn1, void *pOut1, + const void *pIn2, void *pOut2, + const void *pIn3, void *pOut3, + const void *pIn4, void *pOut4, + const uint32_t length) +{ + const uint8_t *pBufferIn1 = (const uint8_t *) pIn1; + uint8_t *pBufferOut1 = (uint8_t *) pOut1; + const uint8_t *pBufferIn2 = (const uint8_t *) pIn2; + uint8_t *pBufferOut2 = (uint8_t *) pOut2; + const uint8_t *pBufferIn3 = (const uint8_t *) pIn3; + uint8_t *pBufferOut3 = (uint8_t *) pOut3; + const uint8_t *pBufferIn4 = (const uint8_t *) pIn4; + uint8_t *pBufferOut4 = (uint8_t *) pOut4; + uint32_t lengthInBytes = length; + uint32_t blkcnt; + kasumi_union_t a1, b1; /* the modifier */ + kasumi_union_t a2, b2; /* the modifier */ + kasumi_union_t a3, b3; /* the modifier */ + kasumi_union_t a4, b4; /* the modifier */ + uint16_t *pTemp[4] = {b1.b16, b2.b16, b3.b16, b4.b16}; + SafeBuf safeInBuf1, safeInBuf2, safeInBuf3, safeInBuf4; + + /* IV Endianity */ + b1.b64[0] = BSWAP64(IV1); + b2.b64[0] = BSWAP64(IV2); + b3.b64[0] = BSWAP64(IV3); + b4.b64[0] = BSWAP64(IV4); + + kasumi_4_blocks(pCtx->msk16, pTemp); + + /* Final initialisation steps */ + blkcnt = 0; + a1.b64[0] = b1.b64[0]; + a2.b64[0] = b2.b64[0]; + a3.b64[0] = b3.b64[0]; + a4.b64[0] = b4.b64[0]; + + /* Now run the block cipher for common packet lengthInBytes, a whole + * number of blocks */ + while (lengthInBytes) { + /* KASUMI it to produce the next block of keystream for all the + * packets */ + kasumi_4_blocks(pCtx->sk16, pTemp); + + if (lengthInBytes > KASUMI_BLOCK_SIZE) { + /* xor and write keystream */ + pBufferIn1 = xor_keystrm_rev(pBufferOut1, + pBufferIn1, b1.b64[0]); + pBufferOut1 += KASUMI_BLOCK_SIZE; + pBufferIn2 = xor_keystrm_rev(pBufferOut2, + pBufferIn2, b2.b64[0]); + pBufferOut2 += KASUMI_BLOCK_SIZE; + pBufferIn3 = xor_keystrm_rev(pBufferOut3, + pBufferIn3, b3.b64[0]); + pBufferOut3 += KASUMI_BLOCK_SIZE; + pBufferIn4 = xor_keystrm_rev(pBufferOut4, + pBufferIn4, b4.b64[0]); + pBufferOut4 += KASUMI_BLOCK_SIZE; + /* loop variant */ + lengthInBytes -= KASUMI_BLOCK_SIZE; + + /* apply the modifier and update the block count */ + b1.b64[0] ^= a1.b64[0]; + b1.b16[0] ^= (uint16_t)++blkcnt; + b2.b64[0] ^= a2.b64[0]; + b2.b16[0] ^= (uint16_t)blkcnt; + b3.b64[0] ^= a3.b64[0]; + b3.b16[0] ^= (uint16_t)blkcnt; + b4.b64[0] ^= a4.b64[0]; + b4.b16[0] ^= (uint16_t)blkcnt; + } else if (lengthInBytes < KASUMI_BLOCK_SIZE) { + /* end of the loop, handle the last bytes */ + memcpy_keystrm(safeInBuf1.b8, pBufferIn1, + lengthInBytes); + xor_keystrm_rev(b1.b8, safeInBuf1.b8, b1.b64[0]); + memcpy_keystrm(pBufferOut1, b1.b8, lengthInBytes); + + memcpy_keystrm(safeInBuf2.b8, pBufferIn2, + lengthInBytes); + xor_keystrm_rev(b2.b8, safeInBuf2.b8, b2.b64[0]); + memcpy_keystrm(pBufferOut2, b2.b8, lengthInBytes); + + memcpy_keystrm(safeInBuf3.b8, pBufferIn3, + lengthInBytes); + xor_keystrm_rev(b3.b8, safeInBuf3.b8, b3.b64[0]); + memcpy_keystrm(pBufferOut3, b3.b8, lengthInBytes); + + memcpy_keystrm(safeInBuf4.b8, pBufferIn4, + lengthInBytes); + xor_keystrm_rev(b4.b8, safeInBuf4.b8, b4.b64[0]); + memcpy_keystrm(pBufferOut4, b4.b8, lengthInBytes); + lengthInBytes = 0; + /* lengthInBytes == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut1, pBufferIn1, 
b1.b64[0]); + xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]); + xor_keystrm_rev(pBufferOut3, pBufferIn3, b3.b64[0]); + xor_keystrm_rev(pBufferOut4, pBufferIn4, b4.b64[0]); + lengthInBytes = 0; + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a1, sizeof(a1)); + clear_mem(&b1, sizeof(b1)); + clear_mem(&a2, sizeof(a2)); + clear_mem(&b2, sizeof(b2)); + clear_mem(&a3, sizeof(a3)); + clear_mem(&b3, sizeof(b3)); + clear_mem(&a4, sizeof(a4)); + clear_mem(&b4, sizeof(b4)); + clear_mem(&safeInBuf1, sizeof(safeInBuf1)); + clear_mem(&safeInBuf2, sizeof(safeInBuf2)); + clear_mem(&safeInBuf3, sizeof(safeInBuf3)); + clear_mem(&safeInBuf4, sizeof(safeInBuf4)); +#endif +} + +/*--------------------------------------------------------- +* @description +* Kasumi F8 2 packet: +* Two packets enc/dec with the same key schedule. +* The 2 Ivs are independent and are passed as an array of values. +* The packets are separate, the datalength is common +*---------------------------------------------------------*/ +/****************************************************************************** +* @description +* Kasumi F8 n packet: +* Performs F8 enc/dec on [n] packets. The operation is performed in-place. +* The input IV's are passed in Big Endian format. +* The KeySchedule is in Little Endian format. +*******************************************************************************/ + +static inline void +kasumi_f8_n_buffer(const kasumi_key_sched_t *pKeySchedule, const uint64_t IV[], + const void * const pIn[], void *pOut[], + const uint32_t lengths[], const uint32_t bufCount) +{ + if (bufCount > 16) { + pOut[0] = NULL; + printf("dataCount too high (%d)\n", bufCount); + return; + } + + uint32_t dataCount = bufCount; + kasumi_union_t A[NUM_PACKETS_16], temp[NUM_PACKETS_16], tempSort; + uint16_t *data[NUM_PACKETS_16]; + uint32_t dataLen[NUM_PACKETS_16]; + uint8_t *pDataOut[NUM_PACKETS_16] = {NULL}; + const uint8_t *pDataIn[NUM_PACKETS_16] = {NULL}; + const uint8_t *srctempbuff; + uint8_t *dsttempbuff; + uint32_t blkcnt = 0; + uint32_t len = 0; + uint32_t packet_idx, inner_idx, same_size_blocks; + int sortNeeded = 0, tempLen = 0; + SafeBuf safeInBuf; + + memcpy((void *)dataLen, lengths, dataCount * sizeof(uint32_t)); + memcpy((void *)pDataIn, pIn, dataCount * sizeof(void *)); + memcpy((void *)pDataOut, pOut, dataCount * sizeof(void *)); + + /* save the IV to A for each packet */ + packet_idx = dataCount; + while (packet_idx--) { + /*copy IV in reverse endian order as input IV is BE */ + temp[packet_idx].b64[0] = BSWAP64(IV[packet_idx]); + + /* set LE IV pointers */ + data[packet_idx] = temp[packet_idx].b16; + + /* check if all packets are sorted by decreasing length */ + if (packet_idx > 0 && + dataLen[packet_idx - 1] < dataLen[packet_idx]) + /* this packet array is not correctly sorted */ + sortNeeded = 1; + } + + /* do 1st kasumi block on A with modified key, this overwrites A */ + kasumiWrapperArray[dataCount](pKeySchedule->msk16, data); + + if (sortNeeded) { + /* sort packets in decreasing buffer size from [0] to [n]th + packet, + ** where buffer[0] will contain longest buffer and + buffer[n] will + contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths + */ + packet_idx = dataCount; + while (packet_idx--) { + inner_idx = packet_idx; + while (inner_idx--) { + if (dataLen[packet_idx] > dataLen[inner_idx]) { + + /* swap buffers to arrange in descending + * order from [0]. 
*/ + srctempbuff = pDataIn[packet_idx]; + dsttempbuff = pDataOut[packet_idx]; + tempSort = temp[packet_idx]; + tempLen = dataLen[packet_idx]; + + pDataIn[packet_idx] = + pDataIn[inner_idx]; + pDataOut[packet_idx] = + pDataOut[inner_idx]; + temp[packet_idx] = temp[inner_idx]; + dataLen[packet_idx] = + dataLen[inner_idx]; + + pDataIn[inner_idx] = srctempbuff; + pDataOut[inner_idx] = dsttempbuff; + temp[inner_idx] = tempSort; + dataLen[inner_idx] = tempLen; + } + } /* for inner packet idx (inner bubble-sort) */ + } /* for outer packet idx (outer bubble-sort) */ + } /* if sortNeeded */ + + packet_idx = dataCount; + while (packet_idx--) + /* copy the schedule */ + A[packet_idx].b64[0] = temp[packet_idx].b64[0]; + + while (dataCount > 0) { + /* max num of blocks left depends on roundUp(smallest packet), + * The shortest stream to process is always stored at location + * [dataCount - 1] + */ + same_size_blocks = + ((dataLen[dataCount - 1] + KASUMI_BLOCK_SIZE - 1) / + KASUMI_BLOCK_SIZE) - + blkcnt; + + /* process streams of complete blocks */ + while (same_size_blocks-- > 1) { + /* do kasumi block encryption */ + kasumiWrapperArray[dataCount](pKeySchedule->sk16, + data); + + packet_idx = dataCount; + while (packet_idx--) + xor_keystrm_rev(pDataOut[packet_idx] + len, + pDataIn[packet_idx] + len, + temp[packet_idx].b64[0]); + + /* length already done since the start of the packets */ + len += KASUMI_BLOCK_SIZE; + + /* block idx is incremented and rewritten in the + * keystream */ + blkcnt += 1; + packet_idx = dataCount; + while (packet_idx--) { + temp[packet_idx].b64[0] ^= A[packet_idx].b64[0]; + temp[packet_idx].b16[0] ^= (uint16_t)blkcnt; + } /* for packet_idx */ + + } /* while same_size_blocks (iteration on multiple blocks) */ + + /* keystream for last block of all packets */ + kasumiWrapperArray[dataCount](pKeySchedule->sk16, data); + + /* process incomplete blocks without overwriting past the buffer + * end */ + while ((dataCount > 0) && + (dataLen[dataCount - 1] < (len + KASUMI_BLOCK_SIZE))) { + + dataCount--; + /* incomplete block is copied into a temp buffer */ + memcpy_keystrm(safeInBuf.b8, pDataIn[dataCount] + len, + dataLen[dataCount] - len); + xor_keystrm_rev(temp[dataCount].b8, + safeInBuf.b8, + temp[dataCount].b64[0]); + + memcpy_keystrm(pDataOut[dataCount] + len, + temp[dataCount].b8, + dataLen[dataCount] - len); + } /* while dataCount */ + + /* process last blocks: it can be the last complete block of the + packets or, if + KASUMI_SAFE_BUFFER is defined, the last block (complete or not) + of the packets*/ + while ((dataCount > 0) && + (dataLen[dataCount - 1] <= (len + KASUMI_BLOCK_SIZE))) { + + dataCount--; + xor_keystrm_rev(pDataOut[dataCount] + len, + pDataIn[dataCount] + len, + temp[dataCount].b64[0]); + } /* while dataCount */ + /* block idx is incremented and rewritten in the keystream */ + blkcnt += 1; + + /* for the following packets, this block is not the last one: + dataCount is not decremented */ + packet_idx = dataCount; + while (packet_idx--) { + + xor_keystrm_rev(pDataOut[packet_idx] + len, + pDataIn[packet_idx] + len, + temp[packet_idx].b64[0]); + temp[packet_idx].b64[0] ^= A[packet_idx].b64[0]; + temp[packet_idx].b16[0] ^= (uint16_t)blkcnt; + } /* while packet_idx */ + + /* length already done since the start of the packets */ + len += KASUMI_BLOCK_SIZE; + + /* the remaining packets, if any, have now at least one valid + block, which might be complete or not */ + + } /* while (dataCount) */ +#ifdef SAFE_DATA + uint32_t i; + + /* Clear sensitive data in stack */ + for 
(i = 0; i < dataCount; i++) { + clear_mem(&A[i], sizeof(A[i])); + clear_mem(&temp[i], sizeof(temp[i])); + } + clear_mem(&tempSort, sizeof(tempSort)); + clear_mem(&safeInBuf, sizeof(safeInBuf)); +#endif +} + +static inline void +kasumi_f9_1_buffer(const kasumi_key_sched_t *pCtx, const void *dataIn, + const uint32_t length, void *pDigest) +{ + kasumi_union_t a, b, mask; + const uint64_t *pIn = (const uint64_t *)dataIn; + uint32_t lengthInBytes = length; + SafeBuf safeBuf; + + /* Init */ + a.b64[0] = 0; + b.b64[0] = 0; + mask.b64[0] = -1; + + /* Now run kasumi for all 8 byte blocks */ + while (lengthInBytes >= 8) { + + a.b64[0] ^= BSWAP64(*(pIn++)); + + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + + /* loop variant */ + lengthInBytes -= 8; /* done another 64 bits */ + + /* update */ + b.b64[0] ^= a.b64[0]; + } + + if (lengthInBytes) { + /* Not a whole 8 byte block remaining */ + mask.b64[0] = ~(mask.b64[0] >> (BYTESIZE * lengthInBytes)); + memcpy(&safeBuf.b64, pIn, lengthInBytes); + mask.b64[0] &= BSWAP64(safeBuf.b64); + a.b64[0] ^= mask.b64[0]; + + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + + /* update */ + b.b64[0] ^= a.b64[0]; + } + + /* Kasumi b */ + kasumi_1_block(pCtx->msk16, b.b16); + + /* swap result */ + *(uint32_t *)pDigest = bswap4(b.b32[1]); +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a, sizeof(a)); + clear_mem(&b, sizeof(b)); + clear_mem(&mask, sizeof(mask)); + clear_mem(&safeBuf, sizeof(safeBuf)); +#endif +} + +/*--------------------------------------------------------- +* @description +* Kasumi F9 1 packet with user config: +* Single packet digest with user defined IV, and precomputed key schedule. +* +* IV = swap32(count) << 32 | swap32(fresh) +* +*---------------------------------------------------------*/ + +static inline void +kasumi_f9_1_buffer_user(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pDataIn, const uint32_t length, + void *pDigest, const uint32_t direction) +{ + kasumi_union_t a, b, mask, message, temp; + uint32_t lengthInBits = length; + const uint64_t *pIn = (const uint64_t *)pDataIn; + kasumi_union_t safebuff; + + a.b64[0] = 0; + b.b64[0] = 0; + + /* Use the count and fresh for first round */ + a.b64[0] = BSWAP64(IV); + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + /* update */ + b.b64[0] = a.b64[0]; + + /* Now run kasumi for all 8 byte blocks */ + while (lengthInBits >= QWORDSIZEINBITS) { + a.b64[0] ^= BSWAP64(*(pIn++)); + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + /* loop variant */ + lengthInBits -= 64; /* done another 64 bits */ + /* update */ + b.b64[0] ^= a.b64[0]; + } + + /* Is there any non 8 byte blocks remaining ? 
*/ + if (lengthInBits == 0) { + /* last block is : direct + 1 + 62 0's */ + a.b64[0] ^= ((uint64_t)direction + direction + LAST_PADDING_BIT) + << (QWORDSIZEINBITS - 2); + kasumi_1_block(pCtx->sk16, a.b16); + /* update */ + b.b64[0] ^= a.b64[0]; + } else if (lengthInBits <= (QWORDSIZEINBITS - 2)) { + /* last block is : message + direction + LAST_PADDING_BITS(1) + + * less than 62 0's */ + mask.b64[0] = -1; + temp.b64[0] = 0; + message.b64[0] = 0; + mask.b64[0] = ~(mask.b64[0] >> lengthInBits); + /*round up and copy last lengthInBits */ + memcpy(&safebuff.b64[0], pIn, (lengthInBits + 7) / 8); + message.b64[0] = BSWAP64(safebuff.b64[0]); + temp.b64[0] = mask.b64[0] & message.b64[0]; + temp.b64[0] |= + ((uint64_t)direction + direction + LAST_PADDING_BIT) + << ((QWORDSIZEINBITS - 2) - lengthInBits); + a.b64[0] ^= temp.b64[0]; + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + + /* update */ + b.b64[0] ^= a.b64[0]; + } else if (lengthInBits == (QWORDSIZEINBITS - 1)) { + /* next block is : message + direct */ + /* last block is : 1 + 63 0's */ + a.b64[0] ^= direction | (~1 & BSWAP64(*(pIn++))); + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + /* update */ + b.b64[0] ^= a.b64[0]; + a.b8[QWORDSIZEINBYTES - 1] ^= (LAST_PADDING_BIT) + << (QWORDSIZEINBYTES - 1); + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + /* update */ + b.b64[0] ^= a.b64[0]; + } + /* Kasumi b */ + kasumi_1_block(pCtx->msk16, b.b16); + + /* swap result */ + *(uint32_t *)pDigest = bswap4(b.b32[1]); +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a, sizeof(a)); + clear_mem(&b, sizeof(b)); + clear_mem(&mask, sizeof(mask)); + clear_mem(&message, sizeof(message)); + clear_mem(&temp, sizeof(temp)); + clear_mem(&safebuff, sizeof(safebuff)); +#endif +} + +void kasumi_f8_1_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pBufferIn, void *pBufferOut, + const uint32_t cipherLengthInBytes); + +void kasumi_f8_1_buffer_bit_sse(const kasumi_key_sched_t *pCtx, + const uint64_t IV, + const void *pBufferIn, void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void kasumi_f8_2_buffer_sse(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2); + +void kasumi_f8_3_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV1, + const uint64_t IV2, const uint64_t IV3, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes); + +void kasumi_f8_4_buffer_sse(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const uint64_t IV3, const uint64_t IV4, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes); + +void kasumi_f8_n_buffer_sse(const kasumi_key_sched_t *pKeySchedule, + const uint64_t IV[], + const void * const pDataIn[], void *pDataOut[], + const uint32_t dataLen[], const uint32_t dataCount); + +void kasumi_f9_1_buffer_sse(const kasumi_key_sched_t *pCtx, + const void *pBufferIn, + const uint32_t lengthInBytes, void *pDigest); + +void kasumi_f9_1_buffer_user_sse(const kasumi_key_sched_t *pCtx, + const uint64_t IV, const void *pBufferIn, + const uint32_t lengthInBits, + void *pDigest, 
const uint32_t direction); + + +void kasumi_f8_1_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pBufferIn, void *pBufferOut, + const uint32_t cipherLengthInBytes); +void kasumi_f8_1_buffer_bit_avx(const kasumi_key_sched_t *pCtx, + const uint64_t IV, + const void *pBufferIn, void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); +void kasumi_f8_2_buffer_avx(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2); +void kasumi_f8_3_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV1, + const uint64_t IV2, const uint64_t IV3, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes); +void kasumi_f8_4_buffer_avx(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const uint64_t IV3, const uint64_t IV4, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes); +void kasumi_f8_n_buffer_avx(const kasumi_key_sched_t *pKeySchedule, + const uint64_t IV[], + const void * const pDataIn[], void *pDataOut[], + const uint32_t dataLen[], const uint32_t dataCount); + +void kasumi_f9_1_buffer_avx(const kasumi_key_sched_t *pCtx, + const void *pBufferIn, + const uint32_t lengthInBytes, void *pDigest); + +void kasumi_f9_1_buffer_user_avx(const kasumi_key_sched_t *pCtx, + const uint64_t IV, const void *pBufferIn, + const uint32_t lengthInBits, + void *pDigest, const uint32_t direction); +#endif /*_KASUMI_INTERNAL_H_*/ + diff --git a/src/spdk/intel-ipsec-mb/include/memcpy.asm b/src/spdk/intel-ipsec-mb/include/memcpy.asm new file mode 100644 index 000000000..82e4f2cb2 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/memcpy.asm @@ -0,0 +1,613 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef __MEMCPY_ASM__ +%define __MEMCPY_ASM__ + +%include "include/reg_sizes.asm" + + +; This section defines a series of macros to copy small to medium amounts +; of data from memory to memory, where the size is variable but limited. +; +; The macros are all called as: +; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3 +; with the parameters defined as: +; DST : register: pointer to dst (not modified) +; SRC : register: pointer to src (not modified) +; SIZE : register: length in bytes (not modified) +; TMP0 : 64-bit temp GPR (clobbered) +; TMP1 : 64-bit temp GPR (clobbered) +; XTMP0 : temp XMM (clobbered) +; XTMP1 : temp XMM (clobbered) +; XTMP2 : temp XMM (clobbered) +; XTMP3 : temp XMM (clobbered) +; +; The name indicates the options. The name is of the form: +; memcpy__ +; where: +; is either "sse" or "avx" or "avx2" +; is either "64" or "128" and defines largest value of SIZE +; is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0) +; is blank or "_ret". If blank, the code falls through. If "ret" +; it does a "ret" at the end +; +; For the avx2 versions, the temp XMM registers need to be YMM registers +; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as: +; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1 +; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3 +; +; For example: +; memcpy_sse_64 : SSE, 0 <= size < 64, falls through +; memcpy_avx_64_1 : AVX1, 1 <= size < 64, falls through +; memcpy_sse_128_ret : SSE, 0 <= size < 128, ends with ret +; mempcy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret +; + +%macro memcpy_sse_64 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0 +%endm + +%macro memcpy_sse_64_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0 +%endm + +%macro memcpy_sse_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0 +%endm + +%macro memcpy_sse_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0 +%endm + +%macro memcpy_sse_64_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0 +%endm + +%macro memcpy_sse_64_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0 +%endm + +%macro memcpy_sse_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0 +%endm + +%macro memcpy_sse_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0 +%endm + + +%macro memcpy_sse_16 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0 +%endm + +%macro memcpy_sse_16_1 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0 +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro memcpy_avx_64 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1 +%endm + +%macro memcpy_avx_64_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1 +%endm + +%macro memcpy_avx_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1 +%endm + +%macro memcpy_avx_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1 +%endm + +%macro memcpy_avx_64_ret 9 + __memcpy_int 
%1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1 +%endm + +%macro memcpy_avx_64_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1 +%endm + +%macro memcpy_avx_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1 +%endm + +%macro memcpy_avx_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1 +%endm + + +%macro memcpy_avx_16 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1 +%endm + +%macro memcpy_avx_16_1 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1 +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro memcpy_avx2_64 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2 +%endm + +%macro memcpy_avx2_64_1 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2 +%endm + +%macro memcpy_avx2_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2 +%endm + +%macro memcpy_avx2_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2 +%endm + +%macro memcpy_avx2_64_ret 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2 +%endm + +%macro memcpy_avx2_64_1_ret 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2 +%endm + +%macro memcpy_avx2_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2 +%endm + +%macro memcpy_avx2_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2 +%endm + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +%macro __memcpy_int 13 +%define %%DST %1 ; register: pointer to dst (not modified) +%define %%SRC %2 ; register: pointer to src (not modified) +%define %%SIZE %3 ; register: length in bytes (not modified) +%define %%TMP0 %4 ; 64-bit temp GPR (clobbered) +%define %%TMP1 %5 ; 64-bit temp GPR (clobbered) +%define %%XTMP0 %6 ; temp XMM (clobbered) +%define %%XTMP1 %7 ; temp XMM (clobbered) +%define %%XTMP2 %8 ; temp XMM (clobbered) +%define %%XTMP3 %9 ; temp XMM (clobbered) +%define %%NOT0 %10 ; if not 0, then assume size cannot be zero +%define %%MAXSIZE %11 ; 128, 64, etc +%define %%USERET %12 ; if not 0, use "ret" at end +%define %%USEAVX %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2 + +%if (%%USERET != 0) + %define %%DONE ret +%else + %define %%DONE jmp %%end +%endif + +%if (%%USEAVX != 0) + %define %%MOVDQU vmovdqu +%else + %define %%MOVDQU movdqu +%endif + +%if (%%MAXSIZE >= 128) + test %%SIZE, 64 + jz %%lt64 + %if (%%USEAVX >= 2) + %%MOVDQU %%XTMP0, [%%SRC + 0*32] + %%MOVDQU %%XTMP1, [%%SRC + 1*32] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32] + + %%MOVDQU [%%DST + 0*32], %%XTMP0 + %%MOVDQU [%%DST + 1*32], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3 + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + 1*16] + %%MOVDQU %%XTMP2, [%%SRC + 2*16] + %%MOVDQU %%XTMP3, [%%SRC + 3*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + 1*16], %%XTMP1 + %%MOVDQU [%%DST + 2*16], %%XTMP2 + %%MOVDQU [%%DST + 3*16], %%XTMP3 + + %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3 + %endif + %%DONE +%endif + +%if 
(%%MAXSIZE >= 64) +%%lt64: + test %%SIZE, 32 + jz %%lt32 + %if (%%USEAVX >= 2) + %%MOVDQU %%XTMP0, [%%SRC + 0*32] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32] + %%MOVDQU [%%DST + 0*32], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1 + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + 1*16] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + 1*16], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3 + %endif + %%DONE +%endif + +%if (%%MAXSIZE >= 32) +%%lt32: + test %%SIZE, 16 + jz %%lt16 + %if (%%USEAVX >= 2) + %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16] + %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0) + %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1) + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1 + %endif + %%DONE +%endif + +%if (%%MAXSIZE >= 16) +%%lt16: + test %%SIZE, 8 + jz %%lt8 + mov %%TMP0, [%%SRC] + mov %%TMP1, [%%SRC + %%SIZE - 8] + mov [%%DST], %%TMP0 + mov [%%DST + %%SIZE - 8], %%TMP1 + %%DONE +%endif + +%if (%%MAXSIZE >= 8) +%%lt8: + test %%SIZE, 4 + jz %%lt4 + mov DWORD(%%TMP0), [%%SRC] + mov DWORD(%%TMP1), [%%SRC + %%SIZE - 4] + mov [%%DST], DWORD(%%TMP0) + mov [%%DST + %%SIZE - 4], DWORD(%%TMP1) + %%DONE +%endif + +%if (%%MAXSIZE >= 4) +%%lt4: + test %%SIZE, 2 + jz %%lt2 + movzx DWORD(%%TMP0), word [%%SRC] + movzx DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1] + mov [%%DST], WORD(%%TMP0) + mov [%%DST + %%SIZE - 1], BYTE(%%TMP1) + %%DONE +%endif + +%%lt2: +%if (%%NOT0 == 0) + test %%SIZE, 1 + jz %%end +%endif + movzx DWORD(%%TMP0), byte [%%SRC] + mov [%%DST], BYTE(%%TMP0) +%%end: +%if (%%USERET != 0) + ret +%endif +%endm + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Utility macro to assist with SIMD shifting +%macro _PSRLDQ 3 +%define %%VEC %1 +%define %%REG %2 +%define %%IMM %3 + +%ifidn %%VEC, SSE + psrldq %%REG, %%IMM +%else + vpsrldq %%REG, %%REG, %%IMM +%endif +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; This section defines a series of macros to store small to medium amounts +; of data from SIMD registers to memory, where the size is variable but limited. +; +; The macros are all called as: +; memcpy DST, SRC, SIZE, TMP, IDX +; with the parameters defined as: +; DST : register: pointer to dst (not modified) +; SRC : register: src data (clobbered) +; SIZE : register: length in bytes (not modified) +; TMP : 64-bit temp GPR (clobbered) +; IDX : 64-bit GPR to store dst index/offset (clobbered) +; +; The name indicates the options. The name is of the form: +; simd_store_ +; where is the SIMD instruction type e.g. 
"sse" or "avx" + + +%macro simd_store_sse 5 + __simd_store %1,%2,%3,%4,%5,SSE +%endm + +%macro simd_store_avx 5 + __simd_store %1,%2,%3,%4,%5,AVX +%endm + +%macro simd_store_sse_15 5 + __simd_store %1,%2,%3,%4,%5,SSE,15 +%endm + +%macro simd_store_avx_15 5 + __simd_store %1,%2,%3,%4,%5,AVX,15 +%endm + +%macro __simd_store 6-7 +%define %%DST %1 ; register: pointer to dst (not modified) +%define %%SRC %2 ; register: src data (clobbered) +%define %%SIZE %3 ; register: length in bytes (not modified) +%define %%TMP %4 ; 64-bit temp GPR (clobbered) +%define %%IDX %5 ; 64-bit temp GPR to store dst idx (clobbered) +%define %%SIMDTYPE %6 ; "SSE" or "AVX" +%define %%MAX_LEN %7 ; [optional] maximum length to be stored, default 16 + +%define %%PSRLDQ _PSRLDQ %%SIMDTYPE, + +%ifidn %%SIMDTYPE, SSE + %define %%MOVDQU movdqu + %define %%MOVQ movq +%else + %define %%MOVDQU vmovdqu + %define %%MOVQ vmovq +%endif + +;; determine max byte size for store operation +%if %0 > 6 +%assign max_length_to_store %%MAX_LEN +%else +%assign max_length_to_store 16 +%endif + +%if max_length_to_store > 16 +%error "__simd_store macro invoked with MAX_LEN bigger than 16!" +%endif + + xor %%IDX, %%IDX ; zero idx + +%if max_length_to_store == 16 + test %%SIZE, 16 + jz %%lt16 + %%MOVDQU [%%DST], %%SRC + jmp %%end +%%lt16: +%endif + +%if max_length_to_store >= 8 + test %%SIZE, 8 + jz %%lt8 + %%MOVQ [%%DST + %%IDX], %%SRC + %%PSRLDQ %%SRC, 8 + add %%IDX, 8 +%%lt8: +%endif + + %%MOVQ %%TMP, %%SRC ; use GPR from now on + +%if max_length_to_store >= 4 + test %%SIZE, 4 + jz %%lt4 + mov [%%DST + %%IDX], DWORD(%%TMP) + shr %%TMP, 32 + add %%IDX, 4 +%%lt4: +%endif + + test %%SIZE, 2 + jz %%lt2 + mov [%%DST + %%IDX], WORD(%%TMP) + shr %%TMP, 16 + add %%IDX, 2 +%%lt2: + test %%SIZE, 1 + jz %%end + mov [%%DST + %%IDX], BYTE(%%TMP) +%%end: +%endm + +; This section defines a series of macros to load small to medium amounts +; (from 0 to 16 bytes) of data from memory to SIMD registers, +; where the size is variable but limited. +; +; The macros are all called as: +; simd_load DST, SRC, SIZE +; with the parameters defined as: +; DST : register: destination XMM register +; SRC : register: pointer to src data (not modified) +; SIZE : register: length in bytes (not modified) +; +; The name indicates the options. The name is of the form: +; simd_load__ +; where: +; is either "sse" or "avx" +; is either "15" or "16" and defines largest value of SIZE +; is blank or "_1". 
If "_1" then the min SIZE is 1 (otherwise 0) +; +; For example: +; simd_load_sse_16 : SSE, 0 <= size <= 16 +; simd_load_avx_15_1 : AVX, 1 <= size <= 15 + +%macro simd_load_sse_15_1 3 + __simd_load %1,%2,%3,0,0,SSE +%endm +%macro simd_load_sse_15 3 + __simd_load %1,%2,%3,1,0,SSE +%endm +%macro simd_load_sse_16_1 3 + __simd_load %1,%2,%3,0,1,SSE +%endm +%macro simd_load_sse_16 3 + __simd_load %1,%2,%3,1,1,SSE +%endm + +%macro simd_load_avx_15_1 3 + __simd_load %1,%2,%3,0,0,AVX +%endm +%macro simd_load_avx_15 3 + __simd_load %1,%2,%3,1,0,AVX +%endm +%macro simd_load_avx_16_1 3 + __simd_load %1,%2,%3,0,1,AVX +%endm +%macro simd_load_avx_16 3 + __simd_load %1,%2,%3,1,1,AVX +%endm + +%macro __simd_load 6 +%define %%DST %1 ; [out] destination XMM register +%define %%SRC %2 ; [in] pointer to src data +%define %%SIZE %3 ; [in] length in bytes (0-16 bytes) +%define %%ACCEPT_0 %4 ; 0 = min length = 1, 1 = min length = 0 +%define %%ACCEPT_16 %5 ; 0 = max length = 15 , 1 = max length = 16 +%define %%SIMDTYPE %6 ; "SSE" or "AVX" + +%ifidn %%SIMDTYPE, SSE + %define %%MOVDQU movdqu + %define %%PINSRB pinsrb + %define %%PINSRQ pinsrq + %define %%PXOR pxor +%else + %define %%MOVDQU vmovdqu + %define %%PINSRB vpinsrb + %define %%PINSRQ vpinsrq + %define %%PXOR vpxor +%endif + +%if (%%ACCEPT_16 != 0) + test %%SIZE, 16 + jz %%_skip_16 + %%MOVDQU %%DST, [%%SRC] + jmp %%end_load + +%%_skip_16: +%endif + %%PXOR %%DST, %%DST ; clear XMM register +%if (%%ACCEPT_0 != 0) + or %%SIZE, %%SIZE + je %%end_load +%endif + cmp %%SIZE, 1 + je %%_size_1 + cmp %%SIZE, 2 + je %%_size_2 + cmp %%SIZE, 3 + je %%_size_3 + cmp %%SIZE, 4 + je %%_size_4 + cmp %%SIZE, 5 + je %%_size_5 + cmp %%SIZE, 6 + je %%_size_6 + cmp %%SIZE, 7 + je %%_size_7 + cmp %%SIZE, 8 + je %%_size_8 + cmp %%SIZE, 9 + je %%_size_9 + cmp %%SIZE, 10 + je %%_size_10 + cmp %%SIZE, 11 + je %%_size_11 + cmp %%SIZE, 12 + je %%_size_12 + cmp %%SIZE, 13 + je %%_size_13 + cmp %%SIZE, 14 + je %%_size_14 + +%%_size_15: + %%PINSRB %%DST, [%%SRC + 14], 14 +%%_size_14: + %%PINSRB %%DST, [%%SRC + 13], 13 +%%_size_13: + %%PINSRB %%DST, [%%SRC + 12], 12 +%%_size_12: + %%PINSRB %%DST, [%%SRC + 11], 11 +%%_size_11: + %%PINSRB %%DST, [%%SRC + 10], 10 +%%_size_10: + %%PINSRB %%DST, [%%SRC + 9], 9 +%%_size_9: + %%PINSRB %%DST, [%%SRC + 8], 8 +%%_size_8: + %%PINSRQ %%DST, [%%SRC], 0 + jmp %%end_load +%%_size_7: + %%PINSRB %%DST, [%%SRC + 6], 6 +%%_size_6: + %%PINSRB %%DST, [%%SRC + 5], 5 +%%_size_5: + %%PINSRB %%DST, [%%SRC + 4], 4 +%%_size_4: + %%PINSRB %%DST, [%%SRC + 3], 3 +%%_size_3: + %%PINSRB %%DST, [%%SRC + 2], 2 +%%_size_2: + %%PINSRB %%DST, [%%SRC + 1], 1 +%%_size_1: + %%PINSRB %%DST, [%%SRC + 0], 0 +%%end_load: +%endm +%endif ; ifndef __MEMCPY_ASM__ diff --git a/src/spdk/intel-ipsec-mb/include/noaesni.h b/src/spdk/intel-ipsec-mb/include/noaesni.h new file mode 100644 index 000000000..30d970edf --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/noaesni.h @@ -0,0 +1,65 @@ +/******************************************************************************* + Copyright (c) 2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "intel-ipsec-mb.h" + +#ifndef NOAESNI_H +#define NOAESNI_H + +IMB_DLL_EXPORT void init_mb_mgr_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_nocheck_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT JOB_AES_HMAC *flush_job_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT uint32_t queue_size_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT JOB_AES_HMAC *get_completed_job_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT JOB_AES_HMAC *get_next_job_sse_no_aesni(MB_MGR *state); + +IMB_DLL_EXPORT void +aes_keyexp_128_sse_no_aesni(const void *key, void *enc_exp_keys, + void *dec_exp_keys); +IMB_DLL_EXPORT void +aes_keyexp_192_sse_no_aesni(const void *key, void *enc_exp_keys, + void *dec_exp_keys); +IMB_DLL_EXPORT void +aes_keyexp_256_sse_no_aesni(const void *key, void *enc_exp_keys, + void *dec_exp_keys); +IMB_DLL_EXPORT void +aes_xcbc_expand_key_sse_no_aesni(const void *key, void *k1_exp, void *k2, + void *k3); +IMB_DLL_EXPORT void +aes_keyexp_128_enc_sse_no_aesni(const void *key, void *enc_exp_keys); +IMB_DLL_EXPORT void +aes_keyexp_192_enc_sse_no_aesni(const void *key, void *enc_exp_keys); +IMB_DLL_EXPORT void +aes_keyexp_256_enc_sse_no_aesni(const void *key, void *enc_exp_keys); +IMB_DLL_EXPORT void +aes_cmac_subkey_gen_sse_no_aesni(const void *key_exp, void *key1, void *key2); +IMB_DLL_EXPORT void +aes_cfb_128_one_sse_no_aesni(void *out, const void *in, const void *iv, + const void *keys, uint64_t len); + +#endif /* NOAESNI_H */ diff --git a/src/spdk/intel-ipsec-mb/include/os.asm b/src/spdk/intel-ipsec-mb/include/os.asm new file mode 100644 index 000000000..f54043ed2 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/os.asm @@ -0,0 +1,58 @@ +;; +;; Copyright (c) 2017-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; +%ifndef OS_ASM_FILE +%define OS_ASM_FILE + +%ifndef WIN_ABI +%ifidn __OUTPUT_FORMAT__, win64 +%define WIN_ABI +%endif +%endif + +%ifndef LINUX +%ifidn __OUTPUT_FORMAT__, elf64 +%define LINUX +%endif +%endif + +%ifdef LINUX +;;; macro to declare global symbols +;;; - name : symbol name +;;; - type : funtion or data +;;; - scope : internal, private, default +%define MKGLOBAL(name,type,scope) global name %+ : %+ type scope +%endif ; LINUX + +%ifdef WIN_ABI +;;; macro to declare global symbols +;;; - name : symbol name +;;; - type : funtion or data +;;; - scope : internal, private, default (ignored in win64 coff format) +%define MKGLOBAL(name,type,scope) global name +%endif ; WIN_ABI + +%endif ; OS_ASM_FILE diff --git a/src/spdk/intel-ipsec-mb/include/reg_sizes.asm b/src/spdk/intel-ipsec-mb/include/reg_sizes.asm new file mode 100644 index 000000000..c9f9f8cd2 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/reg_sizes.asm @@ -0,0 +1,300 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +; define d and w variants for registers + +%ifndef _REG_SIZES_ASM_ +%define _REG_SIZES_ASM_ + +%define raxd eax +%define raxw ax +%define raxb al + +%define rbxd ebx +%define rbxw bx +%define rbxb bl + +%define rcxd ecx +%define rcxw cx +%define rcxb cl + +%define rdxd edx +%define rdxw dx +%define rdxb dl + +%define rsid esi +%define rsiw si +%define rsib sil + +%define rdid edi +%define rdiw di +%define rdib dil + +%define rbpd ebp +%define rbpw bp +%define rbpb bpl + +%define zmm0x xmm0 +%define zmm1x xmm1 +%define zmm2x xmm2 +%define zmm3x xmm3 +%define zmm4x xmm4 +%define zmm5x xmm5 +%define zmm6x xmm6 +%define zmm7x xmm7 +%define zmm8x xmm8 +%define zmm9x xmm9 +%define zmm10x xmm10 +%define zmm11x xmm11 +%define zmm12x xmm12 +%define zmm13x xmm13 +%define zmm14x xmm14 +%define zmm15x xmm15 +%define zmm16x xmm16 +%define zmm17x xmm17 +%define zmm18x xmm18 +%define zmm19x xmm19 +%define zmm20x xmm20 +%define zmm21x xmm21 +%define zmm22x xmm22 +%define zmm23x xmm23 +%define zmm24x xmm24 +%define zmm25x xmm25 +%define zmm26x xmm26 +%define zmm27x xmm27 +%define zmm28x xmm28 +%define zmm29x xmm29 +%define zmm30x xmm30 +%define zmm31x xmm31 + +%define ymm0x xmm0 +%define ymm1x xmm1 +%define ymm2x xmm2 +%define ymm3x xmm3 +%define ymm4x xmm4 +%define ymm5x xmm5 +%define ymm6x xmm6 +%define ymm7x xmm7 +%define ymm8x xmm8 +%define ymm9x xmm9 +%define ymm10x xmm10 +%define ymm11x xmm11 +%define ymm12x xmm12 +%define ymm13x xmm13 +%define ymm14x xmm14 +%define ymm15x xmm15 +%define ymm16x xmm16 +%define ymm17x xmm17 +%define ymm18x xmm18 +%define ymm19x xmm19 +%define ymm20x xmm20 +%define ymm21x xmm21 +%define ymm22x xmm22 +%define ymm23x xmm23 +%define ymm24x xmm24 +%define ymm25x xmm25 +%define ymm26x xmm26 +%define ymm27x xmm27 +%define ymm28x xmm28 +%define ymm29x xmm29 +%define ymm30x xmm30 +%define ymm31x xmm31 + +%define xmm0x xmm0 +%define xmm1x xmm1 +%define xmm2x xmm2 +%define xmm3x xmm3 +%define xmm4x xmm4 +%define xmm5x xmm5 +%define xmm6x xmm6 +%define xmm7x xmm7 +%define xmm8x xmm8 +%define xmm9x xmm9 +%define xmm10x xmm10 +%define xmm11x xmm11 +%define xmm12x xmm12 +%define xmm13x xmm13 +%define xmm14x xmm14 +%define xmm15x xmm15 +%define xmm16x xmm16 +%define xmm17x xmm17 +%define xmm18x xmm18 +%define xmm19x xmm19 +%define xmm20x xmm20 +%define xmm21x xmm21 +%define xmm22x xmm22 +%define xmm23x xmm23 +%define xmm24x xmm24 +%define xmm25x xmm25 +%define xmm26x xmm26 +%define xmm27x xmm27 +%define xmm28x xmm28 +%define xmm29x xmm29 +%define xmm30x xmm30 +%define xmm31x xmm31 + +%define zmm0y ymm0 +%define zmm1y ymm1 +%define zmm2y ymm2 +%define zmm3y ymm3 +%define zmm4y ymm4 +%define zmm5y ymm5 +%define zmm6y ymm6 +%define zmm7y ymm7 +%define zmm8y ymm8 +%define zmm9y ymm9 +%define zmm10y ymm10 +%define zmm11y ymm11 +%define zmm12y ymm12 +%define zmm13y ymm13 +%define zmm14y ymm14 +%define zmm15y ymm15 +%define zmm16y ymm16 +%define zmm17y ymm17 +%define zmm18y ymm18 +%define zmm19y ymm19 +%define zmm20y ymm20 +%define zmm21y ymm21 +%define zmm22y ymm22 +%define zmm23y ymm23 +%define zmm24y ymm24 +%define zmm25y ymm25 +%define zmm26y ymm26 +%define zmm27y ymm27 +%define zmm28y ymm28 +%define zmm29y ymm29 +%define zmm30y ymm30 +%define zmm31y ymm31 + +%define xmm0y ymm0 +%define xmm1y ymm1 +%define xmm2y ymm2 +%define xmm3y ymm3 +%define xmm4y ymm4 +%define xmm5y ymm5 +%define xmm6y ymm6 +%define xmm7y ymm7 +%define xmm8y ymm8 +%define xmm9y ymm9 +%define xmm10y ymm10 +%define xmm11y ymm11 +%define xmm12y ymm12 +%define xmm13y ymm13 +%define xmm14y ymm14 +%define xmm15y 
ymm15 +%define xmm16y ymm16 +%define xmm17y ymm17 +%define xmm18y ymm18 +%define xmm19y ymm19 +%define xmm20y ymm20 +%define xmm21y ymm21 +%define xmm22y ymm22 +%define xmm23y ymm23 +%define xmm24y ymm24 +%define xmm25y ymm25 +%define xmm26y ymm26 +%define xmm27y ymm27 +%define xmm28y ymm28 +%define xmm29y ymm29 +%define xmm30y ymm30 +%define xmm31y ymm31 + +%define xmm0z zmm0 +%define xmm1z zmm1 +%define xmm2z zmm2 +%define xmm3z zmm3 +%define xmm4z zmm4 +%define xmm5z zmm5 +%define xmm6z zmm6 +%define xmm7z zmm7 +%define xmm8z zmm8 +%define xmm9z zmm9 +%define xmm10z zmm10 +%define xmm11z zmm11 +%define xmm12z zmm12 +%define xmm13z zmm13 +%define xmm14z zmm14 +%define xmm15z zmm15 +%define xmm16z zmm16 +%define xmm17z zmm17 +%define xmm18z zmm18 +%define xmm19z zmm19 +%define xmm20z zmm20 +%define xmm21z zmm21 +%define xmm22z zmm22 +%define xmm23z zmm23 +%define xmm24z zmm24 +%define xmm25z zmm25 +%define xmm26z zmm26 +%define xmm27z zmm27 +%define xmm28z zmm28 +%define xmm29z zmm29 +%define xmm30z zmm30 +%define xmm31z zmm31 + +%define ymm0z zmm0 +%define ymm1z zmm1 +%define ymm2z zmm2 +%define ymm3z zmm3 +%define ymm4z zmm4 +%define ymm5z zmm5 +%define ymm6z zmm6 +%define ymm7z zmm7 +%define ymm8z zmm8 +%define ymm9z zmm9 +%define ymm10z zmm10 +%define ymm11z zmm11 +%define ymm12z zmm12 +%define ymm13z zmm13 +%define ymm14z zmm14 +%define ymm15z zmm15 +%define ymm16z zmm16 +%define ymm17z zmm17 +%define ymm18z zmm18 +%define ymm19z zmm19 +%define ymm20z zmm20 +%define ymm21z zmm21 +%define ymm22z zmm22 +%define ymm23z zmm23 +%define ymm24z zmm24 +%define ymm25z zmm25 +%define ymm26z zmm26 +%define ymm27z zmm27 +%define ymm28z zmm28 +%define ymm29z zmm29 +%define ymm30z zmm30 +%define ymm31z zmm31 + +%define DWORD(reg) reg %+ d +%define WORD(reg) reg %+ w +%define BYTE(reg) reg %+ b + +%define XWORD(reg) reg %+ x +%define YWORD(reg) reg %+ y +%define ZWORD(reg) reg %+ z + +%endif ;; _REG_SIZES_ASM_ diff --git a/src/spdk/intel-ipsec-mb/include/save_xmms.asm b/src/spdk/intel-ipsec-mb/include/save_xmms.asm new file mode 100644 index 000000000..c9fd67eb5 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/save_xmms.asm @@ -0,0 +1,132 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" + +%ifdef LINUX +%define ARG1 rdi +%else +%define ARG1 rcx +%endif + +section .text +; void save_xmms(UINT128 array[10]) +MKGLOBAL(save_xmms,function,internal) +save_xmms: + movdqa [ARG1 + 0*16], xmm6 + movdqa [ARG1 + 1*16], xmm7 + movdqa [ARG1 + 2*16], xmm8 + movdqa [ARG1 + 3*16], xmm9 + movdqa [ARG1 + 4*16], xmm10 + movdqa [ARG1 + 5*16], xmm11 + movdqa [ARG1 + 6*16], xmm12 + movdqa [ARG1 + 7*16], xmm13 + movdqa [ARG1 + 8*16], xmm14 + movdqa [ARG1 + 9*16], xmm15 + ret + + +; void restore_xmms(UINT128 array[10]) +MKGLOBAL(restore_xmms,function,internal) +restore_xmms: + movdqa xmm6, [ARG1 + 0*16] + movdqa xmm7, [ARG1 + 1*16] + movdqa xmm8, [ARG1 + 2*16] + movdqa xmm9, [ARG1 + 3*16] + movdqa xmm10, [ARG1 + 4*16] + movdqa xmm11, [ARG1 + 5*16] + movdqa xmm12, [ARG1 + 6*16] + movdqa xmm13, [ARG1 + 7*16] + movdqa xmm14, [ARG1 + 8*16] + movdqa xmm15, [ARG1 + 9*16] +%ifdef SAFE_DATA + ;; Clear potential sensitive data stored in stack + pxor xmm0, xmm0 + movdqa [ARG1 + 0 * 16], xmm0 + movdqa [ARG1 + 1 * 16], xmm0 + movdqa [ARG1 + 2 * 16], xmm0 + movdqa [ARG1 + 3 * 16], xmm0 + movdqa [ARG1 + 4 * 16], xmm0 + movdqa [ARG1 + 5 * 16], xmm0 + movdqa [ARG1 + 6 * 16], xmm0 + movdqa [ARG1 + 7 * 16], xmm0 + movdqa [ARG1 + 8 * 16], xmm0 + movdqa [ARG1 + 9 * 16], xmm0 +%endif + + ret + + + ; void save_xmms_avx(UINT128 array[10]) +MKGLOBAL(save_xmms_avx,function,internal) +save_xmms_avx: + vmovdqa [ARG1 + 0*16], xmm6 + vmovdqa [ARG1 + 1*16], xmm7 + vmovdqa [ARG1 + 2*16], xmm8 + vmovdqa [ARG1 + 3*16], xmm9 + vmovdqa [ARG1 + 4*16], xmm10 + vmovdqa [ARG1 + 5*16], xmm11 + vmovdqa [ARG1 + 6*16], xmm12 + vmovdqa [ARG1 + 7*16], xmm13 + vmovdqa [ARG1 + 8*16], xmm14 + vmovdqa [ARG1 + 9*16], xmm15 + ret + + +; void restore_xmms_avx(UINT128 array[10]) +MKGLOBAL(restore_xmms_avx,function,internal) +restore_xmms_avx: + vmovdqa xmm6, [ARG1 + 0*16] + vmovdqa xmm7, [ARG1 + 1*16] + vmovdqa xmm8, [ARG1 + 2*16] + vmovdqa xmm9, [ARG1 + 3*16] + vmovdqa xmm10, [ARG1 + 4*16] + vmovdqa xmm11, [ARG1 + 5*16] + vmovdqa xmm12, [ARG1 + 6*16] + vmovdqa xmm13, [ARG1 + 7*16] + vmovdqa xmm14, [ARG1 + 8*16] + vmovdqa xmm15, [ARG1 + 9*16] + +%ifdef SAFE_DATA + ;; Clear potential sensitive data stored in stack + vpxor xmm0, xmm0 + vmovdqa [ARG1 + 0 * 16], xmm0 + vmovdqa [ARG1 + 1 * 16], xmm0 + vmovdqa [ARG1 + 2 * 16], xmm0 + vmovdqa [ARG1 + 3 * 16], xmm0 + vmovdqa [ARG1 + 4 * 16], xmm0 + vmovdqa [ARG1 + 5 * 16], xmm0 + vmovdqa [ARG1 + 6 * 16], xmm0 + vmovdqa [ARG1 + 7 * 16], xmm0 + vmovdqa [ARG1 + 8 * 16], xmm0 + vmovdqa [ARG1 + 9 * 16], xmm0 +%endif + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/include/save_xmms.h b/src/spdk/intel-ipsec-mb/include/save_xmms.h new file mode 100644 index 000000000..e711958da --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/save_xmms.h @@ -0,0 +1,39 @@ +/******************************************************************************* + Copyright (c) 2012-2018, Intel 
Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef SAVE_XMMS_H +#define SAVE_XMMS_H + +#include "intel-ipsec-mb.h" + +void save_xmms(uint128_t array[10]); +void restore_xmms(uint128_t array[10]); + +void save_xmms_avx(uint128_t array[10]); +void restore_xmms_avx(uint128_t array[10]); + +#endif /* SAVE_XMMS_H */ diff --git a/src/spdk/intel-ipsec-mb/include/snow3g.h b/src/spdk/intel-ipsec-mb/include/snow3g.h new file mode 100644 index 000000000..520a4b41f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/snow3g.h @@ -0,0 +1,511 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef _SNOW3G_H_ +#define _SNOW3G_H_ + +/******************************************************************************* + * SSE + ******************************************************************************/ +void +snow3g_f8_1_buffer_bit_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void +snow3g_f8_2_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_sse(const snow3g_key_schedule_t * const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_sse(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void 
*pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_sse(void); + +int +snow3g_init_key_sched_sse(const void *pKey, snow3g_key_schedule_t *pCtx); + +/******************************************************************************* + * SSE NO-AESNI + ******************************************************************************/ +void +snow3g_f8_1_buffer_bit_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void +snow3g_f8_2_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_sse_no_aesni(const snow3g_key_schedule_t * const + pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_sse_no_aesni(const snow3g_key_schedule_t * const + pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_sse_no_aesni(void); + +int +snow3g_init_key_sched_sse_no_aesni(const void *pKey, + snow3g_key_schedule_t *pCtx); + +/******************************************************************************* + * AVX + 
******************************************************************************/ +void +snow3g_f8_1_buffer_bit_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void +snow3g_f8_2_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_avx(const snow3g_key_schedule_t * const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_avx(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_avx(void); + +int +snow3g_init_key_sched_avx(const void *pKey, snow3g_key_schedule_t *pCtx); + +/******************************************************************************* + * AVX2 + ******************************************************************************/ + +void +snow3g_f8_1_buffer_bit_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void 
+snow3g_f8_2_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_avx2(const snow3g_key_schedule_t * const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_avx2(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_avx2(void); + +int +snow3g_init_key_sched_avx2(const void *pKey, snow3g_key_schedule_t *pCtx); + +#endif /* _SNOW3G_H_ */ diff --git a/src/spdk/intel-ipsec-mb/include/snow3g_common.h b/src/spdk/intel-ipsec-mb/include/snow3g_common.h new file mode 100644 index 000000000..d7c7e63c1 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/snow3g_common.h @@ -0,0 +1,2840 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/*----------------------------------------------------------------------- + * + * An implementation of SNOW 3G, the core algorithm for the + * 3GPP Confidentiality and Integrity algorithms. + * + *-----------------------------------------------------------------------*/ + +#ifndef SNOW3G_COMMON_H +#define SNOW3G_COMMON_H + +#include +#include +#include + +#include "intel-ipsec-mb.h" +#include "include/snow3g.h" +#include "include/snow3g_internal.h" +#include "clear_regs_mem.h" + +#define CLEAR_MEM clear_mem +#define CLEAR_VAR clear_var + +/* ------------------------------------------------------------------- + * LFSR array shift by 1 position, 4 packets at a time + * ------------------------------------------------------------------ */ + +#ifdef AVX2 +/* LFSR array shift */ +static inline void ShiftLFSR_8(snow3gKeyState8_t *pCtx) +{ + pCtx->iLFSR_X = (pCtx->iLFSR_X + 1) & 15; +} +#endif /* AVX2 */ + +/* LFSR array shift */ +static inline void ShiftLFSR_4(snow3gKeyState4_t *pCtx) +{ + pCtx->iLFSR_X = (pCtx->iLFSR_X + 1) % 16; +} + +/*--------------------------------------------------------- + * @description + * Gf2 modular multiplication/reduction + * + *---------------------------------------------------------*/ +static inline uint64_t multiply_and_reduce64(uint64_t a, uint64_t b) +{ + uint64_t msk; + uint64_t res = 0; + uint64_t i = 64; + + while (i--) { + msk = ((int64_t)res >> 63) & 0x1b; + res <<= 1; + res ^= msk; + msk = ((int64_t)b >> 63) & a; + b <<= 1; + res ^= msk; + } + return res; +} + +#ifdef AVX2 +/* ------------------------------------------------------------------- + * ClockLFSR sub-function as defined in snow3g standard + * S = LFSR[2] + * ^ table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] & 0xff] + * ------------------------------------------------------------------ */ +static void C0_C11_8(__m256i *S, const __m256i *L0, const __m256i *L11) +{ + __m256i mask, Sx, B11, B0, offset; + + offset = _mm256_set1_epi32(3); + mask = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008, 0xF0F0F00C, + 0xF0F0F000, 0xF0F0F004, 0xF0F0F008, + 0xF0F0F00C); + B11 = _mm256_shuffle_epi8(*L11, mask); + *S = _mm256_i32gather_epi32(snow3g_table_A_div, B11, 4); + + mask = _mm256_add_epi32(mask, offset); + B0 = _mm256_shuffle_epi8(*L0, mask); + Sx = _mm256_i32gather_epi32(snow3g_table_A_mul, B0, 4); + *S = _mm256_xor_si256(*S, Sx); +} +#endif /* AVX2 */ + +/* ------------------------------------------------------------------- + * ClockLFSR 
sub-function as defined in snow3g standard + * S = LFSR[2] + * ^ table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] & 0xff] + * ------------------------------------------------------------------ */ +static inline void C0_C11_4(uint32_t *S, const __m128i *L0, const __m128i *L11) +{ + unsigned B11[4], B0[4]; + + B11[0] = _mm_extract_epi8(*L11, 0); + B11[1] = _mm_extract_epi8(*L11, 4); + B11[2] = _mm_extract_epi8(*L11, 8); + B11[3] = _mm_extract_epi8(*L11, 12); + + S[0] = snow3g_table_A_div[B11[0]]; + S[1] = snow3g_table_A_div[B11[1]]; + S[2] = snow3g_table_A_div[B11[2]]; + S[3] = snow3g_table_A_div[B11[3]]; + + B0[0] = _mm_extract_epi8(*L0, 3); + B0[1] = _mm_extract_epi8(*L0, 7); + B0[2] = _mm_extract_epi8(*L0, 11); + B0[3] = _mm_extract_epi8(*L0, 15); + + S[0] ^= snow3g_table_A_mul[B0[0]]; + S[1] ^= snow3g_table_A_mul[B0[1]]; + S[2] ^= snow3g_table_A_mul[B0[2]]; + S[3] ^= snow3g_table_A_mul[B0[3]]; +} + +#ifdef AVX2 +/* ------------------------------------------------------------------- + * ClockLFSR function as defined in snow3g standard + * S = table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] >> 24] + * ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8 + * ------------------------------------------------------------------ */ +static inline void ClockLFSR_8(snow3gKeyState8_t *pCtx) +{ + __m256i X2; + __m256i S, T, U; + + U = pCtx->LFSR_X[pCtx->iLFSR_X]; + S = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) % 16]; + + C0_C11_8(&X2, &U, &S); + + T = _mm256_slli_epi32(U, 8); + S = _mm256_srli_epi32(S, 8); + U = _mm256_xor_si256(T, pCtx->LFSR_X[(pCtx->iLFSR_X + 2) % 16]); + + ShiftLFSR_8(pCtx); + + S = _mm256_xor_si256(S, U); + S = _mm256_xor_si256(S, X2); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = S; +} +#endif /* AVX2 */ + +/* ------------------------------------------------------------------- + * ClockLFSR function as defined in snow3g standard + * S = table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] >> 24] + * ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8 + * ------------------------------------------------------------------ */ +static inline void ClockLFSR_4(snow3gKeyState4_t *pCtx) +{ + uint32_t X2[4]; + __m128i S, T, U; + + U = pCtx->LFSR_X[pCtx->iLFSR_X]; + S = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) % 16]; + C0_C11_4(X2, &U, &S); + + T = _mm_slli_epi32(U, 8); + S = _mm_srli_epi32(S, 8); + U = _mm_xor_si128(T, pCtx->LFSR_X[(pCtx->iLFSR_X + 2) % 16]); + ShiftLFSR_4(pCtx); + + /* (SSE4) */ + T = _mm_insert_epi32(T, X2[0], 0); + T = _mm_insert_epi32(T, X2[1], 1); + T = _mm_insert_epi32(T, X2[2], 2); + T = _mm_insert_epi32(T, X2[3], 3); + S = _mm_xor_si128(S, U); + S = _mm_xor_si128(S, T); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = S; +} + +#ifdef AVX2 +/* ------------------------------------------------------------------- + * ClockFSM function as defined in snow3g standard + * 8 packets at a time + * ------------------------------------------------------------------ */ +static inline void ClockFSM_8(snow3gKeyState8_t *pCtx, __m256i *data) +{ + __m256i F, R, S2T0, S2T1, S2T2, S2T3, S1T0, S1T1, S1T2, S1T3; + __m256i w3, w2, w1, w0, offset, mask; + + F = _mm256_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15)%16], + pCtx->FSM_X[0]); + R = _mm256_xor_si256(pCtx->LFSR_X[(pCtx->iLFSR_X + 5)%16], + pCtx->FSM_X[2]); + *data = _mm256_xor_si256(F, pCtx->FSM_X[1]); + R = _mm256_add_epi32(R, pCtx->FSM_X[1]); + offset = _mm256_set1_epi32(0x1); + + F = pCtx->FSM_X[1]; + w3 = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008, + 0xF0F0F00C, 0xF0F0F000, 0xF0F0F004, + 0xF0F0F008, 0xF0F0F00C); + mask 
= _mm256_shuffle_epi8(F,w3); + S2T0 = _mm256_i32gather_epi32(S2_T0,mask,4); + + w2 = _mm256_add_epi32(w3,offset); + mask = _mm256_shuffle_epi8(F,w2); + S2T1 = _mm256_i32gather_epi32(S2_T1,mask,4); + + w1 = _mm256_add_epi32(w2,offset); + mask = _mm256_shuffle_epi8(pCtx->FSM_X[1],w1); + S2T2 = _mm256_i32gather_epi32(S2_T2,mask,4); + + w0 = _mm256_add_epi32(w1,offset); + mask = _mm256_shuffle_epi8(F,w0); + S2T3 = _mm256_i32gather_epi32(S2_T3,mask,4); + + + F = pCtx->FSM_X[0]; + w3 = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008, + 0xF0F0F00C, 0xF0F0F010, 0xF0F0F014, + 0xF0F0F018, 0xF0F0F01C); + mask = _mm256_shuffle_epi8(F,w3); + S1T0 = _mm256_i32gather_epi32(S1_T0,mask,4); + + w2 = _mm256_add_epi32(w3,offset); + mask = _mm256_shuffle_epi8(F,w2); + S1T1 = _mm256_i32gather_epi32(S1_T1,mask,4); + + w1 = _mm256_add_epi32(w2,offset); + mask = _mm256_shuffle_epi8(F,w1); + S1T2 = _mm256_i32gather_epi32(S1_T2,mask,4); + + w0 = _mm256_add_epi32(w1,offset); + mask = _mm256_shuffle_epi8(F,w0); + S1T3 = _mm256_i32gather_epi32(S1_T3,mask,4); + + S2T0 = _mm256_xor_si256(S2T0, S2T1); + S2T2 = _mm256_xor_si256(S2T2, S2T3); + S2T0 = _mm256_xor_si256(S2T0, S2T2); + + S1T0 = _mm256_xor_si256(S1T0, S1T1); + S1T2 = _mm256_xor_si256(S1T2, S1T3); + S1T0 = _mm256_xor_si256(S1T0, S1T2); + + + pCtx->FSM_X[2] = S2T0; + pCtx->FSM_X[1] = S1T0; + pCtx->FSM_X[2] = S2T0; + pCtx->FSM_X[0] = R; +} + +#endif /* AVX2 */ + +/* ------------------------------------------------------------------- + * ClockFSM function as defined in snow3g standard + * 4 packets at a time + * ------------------------------------------------------------------ */ +static inline void ClockFSM_4(snow3gKeyState4_t *pCtx, __m128i *data) +{ + __m128i F, R; +#ifdef _WIN32 +#pragma warning(push) +#pragma warning(disable:4556) +#endif +#if defined (NO_AESNI) || defined (SAFE_LOOKUP) + uint32_t L = 0; +#endif + uint32_t K = 0; + + F = _mm_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], + pCtx->FSM_X[0]); + R = _mm_xor_si128(pCtx->LFSR_X[(pCtx->iLFSR_X + 5) % 16], + pCtx->FSM_X[2]); + *data = _mm_xor_si128(F, pCtx->FSM_X[1]); + R = _mm_add_epi32(R, pCtx->FSM_X[1]); +#if defined (NO_AESNI) || defined (SAFE_LOOKUP) + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 0); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 1); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 2); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 3); +#else + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 0); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 1); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 2); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 3); +#endif /* NO_AESNI */ + pCtx->FSM_X[0] = R; + +#ifdef _WIN32 +#pragma warning(pop) +#endif +} + +/** +******************************************************************************* +* @description +* This function generates 4 bytes of keystream 1 buffer at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_1_4(snow3gKeyState1_t *pCtx, + uint32_t *pKeyStream) +{ + uint32_t F; + + ClockFSM_1(pCtx, &F); + *pKeyStream = F ^ pCtx->LFSR_S[0]; + ClockLFSR_1(pCtx); +} + +/** +******************************************************************************* +* @description +* This function generates 8 bytes of keystream 1 
buffer at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_1_8(snow3gKeyState1_t *pCtx, + uint64_t *pKeyStream) +{ + uint64_t F; + uint32_t FSM4; + uint32_t V0, V1; + uint32_t F0, F1; + uint32_t R0, R1; + uint32_t L0, L1, L11, L12; + + /* Merged clock FSM + clock LFSR + clock FSM + clockLFSR + * in order to avoid redundancies in function processing + * and less instruction immediate dependencies + */ + L0 = pCtx->LFSR_S[0]; + V0 = pCtx->LFSR_S[2]; + L1 = pCtx->LFSR_S[1]; + V1 = pCtx->LFSR_S[3]; + R1 = pCtx->FSM_R1; + L11 = pCtx->LFSR_S[11]; + L12 = pCtx->LFSR_S[12]; + V0 ^= snow3g_table_A_mul[L0 >> 24]; + V1 ^= snow3g_table_A_mul[L1 >> 24]; + V0 ^= snow3g_table_A_div[L11 & 0xff]; + V1 ^= snow3g_table_A_div[L12 & 0xff]; + V0 ^= L0 << 8; + V1 ^= L1 << 8; + V0 ^= L11 >> 8; + V1 ^= L12 >> 8; + F0 = pCtx->LFSR_S[15] + R1; + F0 ^= L0; + F0 ^= pCtx->FSM_R2; + R0 = pCtx->FSM_R3 ^ pCtx->LFSR_S[5]; + R0 += pCtx->FSM_R2; + S1_S2_S3_1(pCtx->FSM_R3, pCtx->FSM_R2, R1, FSM4, R0); + R1 = pCtx->FSM_R3 ^ pCtx->LFSR_S[6]; + F1 = V0 + R0; + F1 ^= L1; + F1 ^= pCtx->FSM_R2; + R1 += pCtx->FSM_R2; + pCtx->FSM_R3 = Snow3g_S2(pCtx->FSM_R2); + pCtx->FSM_R2 = FSM4; + pCtx->FSM_R1 = R1; + + /* Shift LFSR twice */ + ShiftTwiceLFSR_1(pCtx); + + /* keystream mode LFSR update */ + pCtx->LFSR_S[14] = V0; + pCtx->LFSR_S[15] = V1; + + F = F0; + F <<= 32; + F |= (uint64_t)F1; + + *pKeyStream = F; +} + +#ifdef AVX2 +/** +******************************************************************************* +* @description +* This function generates 8 bytes of keystream 8 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_8_8(snow3gKeyState8_t *pCtx, + __m256i *pKeyStreamLo, + __m256i *pKeyStreamHi) +{ + __m256i H, L; + + /* first set of 4 bytes */ + ClockFSM_8(pCtx, &L); + L = _mm256_xor_si256(L, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_8(pCtx); + + /* second set of 4 bytes */ + ClockFSM_8(pCtx, &H); + H = _mm256_xor_si256(H, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_8(pCtx); + + /* merge the 2 sets */ + *pKeyStreamLo = _mm256_unpacklo_epi32(H, L); + *pKeyStreamHi = _mm256_unpackhi_epi32(H, L); +} + +/** +******************************************************************************* +* @description +* This function generates 4 bytes of keystream 8 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_8_4(snow3gKeyState8_t *pCtx, + __m256i *pKeyStream) +{ + __m256i F; + + ClockFSM_8(pCtx, &F); + *pKeyStream = _mm256_xor_si256(F, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_8(pCtx); +} + +/** +***************************************************************************** +* @description +* This function generates 32 bytes of keystream 8 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Array of generated keystreams +* +******************************************************************************/ +static inline void 
snow3g_keystream_8_32(snow3gKeyState8_t *pCtx, + __m256i *pKeyStream) +{ + + __m256i temp[8]; + + /** produces the next 4 bytes for each buffer */ + int i; + + /** Byte reversal on each KS */ + __m256i mask1 = {0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL, + 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL}; + /** Reversal, shifted 4 bytes right */ + __m256i mask2 = {0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL, + 0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL}; + /** Reversal, shifted 8 bytes right */ + __m256i mask3 = {0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL, + 0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL}; + /** Reversal, shifted 12 bytes right */ + __m256i mask4 = {0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL, + 0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL}; + + snow3g_keystream_8_4(pCtx, &temp[0]); + snow3g_keystream_8_4(pCtx, &temp[1]); + snow3g_keystream_8_4(pCtx, &temp[2]); + snow3g_keystream_8_4(pCtx, &temp[3]); + snow3g_keystream_8_4(pCtx, &temp[4]); + snow3g_keystream_8_4(pCtx, &temp[5]); + snow3g_keystream_8_4(pCtx, &temp[6]); + snow3g_keystream_8_4(pCtx, &temp[7]); + + temp[0] = _mm256_shuffle_epi8(temp[0], mask1); + temp[1] = _mm256_shuffle_epi8(temp[1], mask2); + temp[2] = _mm256_shuffle_epi8(temp[2], mask3); + temp[3] = _mm256_shuffle_epi8(temp[3], mask4); + temp[4] = _mm256_shuffle_epi8(temp[4], mask1); + temp[5] = _mm256_shuffle_epi8(temp[5], mask2); + temp[6] = _mm256_shuffle_epi8(temp[6], mask3); + temp[7] = _mm256_shuffle_epi8(temp[7], mask4); + + __m256i blended[8]; + /* blends KS together: 128bit slice consists + of 4 32-bit words for one packet */ + blended[0] = _mm256_blend_epi32(temp[0], temp[1], 0xaa); + blended[1] = _mm256_blend_epi32(temp[0], temp[1], 0x55); + blended[2] = _mm256_blend_epi32(temp[2], temp[3], 0xaa); + blended[3] = _mm256_blend_epi32(temp[2], temp[3], 0x55); + blended[4] = _mm256_blend_epi32(temp[4], temp[5], 0xaa); + blended[5] = _mm256_blend_epi32(temp[4], temp[5], 0x55); + blended[6] = _mm256_blend_epi32(temp[6], temp[7], 0xaa); + blended[7] = _mm256_blend_epi32(temp[6], temp[7], 0x55); + + temp[0] = _mm256_blend_epi32(blended[0], blended[2], 0xcc); + temp[1] = _mm256_blend_epi32(blended[1], blended[3], 0x99); + temp[2] = _mm256_blend_epi32(blended[0], blended[2], 0x33); + temp[3] = _mm256_blend_epi32(blended[1], blended[3], 0x66); + temp[4] = _mm256_blend_epi32(blended[4], blended[6], 0xcc); + temp[5] = _mm256_blend_epi32(blended[5], blended[7], 0x99); + temp[6] = _mm256_blend_epi32(blended[4], blended[6], 0x33); + temp[7] = _mm256_blend_epi32(blended[5], blended[7], 0x66); + + /** sorts 32 bit words back into order */ + blended[0] = temp[0]; + blended[1] = _mm256_shuffle_epi32(temp[1], 0x39); + blended[2] = _mm256_shuffle_epi32(temp[2], 0x4e); + blended[3] = _mm256_shuffle_epi32(temp[3], 0x93); + blended[4] = temp[4]; + blended[5] = _mm256_shuffle_epi32(temp[5], 0x39); + blended[6] = _mm256_shuffle_epi32(temp[6], 0x4e); + blended[7] = _mm256_shuffle_epi32(temp[7], 0x93); + + for (i = 0; i < 4; i++) { + pKeyStream[i] = _mm256_permute2x128_si256(blended[i], + blended[i + 4], 0x20); + pKeyStream[i + 4] = _mm256_permute2x128_si256( + blended[i], blended[i + 4], 0x31); + } +} + +#endif /* AVX2 */ + +/** +******************************************************************************* +* @description +* This function generates 4 bytes of keystream 4 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* 
+*******************************************************************************/ +static inline void snow3g_keystream_4_4(snow3gKeyState4_t *pCtx, + __m128i *pKeyStream) +{ + __m128i F; + + ClockFSM_4(pCtx, &F); + *pKeyStream = _mm_xor_si128(F, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_4(pCtx); +} + +/** +******************************************************************************* +* @description +* This function generates 8 bytes of keystream 4 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStreamLo Pointer to lower end of generated keystream +* @param[in/out] pKeyStreamHi Pointer to higer end of generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_4_8(snow3gKeyState4_t *pCtx, + __m128i *pKeyStreamLo, + __m128i *pKeyStreamHi) +{ + __m128i H, L; + + /* first set of 4 bytes */ + ClockFSM_4(pCtx, &L); + L = _mm_xor_si128(L, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_4(pCtx); + + /* second set of 4 bytes */ + ClockFSM_4(pCtx, &H); + H = _mm_xor_si128(H, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_4(pCtx); + + /* merge the 2 sets */ + *pKeyStreamLo = _mm_unpacklo_epi32(H, L); + *pKeyStreamHi = _mm_unpackhi_epi32(H, L); +} + +/** +******************************************************************************* +* @description +* This function initializes the key schedule for 4 buffers for snow3g f8/f9. +* +* @param [in] pCtx Context where the scheduled keys are stored +* @param [in] pKeySched Key schedule +* @param [in] pIV1 IV for buffer 1 +* @param [in] pIV2 IV for buffer 2 +* @param [in] pIV3 IV for buffer 3 +* @param [in] pIV4 IV for buffer 4 +* +*******************************************************************************/ +static inline void +snow3gStateInitialize_4(snow3gKeyState4_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV1, const void *pIV2, + const void *pIV3, const void *pIV4) +{ + uint32_t K, L; + int i; + __m128i R, S, T, U; + __m128i V0, V1, T0, T1; + + /* Initialize the LFSR table from constants, Keys, and IV */ + + /* Load complete 128b IV into register (SSE2)*/ + uint64_t sm[2] = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL}; + __m128i *swapMask = (__m128i *) sm; + + R = _mm_loadu_si128((const __m128i *)pIV1); + S = _mm_loadu_si128((const __m128i *)pIV2); + T = _mm_loadu_si128((const __m128i *)pIV3); + U = _mm_loadu_si128((const __m128i *)pIV4); + + /* initialize the array block (SSE4) */ + for (i = 0; i < 4; i++) { + K = pKeySched->k[i]; + L = ~K; + V0 = _mm_set1_epi32(K); + V1 = _mm_set1_epi32(L); + pCtx->LFSR_X[i + 4] = V0; + pCtx->LFSR_X[i + 12] = V0; + pCtx->LFSR_X[i + 0] = V1; + pCtx->LFSR_X[i + 8] = V1; + } + /* Update the schedule structure with IVs */ + /* Store the 4 IVs in LFSR by a column/row matrix swap + * after endianness correction */ + + /* endianness swap (SSSE3) */ + R = _mm_shuffle_epi8(R, *swapMask); + S = _mm_shuffle_epi8(S, *swapMask); + T = _mm_shuffle_epi8(T, *swapMask); + U = _mm_shuffle_epi8(U, *swapMask); + + /* row/column dword inversion (SSE2) */ + T0 = _mm_unpacklo_epi32(R, S); + R = _mm_unpackhi_epi32(R, S); + T1 = _mm_unpacklo_epi32(T, U); + T = _mm_unpackhi_epi32(T, U); + + /* row/column qword inversion (SSE2) */ + U = _mm_unpackhi_epi64(R, T); + T = _mm_unpacklo_epi64(R, T); + S = _mm_unpackhi_epi64(T0, T1); + R = _mm_unpacklo_epi64(T0, T1); + + /*IV ^ LFSR (SSE2) */ + pCtx->LFSR_X[15] = _mm_xor_si128(pCtx->LFSR_X[15], U); + pCtx->LFSR_X[12] = 
_mm_xor_si128(pCtx->LFSR_X[12], T); + pCtx->LFSR_X[10] = _mm_xor_si128(pCtx->LFSR_X[10], S); + pCtx->LFSR_X[9] = _mm_xor_si128(pCtx->LFSR_X[9], R); + pCtx->iLFSR_X = 0; + /* FSM initialization (SSE2) */ + S = _mm_setzero_si128(); + for (i = 0; i < 3; i++) + pCtx->FSM_X[i] = S; + + /* Initialisation rounds */ + for (i = 0; i < 32; i++) { + ClockFSM_4(pCtx, &S); + ClockLFSR_4(pCtx); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm_xor_si128( + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], S); + } +} + +#ifdef AVX2 +/** +******************************************************************************* +* @description +* This function intializes the key schedule for 8 buffers with +* individual keys, for snow3g f8/f9. +* +* @param [in] pCtx Context where scheduled keys are stored +* @param [in] pKeySched Key schedule +* @param [in] pIV1 IV for buffer 1 +* @param [in] pIV2 IV for buffer 2 +* @param [in] pIV3 IV for buffer 3 +* @param [in] pIV4 IV for buffer 4 +* @param [in] pIV5 IV for buffer 5 +* @param [in] pIV6 IV for buffer 6 +* @param [in] pIV7 IV for buffer 7 +* @param [in] pIV8 IV for buffer 8 +* +*******************************************************************************/ +static inline void +snow3gStateInitialize_8_multiKey(snow3gKeyState8_t *pCtx, + const snow3g_key_schedule_t * const KeySched[], + const void * const pIV[]) +{ + DECLARE_ALIGNED(uint32_t k[8], 32); + DECLARE_ALIGNED(uint32_t l[8], 32); + __m256i *K = (__m256i *)k; + __m256i *L = (__m256i *)l; + + int i, j; + __m256i mR, mS, mT, mU, T0, T1; + + /* Initialize the LFSR table from constants, Keys, and IV */ + + /* Load complete 256b IV into register (SSE2)*/ + __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL, + 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL}; + mR = _mm256_loadu2_m128i((const __m128i *)pIV[4], + (const __m128i *)pIV[0]); + mS = _mm256_loadu2_m128i((const __m128i *)pIV[5], + (const __m128i *)pIV[1]); + mT = _mm256_loadu2_m128i((const __m128i *)pIV[6], + (const __m128i *)pIV[2]); + mU = _mm256_loadu2_m128i((const __m128i *)pIV[7], + (const __m128i *)pIV[3]); + + /* initialize the array block (SSE4) */ + for (i = 0; i < 4; i++) { + for (j = 0; j < 8; j++) { + k[j] = KeySched[j]->k[i]; + l[j] = ~k[j]; + } + + pCtx->LFSR_X[i + 4] = *K; + pCtx->LFSR_X[i + 12] = *K; + pCtx->LFSR_X[i + 0] = *L; + pCtx->LFSR_X[i + 8] = *L; + } + + /* Update the schedule structure with IVs */ + /* Store the 4 IVs in LFSR by a column/row matrix swap + * after endianness correction */ + + /* endianness swap (SSSE3) */ + mR = _mm256_shuffle_epi8(mR, swapMask); + mS = _mm256_shuffle_epi8(mS, swapMask); + mT = _mm256_shuffle_epi8(mT, swapMask); + mU = _mm256_shuffle_epi8(mU, swapMask); + + /* row/column dword inversion (SSE2) */ + T0 = _mm256_unpacklo_epi32(mR, mS); + mR = _mm256_unpackhi_epi32(mR, mS); + T1 = _mm256_unpacklo_epi32(mT, mU); + mT = _mm256_unpackhi_epi32(mT, mU); + + /* row/column qword inversion (SSE2) */ + mU = _mm256_unpackhi_epi64(mR, mT); + mT = _mm256_unpacklo_epi64(mR, mT); + mS = _mm256_unpackhi_epi64(T0, T1); + mR = _mm256_unpacklo_epi64(T0, T1); + + /*IV ^ LFSR (SSE2) */ + pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU); + pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT); + pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS); + pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR); + pCtx->iLFSR_X = 0; + /* FSM initialization (SSE2) */ + mS = _mm256_setzero_si256(); + for (i = 0; i < 3; i++) + pCtx->FSM_X[i] = mS; + + /* Initialisation rounds */ + for (i = 0; i < 32; i++) { + 
ClockFSM_8(pCtx, &mS); + ClockLFSR_8(pCtx); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256( + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS); + } +} + +/** +******************************************************************************* +* @description +* This function initializes the key schedule for 8 buffers for snow3g f8/f9. +* +* @param [in] pCtx Context where the scheduled keys are stored +* @param [in] pKeySched Key schedule +* @param [in] pIV1 IV for buffer 1 +* @param [in] pIV2 IV for buffer 2 +* @param [in] pIV3 IV for buffer 3 +* @param [in] pIV4 IV for buffer 4 +* @param [in] pIV5 IV for buffer 5 +* @param [in] pIV6 IV for buffer 6 +* @param [in] pIV7 IV for buffer 7 +* @param [in] pIV8 IV for buffer 8 +* +*******************************************************************************/ +static inline void +snow3gStateInitialize_8(snow3gKeyState8_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV1, const void *pIV2, + const void *pIV3, const void *pIV4, + const void *pIV5, const void *pIV6, + const void *pIV7, const void *pIV8) +{ + uint32_t K, L; + int i; + __m256i mR, mS, mT, mU, V0, V1, T0, T1; + + /* Initialize the LFSR table from constants, Keys, and IV */ + + /* Load complete 256b IV into register (SSE2)*/ + __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL, + 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL}; + mR = _mm256_loadu2_m128i((const __m128i *)pIV5, (const __m128i *)pIV1); + mS = _mm256_loadu2_m128i((const __m128i *)pIV6, (const __m128i *)pIV2); + mT = _mm256_loadu2_m128i((const __m128i *)pIV7, (const __m128i *)pIV3); + mU = _mm256_loadu2_m128i((const __m128i *)pIV8, (const __m128i *)pIV4); + + /* initialize the array block (SSE4) */ + for (i = 0; i < 4; i++) { + K = pKeySched->k[i]; + L = ~K; + V0 = _mm256_set1_epi32(K); + V1 = _mm256_set1_epi32(L); + pCtx->LFSR_X[i + 4] = V0; + pCtx->LFSR_X[i + 12] = V0; + pCtx->LFSR_X[i + 0] = V1; + pCtx->LFSR_X[i + 8] = V1; + } + + /* Update the schedule structure with IVs */ + /* Store the 4 IVs in LFSR by a column/row matrix swap + * after endianness correction */ + + /* endianness swap (SSSE3) */ + mR = _mm256_shuffle_epi8(mR, swapMask); + mS = _mm256_shuffle_epi8(mS, swapMask); + mT = _mm256_shuffle_epi8(mT, swapMask); + mU = _mm256_shuffle_epi8(mU, swapMask); + + /* row/column dword inversion (SSE2) */ + T0 = _mm256_unpacklo_epi32(mR, mS); + mR = _mm256_unpackhi_epi32(mR, mS); + T1 = _mm256_unpacklo_epi32(mT, mU); + mT = _mm256_unpackhi_epi32(mT, mU); + + /* row/column qword inversion (SSE2) */ + mU = _mm256_unpackhi_epi64(mR, mT); + mT = _mm256_unpacklo_epi64(mR, mT); + mS = _mm256_unpackhi_epi64(T0, T1); + mR = _mm256_unpacklo_epi64(T0, T1); + + /*IV ^ LFSR (SSE2) */ + pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU); + pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT); + pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS); + pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR); + pCtx->iLFSR_X = 0; + /* FSM initialization (SSE2) */ + mS = _mm256_setzero_si256(); + for (i = 0; i < 3; i++) + pCtx->FSM_X[i] = mS; + + /* Initialisation rounds */ + for (i = 0; i < 32; i++) { + ClockFSM_8(pCtx, &mS); + ClockLFSR_8(pCtx); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256( + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS); + } +} +#endif /* AVX2 */ + +static inline void +preserve_bits(uint64_t *KS, + const uint8_t *pcBufferOut, const uint8_t *pcBufferIn, + SafeBuf *safeOutBuf, SafeBuf *safeInBuf, + const uint8_t bit_len, const uint8_t byte_len) +{ + 
const uint64_t mask = UINT64_MAX << (SNOW3G_BLOCK_SIZE * 8 - bit_len); + + /* Clear the last bits of the keystream and the input + * (input only in out-of-place case) */ + *KS &= mask; + if (pcBufferIn != pcBufferOut) { + const uint64_t swapMask = BSWAP64(mask); + + safeInBuf->b64 &= swapMask; + + /* + * Merge the last bits from the output, to be preserved, + * in the keystream, to be XOR'd with the input + * (which last bits are 0, maintaining the output bits) + */ + memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len); + *KS |= BSWAP64(safeOutBuf->b64 & ~swapMask); + } +} + +/** +******************************************************************************* +* @description +* This function is the core snow3g bit algorithm +* for the 3GPP confidentiality algorithm +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in] pBufferIn Input buffer +* @param[out] pBufferOut Output buffer +* @param[in] cipherLengthInBits length in bits of the data to be encrypted +* @param[in] bitOffset offset in input buffer, where data are valid +* +*******************************************************************************/ +static inline void f8_snow3g_bit(snow3gKeyState1_t *pCtx, + const void *pIn, + void *pOut, + const uint32_t lengthInBits, + const uint32_t offsetInBits) +{ + const uint8_t *pBufferIn = pIn; + uint8_t *pBufferOut = pOut; + uint32_t cipherLengthInBits = lengthInBits; + uint64_t shiftrem = 0; + uint64_t KS8, KS8bit; /* 8 bytes of keystream */ + const uint8_t *pcBufferIn = pBufferIn + (offsetInBits / 8); + uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8); + /* Offset into the first byte (0 - 7 bits) */ + uint32_t remainOffset = offsetInBits % 8; + uint32_t byteLength = (cipherLengthInBits + 7) / 8; + SafeBuf safeInBuf = {0}; + SafeBuf safeOutBuf = {0}; + + /* Now run the block cipher */ + + /* Start with potential partial block (due to offset and length) */ + snow3g_keystream_1_8(pCtx, &KS8); + KS8bit = KS8 >> remainOffset; + /* Only one block to encrypt */ + if (cipherLengthInBits < (64 - remainOffset)) { + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength); + /* + * If operation is Out-of-place and there is offset + * to be applied, "remainOffset" bits from the output buffer + * need to be preserved (only applicable to first byte, + * since remainOffset is up to 7 bits) + */ + if ((pIn != pOut) && remainOffset) { + const uint8_t mask8 = (uint8_t) + (1 << (8 - remainOffset)) - 1; + + safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) | + (pcBufferOut[0] & ~mask8); + } + /* If last byte is a partial byte, the last bits of the output + * need to be preserved */ + const uint8_t bitlen_with_off = remainOffset + + cipherLengthInBits; + + if ((bitlen_with_off & 0x7) != 0) + preserve_bits(&KS8bit, pcBufferOut, pcBufferIn, + &safeOutBuf, &safeInBuf, + bitlen_with_off, byteLength); + + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + return; + } + /* + * If operation is Out-of-place and there is offset + * to be applied, "remainOffset" bits from the output buffer + * need to be preserved (only applicable to first byte, + * since remainOffset is up to 7 bits) + */ + if ((pIn != pOut) && remainOffset) { + const uint8_t mask8 = (uint8_t)(1 << (8 - remainOffset)) - 1; + + memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8); + safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) | + (pcBufferOut[0] & ~mask8); + xor_keystrm_rev(pcBufferOut, safeInBuf.b8, KS8bit); + pcBufferIn += 
SNOW3G_BLOCK_SIZE; + } else { + /* At least 64 bits to produce (including offset) */ + pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, KS8bit); + } + + if (remainOffset != 0) + shiftrem = KS8 << (64 - remainOffset); + cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8 - remainOffset; + pcBufferOut += SNOW3G_BLOCK_SIZE; + + while (cipherLengthInBits) { + /* produce the next block of keystream */ + snow3g_keystream_1_8(pCtx, &KS8); + KS8bit = (KS8 >> remainOffset) | shiftrem; + if (remainOffset != 0) + shiftrem = KS8 << (64 - remainOffset); + if (cipherLengthInBits >= SNOW3G_BLOCK_SIZE * 8) { + pcBufferIn = xor_keystrm_rev(pcBufferOut, + pcBufferIn, KS8bit); + cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8; + pcBufferOut += SNOW3G_BLOCK_SIZE; + /* loop variant */ + } else { + /* end of the loop, handle the last bytes */ + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, + byteLength); + + /* If last byte is a partial byte, the last bits + * of the output need to be preserved */ + if ((cipherLengthInBits & 0x7) != 0) + preserve_bits(&KS8bit, pcBufferOut, pcBufferIn, + &safeOutBuf, &safeInBuf, + cipherLengthInBits, byteLength); + + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + cipherLengthInBits = 0; + } + } +#ifdef SAFE_DATA + CLEAR_VAR(&KS8, sizeof(KS8)); + CLEAR_VAR(&KS8bit, sizeof(KS8bit)); + CLEAR_MEM(&safeInBuf, sizeof(safeInBuf)); + CLEAR_MEM(&safeOutBuf, sizeof(safeOutBuf)); +#endif +} + +/** +******************************************************************************* +* @description +* This function is the core snow3g algorithm for +* the 3GPP confidentiality and integrity algorithm. +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in] pBufferIn Input buffer +* @param[out] pBufferOut Output buffer +* @param[in] lengthInBytes length in bytes of the data to be encrypted +* +*******************************************************************************/ +static inline void f8_snow3g(snow3gKeyState1_t *pCtx, + const void *pIn, + void *pOut, + const uint32_t lengthInBytes) +{ + uint32_t qwords = lengthInBytes / SNOW3G_8_BYTES; /* number of qwords */ + uint32_t words = lengthInBytes & 4; /* remaining word if not 0 */ + uint32_t bytes = lengthInBytes & 3; /* remaining bytes */ + uint32_t KS4; /* 4 bytes of keystream */ + uint64_t KS8; /* 8 bytes of keystream */ + const uint8_t *pBufferIn = pIn; + uint8_t *pBufferOut = pOut; + + /* process 64 bits at a time */ + while (qwords--) { + /* generate keystream 8 bytes at a time */ + snow3g_keystream_1_8(pCtx, &KS8); + + /* xor keystream 8 bytes at a time */ + pBufferIn = xor_keystrm_rev(pBufferOut, pBufferIn, KS8); + pBufferOut += SNOW3G_8_BYTES; + } + + /* check for remaining 0 to 7 bytes */ + if (0 != words) { + if (bytes) { + /* 5 to 7 last bytes, process 8 bytes */ + uint8_t buftemp[8]; + uint8_t safeBuff[8]; + + memset(safeBuff, 0, SNOW3G_8_BYTES); + snow3g_keystream_1_8(pCtx, &KS8); + memcpy_keystrm(safeBuff, pBufferIn, 4 + bytes); + xor_keystrm_rev(buftemp, safeBuff, KS8); + memcpy_keystrm(pBufferOut, buftemp, 4 + bytes); +#ifdef SAFE_DATA + CLEAR_MEM(&safeBuff, sizeof(safeBuff)); + CLEAR_MEM(&buftemp, sizeof(buftemp)); +#endif + } else { + /* exactly 4 last bytes */ + snow3g_keystream_1_4(pCtx, &KS4); + xor_keystream_reverse_32(pBufferOut, pBufferIn, KS4); + } + } else if (0 != bytes) { + /* 1 to 3 last bytes */ + uint8_t buftemp[4]; + uint8_t safeBuff[4]; + + memset(safeBuff, 0, SNOW3G_4_BYTES); + 
snow3g_keystream_1_4(pCtx, &KS4); + memcpy_keystream_32(safeBuff, pBufferIn, bytes); + xor_keystream_reverse_32(buftemp, safeBuff, KS4); + memcpy_keystream_32(pBufferOut, buftemp, bytes); +#ifdef SAFE_DATA + CLEAR_MEM(&safeBuff, sizeof(safeBuff)); + CLEAR_MEM(&buftemp, sizeof(buftemp)); +#endif + } + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_VAR(&KS8, sizeof(KS8)); +#endif +} + +#ifdef AVX2 +/** +******************************************************************************* +* @description +* This function converts the state from a 4 buffer state structure to 1 +* buffer state structure. +* +* @param[in] pSrcState Pointer to the source state +* @param[in] pDstState Pointer to the destination state +* @param[in] NumBuffers Number of buffers +* +*******************************************************************************/ +static inline void snow3gStateConvert_8(snow3gKeyState8_t *pSrcState, + snow3gKeyState1_t *pDstState, + uint32_t NumBuffers) +{ + uint32_t T = 0, iLFSR_X = pSrcState->iLFSR_X; + __m256i *LFSR_X = pSrcState->LFSR_X; + int i; + + for (i = 0; i < 16; i++) { + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 0); + break; + case 1: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 1); + break; + case 2: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 2); + break; + case 3: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 3); + break; + case 4: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 4); + break; + case 5: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 5); + break; + case 6: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 6); + break; + case 7: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 7); + break; + } + pDstState->LFSR_S[i] = T; + } + i = 0; + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3); + break; + case 4: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4); + break; + case 5: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5); + break; + case 6: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 6); + break; + case 7: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7); + break; + } + pDstState->FSM_R1 = T; + + i = 1; + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3); + break; + case 4: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4); + break; + case 5: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5); + break; + case 6: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 6); + break; + case 7: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7); + break; + } + pDstState->FSM_R2 = T; + + i = 2; + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3); + break; + case 4: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4); + break; + case 5: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5); + break; + case 6: + T = 
_mm256_extract_epi32(pSrcState->FSM_X[i], 6); + break; + case 7: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7); + break; + } + pDstState->FSM_R3 = T; +} +#endif /* AVX2 */ + +/** +******************************************************************************* +* @description +* This function converts the state from a 4 buffer state structure to 1 +* buffer state structure. +* +* @param[in] pSrcState Pointer to the source state +* @param[in] pDstState Pointer to the destination state +* @param[in] NumBuffers Number of buffers +* +*******************************************************************************/ +static inline void snow3gStateConvert_4(snow3gKeyState4_t *pSrcState, + snow3gKeyState1_t *pDstState, + uint32_t NumBuffers) +{ + uint32_t i; + uint32_t T = 0, iLFSR_X = pSrcState->iLFSR_X; + __m128i *LFSR_X = pSrcState->LFSR_X; + + for (i = 0; i < 16; i++) { + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 0); + break; + case 1: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 1); + break; + case 2: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 2); + break; + case 3: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 3); + break; + } + pDstState->LFSR_S[i] = T; + } + + i = 0; + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 3); + break; + } + pDstState->FSM_R1 = T; + + i = 1; + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 3); + break; + } + pDstState->FSM_R2 = T; + + i = 2; + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 3); + break; + } + pDstState->FSM_R3 = T; +} + +/*--------------------------------------------------------- + * f8() + * Initializations and Context size definitions + *---------------------------------------------------------*/ +size_t SNOW3G_KEY_SCHED_SIZE(void) { return sizeof(snow3g_key_schedule_t); } + +int SNOW3G_INIT_KEY_SCHED(const void *pKey, snow3g_key_schedule_t *pCtx) +{ +#ifdef SAFE_PARAM + if ((pKey == NULL) || (pCtx == NULL)) + return -1; +#endif + + const uint32_t *pKey32 = pKey; + + pCtx->k[3] = BSWAP32(pKey32[0]); + pCtx->k[2] = BSWAP32(pKey32[1]); + pCtx->k[1] = BSWAP32(pKey32[2]); + pCtx->k[0] = BSWAP32(pKey32[3]); + + return 0; +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 1 buffer: + * Single buffer enc/dec with IV and precomputed key schedule + *---------------------------------------------------------*/ +void SNOW3G_F8_1_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV == NULL) || + (pBufferIn == NULL) || (pBufferOut == NULL) || + (lengthInBytes == 0) || (lengthInBytes > SNOW3G_MAX_BYTELEN)) + return; +#endif + snow3gKeyState1_t ctx; + uint32_t KS4; /* 4 bytes of keystream */ + + /* Initialize the schedule 
from the IV */ + snow3gStateInitialize_1(&ctx, pHandle, pIV); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx, &KS4); + + f8_snow3g(&ctx, pBufferIn, pBufferOut, lengthInBytes); + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 bit 1 buffer: + * Single buffer enc/dec with IV and precomputed key schedule + *---------------------------------------------------------*/ +void SNOW3G_F8_1_BUFFER_BIT(const snow3g_key_schedule_t *pHandle, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBits, + const uint32_t offsetInBits) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV == NULL) || + (pBufferIn == NULL) || (pBufferOut == NULL) || + (lengthInBits == 0)) + return; +#endif + + snow3gKeyState1_t ctx; + uint32_t KS4; /* 4 bytes of keystream */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx, pHandle, pIV); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx, &KS4); + + f8_snow3g_bit(&ctx, pBufferIn, pBufferOut, lengthInBits, offsetInBits); + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 2 buffer: + * Two buffers enc/dec with the same key schedule. + * The 3 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. + *---------------------------------------------------------*/ +void SNOW3G_F8_2_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pBufIn1, + void *pBufOut1, + const uint32_t lenInBytes1, + const void *pBufIn2, + void *pBufOut2, + const uint32_t lenInBytes2) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV1 == NULL) || (pIV2 == NULL) || + (pBufIn1 == NULL) || (pBufOut1 == NULL) || + (pBufIn2 == NULL) || (pBufOut2 == NULL) || + (lenInBytes1 == 0) || (lenInBytes1 > SNOW3G_MAX_BYTELEN) || + (lenInBytes2 == 0) || (lenInBytes2 > SNOW3G_MAX_BYTELEN)) + return; +#endif + + snow3gKeyState1_t ctx1, ctx2; + uint32_t KS4; /* 4 bytes of keystream */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx1, pHandle, pIV1); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx1, &KS4); + + /* data processing for packet 1 */ + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx2, pHandle, pIV2); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx2, &KS4); + + /* data processing for packet 2 */ + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_MEM(&ctx1, sizeof(ctx1)); + CLEAR_MEM(&ctx2, sizeof(ctx2)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 4 buffer: + * Four packets enc/dec with the same key schedule. + * The 4 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. 
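+ * All four streams are advanced together over the shortest buffer
+ * length (rounded down to 8-byte blocks); the remaining bytes of the
+ * longer buffers are finished with the single-buffer path.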
+ *---------------------------------------------------------*/ +void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || + (pIV1 == NULL) || (pIV2 == NULL) || + (pIV3 == NULL) || (pIV4 == NULL) || + (pBufferIn1 == NULL) || (pBufferOut1 == NULL) || + (pBufferIn2 == NULL) || (pBufferOut2 == NULL) || + (pBufferIn3 == NULL) || (pBufferOut3 == NULL) || + (pBufferIn4 == NULL) || (pBufferOut4 == NULL) || + (lengthInBytes1 == 0) || (lengthInBytes1 > SNOW3G_MAX_BYTELEN) || + (lengthInBytes2 == 0) || (lengthInBytes2 > SNOW3G_MAX_BYTELEN) || + (lengthInBytes3 == 0) || (lengthInBytes3 > SNOW3G_MAX_BYTELEN) || + (lengthInBytes4 == 0) || (lengthInBytes4 > SNOW3G_MAX_BYTELEN)) + return; +#endif + + snow3gKeyState4_t ctx; + __m128i H, L; /* 4 bytes of keystream */ + uint32_t lenInBytes1 = lengthInBytes1; + uint32_t lenInBytes2 = lengthInBytes2; + uint32_t lenInBytes3 = lengthInBytes3; + uint32_t lenInBytes4 = lengthInBytes4; + uint32_t bytes1 = + (lenInBytes1 < lenInBytes2 ? lenInBytes1 + : lenInBytes2); /* number of bytes */ + uint32_t bytes2 = + (lenInBytes3 < lenInBytes4 ? lenInBytes3 + : lenInBytes4); /* number of bytes */ + /* min num of bytes */ + uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2; + uint32_t qwords = bytes / SNOW3G_8_BYTES; + uint8_t *pBufOut1 = pBufferOut1; + uint8_t *pBufOut2 = pBufferOut2; + uint8_t *pBufOut3 = pBufferOut3; + uint8_t *pBufOut4 = pBufferOut4; + const uint8_t *pBufIn1 = pBufferIn1; + const uint8_t *pBufIn2 = pBufferIn2; + const uint8_t *pBufIn3 = pBufferIn3; + const uint8_t *pBufIn4 = pBufferIn4; + + bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_4(&ctx, pHandle, pIV1, pIV2, pIV3, pIV4); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_4_4(&ctx, &L); + + lenInBytes1 -= bytes; + lenInBytes2 -= bytes; + lenInBytes3 -= bytes; + lenInBytes4 -= bytes; + + /* generates 4 bytes at a time on all streams */ + while (qwords--) { + snow3g_keystream_4_8(&ctx, &L, &H); + pBufIn1 = xor_keystrm_rev(pBufOut1, pBufIn1, + _mm_extract_epi64(L, 0)); + pBufIn2 = xor_keystrm_rev(pBufOut2, pBufIn2, + _mm_extract_epi64(L, 1)); + pBufIn3 = xor_keystrm_rev(pBufOut3, pBufIn3, + _mm_extract_epi64(H, 0)); + pBufIn4 = xor_keystrm_rev(pBufOut4, pBufIn4, + _mm_extract_epi64(H, 1)); + + pBufOut1 += SNOW3G_8_BYTES; + pBufOut2 += SNOW3G_8_BYTES; + pBufOut3 += SNOW3G_8_BYTES; + pBufOut4 += SNOW3G_8_BYTES; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (lenInBytes1) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_4(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + } + + if (lenInBytes2) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_4(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + } + + if (lenInBytes3) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_4(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3); + } + + if (lenInBytes4) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_4(&ctx, 
&ctx4, 3); + f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4); + } + +#ifdef SAFE_DATA + H = _mm_setzero_si128(); + L = _mm_setzero_si128(); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + +} + +#ifdef AVX2 +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 8 multi: + * Processes 8 packets 8 bytes at a time. + * Uses individual key schedule for each buffer. + *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_8_multi(uint32_t bytes, + const snow3g_key_schedule_t * const pKey[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], const uint32_t *lengthInBytes) +{ + uint32_t qwords = bytes / SNOW3G_8_BYTES; + __m256i H, L; /* 8 bytes of keystream */ + snow3gKeyState8_t ctx; + int i; + const uint8_t *tBufferIn[8]; + uint8_t *tBufferOut[8]; + uint32_t tLenInBytes[8]; + + bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */ + + for (i = 0; i < 8; i++) { + tBufferIn[i] = pBufferIn[i]; + tBufferOut[i] = pBufferOut[i]; + tLenInBytes[i] = lengthInBytes[i]; + } + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8_multiKey(&ctx, pKey, IV); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_8_4(&ctx, &L); + + for (i = 0; i < 8; i++) + tLenInBytes[i] -= bytes; + + /* generates 8 sets at a time on all streams */ + for (i = qwords; i != 0; i--) { + int j; + + snow3g_keystream_8_8(&ctx, &L, &H); + + tBufferIn[0] = xor_keystrm_rev(tBufferOut[0], tBufferIn[0], + _mm256_extract_epi64(L, 0)); + tBufferIn[1] = xor_keystrm_rev(tBufferOut[1], tBufferIn[1], + _mm256_extract_epi64(L, 1)); + tBufferIn[2] = xor_keystrm_rev(tBufferOut[2], tBufferIn[2], + _mm256_extract_epi64(H, 0)); + tBufferIn[3] = xor_keystrm_rev(tBufferOut[3], tBufferIn[3], + _mm256_extract_epi64(H, 1)); + tBufferIn[4] = xor_keystrm_rev(tBufferOut[4], tBufferIn[4], + _mm256_extract_epi64(L, 2)); + tBufferIn[5] = xor_keystrm_rev(tBufferOut[5], tBufferIn[5], + _mm256_extract_epi64(L, 3)); + tBufferIn[6] = xor_keystrm_rev(tBufferOut[6], tBufferIn[6], + _mm256_extract_epi64(H, 2)); + tBufferIn[7] = xor_keystrm_rev(tBufferOut[7], tBufferIn[7], + _mm256_extract_epi64(H, 3)); + + for (j = 0; j < 8; j++) + tBufferOut[j] += SNOW3G_8_BYTES; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (tLenInBytes[0]) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, tBufferIn[0], tBufferOut[0], tLenInBytes[0]); + } + if (tLenInBytes[1]) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, tBufferIn[1], tBufferOut[1], tLenInBytes[1]); + } + if (tLenInBytes[2]) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, tBufferIn[2], tBufferOut[2], tLenInBytes[2]); + } + if (tLenInBytes[3]) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, tBufferIn[3], tBufferOut[3], tLenInBytes[3]); + } + if (tLenInBytes[4]) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, tBufferIn[4], tBufferOut[4], tLenInBytes[4]); + } + if (tLenInBytes[5]) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, tBufferIn[5], tBufferOut[5], tLenInBytes[5]); + } + if (tLenInBytes[6]) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + 
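+                /* lane 6 now lives in a scalar single-buffer state;
+                 * drain its remaining bytes with the scalar routine */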
f8_snow3g(&ctx7, tBufferIn[6], tBufferOut[6], tLenInBytes[6]); + } + if (tLenInBytes[7]) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, tBufferIn[7], tBufferOut[7], tLenInBytes[7]); + } + +#ifdef SAFE_DATA + H = _mm256_setzero_si256(); + L = _mm256_setzero_si256(); + CLEAR_MEM(&ctx, sizeof(ctx)); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 32 multi: + * Processes 8 packets 32 bytes at a time. + * Uses individual key schedule for each buffer. + *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_32_multi(uint32_t bytes, + const snow3g_key_schedule_t * const pKey[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], const uint32_t *lengthInBytes) +{ + + snow3gKeyState8_t ctx; + uint32_t i; + + const uint8_t *tBufferIn[8]; + uint8_t *tBufferOut[8]; + uint32_t tLenInBytes[8]; + + for (i = 0; i < 8; i++) { + tBufferIn[i] = pBufferIn[i]; + tBufferOut[i] = pBufferOut[i]; + tLenInBytes[i] = lengthInBytes[i]; + } + + uint32_t blocks = bytes / 32; + + bytes = blocks * 32; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8_multiKey(&ctx, pKey, IV); + + /* Clock FSM and LFSR once, ignore the keystream */ + __m256i ks[8]; + + snow3g_keystream_8_4(&ctx, ks); + + for (i = 0; i < 8; i++) + tLenInBytes[i] -= bytes; + + __m256i in[8]; + + /* generates 8 sets at a time on all streams */ + for (i = 0; i < blocks; i++) { + int j; + + in[0] = _mm256_loadu_si256((const __m256i *)tBufferIn[0]); + in[1] = _mm256_loadu_si256((const __m256i *)tBufferIn[1]); + in[2] = _mm256_loadu_si256((const __m256i *)tBufferIn[2]); + in[3] = _mm256_loadu_si256((const __m256i *)tBufferIn[3]); + in[4] = _mm256_loadu_si256((const __m256i *)tBufferIn[4]); + in[5] = _mm256_loadu_si256((const __m256i *)tBufferIn[5]); + in[6] = _mm256_loadu_si256((const __m256i *)tBufferIn[6]); + in[7] = _mm256_loadu_si256((const __m256i *)tBufferIn[7]); + + snow3g_keystream_8_32(&ctx, ks); + + _mm256_storeu_si256((__m256i *)tBufferOut[0], + _mm256_xor_si256(in[0], ks[0])); + _mm256_storeu_si256((__m256i *)tBufferOut[1], + _mm256_xor_si256(in[1], ks[1])); + _mm256_storeu_si256((__m256i *)tBufferOut[2], + _mm256_xor_si256(in[2], ks[2])); + _mm256_storeu_si256((__m256i *)tBufferOut[3], + _mm256_xor_si256(in[3], ks[3])); + _mm256_storeu_si256((__m256i *)tBufferOut[4], + _mm256_xor_si256(in[4], ks[4])); + _mm256_storeu_si256((__m256i *)tBufferOut[5], + _mm256_xor_si256(in[5], ks[5])); + _mm256_storeu_si256((__m256i *)tBufferOut[6], + _mm256_xor_si256(in[6], ks[6])); + _mm256_storeu_si256((__m256i *)tBufferOut[7], + _mm256_xor_si256(in[7], ks[7])); + + for (j = 0; j < 8; j++) { + tBufferIn[i] += 32; + tBufferOut[i] += 32; + } + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (tLenInBytes[0]) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, tBufferIn[0], tBufferOut[0], tLenInBytes[0]); + } + if (tLenInBytes[1]) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, tBufferIn[1], tBufferOut[1], tLenInBytes[1]); + } + if (tLenInBytes[2]) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, tBufferIn[2], tBufferOut[2], tLenInBytes[2]); + } + if (tLenInBytes[3]) { + snow3gKeyState1_t ctx4; + + 
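+                /* extract lane 3 of the 8-way state and finish the
+                 * remaining bytes with the single-buffer path */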
snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, tBufferIn[3], tBufferOut[3], tLenInBytes[3]); + } + if (tLenInBytes[4]) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, tBufferIn[4], tBufferOut[4], tLenInBytes[4]); + } + if (tLenInBytes[5]) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, tBufferIn[5], tBufferOut[5], tLenInBytes[5]); + } + if (tLenInBytes[6]) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + f8_snow3g(&ctx7, tBufferIn[6], tBufferOut[6], tLenInBytes[6]); + } + if (tLenInBytes[7]) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, tBufferIn[7], tBufferOut[7], tLenInBytes[7]); + } + +#ifdef SAFE_DATA + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_MEM(&ks, sizeof(ks)); + CLEAR_MEM(&in, sizeof(in)); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 8 multi: + * Processes 8 packets 8 bytes at a time. + * Uses same key schedule for each buffer. + *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_8(uint32_t bytes, + const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, void *pBufferOut8, + const uint32_t lengthInBytes8) +{ + + uint32_t qwords = bytes / SNOW3G_8_BYTES; + __m256i H, L; /* 8 bytes of keystream */ + snow3gKeyState8_t ctx; + int i; + uint32_t lenInBytes1 = lengthInBytes1; + uint32_t lenInBytes2 = lengthInBytes2; + uint32_t lenInBytes3 = lengthInBytes3; + uint32_t lenInBytes4 = lengthInBytes4; + uint32_t lenInBytes5 = lengthInBytes5; + uint32_t lenInBytes6 = lengthInBytes6; + uint32_t lenInBytes7 = lengthInBytes7; + uint32_t lenInBytes8 = lengthInBytes8; + uint8_t *pBufOut1 = pBufferOut1; + uint8_t *pBufOut2 = pBufferOut2; + uint8_t *pBufOut3 = pBufferOut3; + uint8_t *pBufOut4 = pBufferOut4; + uint8_t *pBufOut5 = pBufferOut5; + uint8_t *pBufOut6 = pBufferOut6; + uint8_t *pBufOut7 = pBufferOut7; + uint8_t *pBufOut8 = pBufferOut8; + const uint8_t *pBufIn1 = pBufferIn1; + const uint8_t *pBufIn2 = pBufferIn2; + const uint8_t *pBufIn3 = pBufferIn3; + const uint8_t *pBufIn4 = pBufferIn4; + const uint8_t *pBufIn5 = pBufferIn5; + const uint8_t *pBufIn6 = pBufferIn6; + const uint8_t *pBufIn7 = pBufferIn7; + const uint8_t *pBufIn8 = pBufferIn8; + + bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8(&ctx, pHandle, pIV1, pIV2, pIV3, + pIV4, pIV5, pIV6, pIV7, pIV8); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_8_4(&ctx, &L); + + lenInBytes1 -= bytes; + lenInBytes2 -= bytes; + lenInBytes3 -= bytes; + lenInBytes4 -= bytes; + lenInBytes5 -= bytes; + lenInBytes6 -= bytes; + lenInBytes7 -= bytes; + 
lenInBytes8 -= bytes; + + /* generates 8 sets at a time on all streams */ + for (i = qwords; i != 0; i--) { + snow3g_keystream_8_8(&ctx, &L, &H); + + pBufIn1 = xor_keystrm_rev(pBufOut1, pBufIn1, + _mm256_extract_epi64(L, 0)); + pBufIn2 = xor_keystrm_rev(pBufOut2, pBufIn2, + _mm256_extract_epi64(L, 1)); + pBufIn3 = xor_keystrm_rev(pBufOut3, pBufIn3, + _mm256_extract_epi64(H, 0)); + pBufIn4 = xor_keystrm_rev(pBufOut4, pBufIn4, + _mm256_extract_epi64(H, 1)); + pBufIn5 = xor_keystrm_rev(pBufOut5, pBufIn5, + _mm256_extract_epi64(L, 2)); + pBufIn6 = xor_keystrm_rev(pBufOut6, pBufIn6, + _mm256_extract_epi64(L, 3)); + pBufIn7 = xor_keystrm_rev(pBufOut7, pBufIn7, + _mm256_extract_epi64(H, 2)); + pBufIn8 = xor_keystrm_rev(pBufOut8, pBufIn8, + _mm256_extract_epi64(H, 3)); + + pBufOut1 += SNOW3G_8_BYTES; + pBufOut2 += SNOW3G_8_BYTES; + pBufOut3 += SNOW3G_8_BYTES; + pBufOut4 += SNOW3G_8_BYTES; + pBufOut5 += SNOW3G_8_BYTES; + pBufOut6 += SNOW3G_8_BYTES; + pBufOut7 += SNOW3G_8_BYTES; + pBufOut8 += SNOW3G_8_BYTES; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (lenInBytes1) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + } + + if (lenInBytes2) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + } + + if (lenInBytes3) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3); + } + + if (lenInBytes4) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4); + } + + if (lenInBytes5) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, pBufIn5, pBufOut5, lenInBytes5); + } + + if (lenInBytes6) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, pBufIn6, pBufOut6, lenInBytes6); + } + + if (lenInBytes7) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + f8_snow3g(&ctx7, pBufIn7, pBufOut7, lenInBytes7); + } + + if (lenInBytes8) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, pBufIn8, pBufOut8, lenInBytes8); + } + +#ifdef SAFE_DATA + H = _mm256_setzero_si256(); + L = _mm256_setzero_si256(); + CLEAR_MEM(&ctx, sizeof(ctx)); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 32 multi: + * Processes 8 packets 32 bytes at a time. + * Uses same key schedule for each buffer. 
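+ * Requires AVX2: each iteration XORs a full 32-byte keystream block
+ * into every stream; shorter tails are finished with the
+ * single-buffer path.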
+ *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_32(uint32_t bytes, + const snow3g_key_schedule_t *pKey, + const void *pIV1, const void *pIV2, + const void *pIV3, const void *pIV4, + const void *pIV5, const void *pIV6, + const void *pIV7, const void *pIV8, + const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, void *pBufferOut8, + const uint32_t lengthInBytes8) +{ + snow3gKeyState8_t ctx; + uint32_t i; + uint32_t lenInBytes1 = lengthInBytes1; + uint32_t lenInBytes2 = lengthInBytes2; + uint32_t lenInBytes3 = lengthInBytes3; + uint32_t lenInBytes4 = lengthInBytes4; + uint32_t lenInBytes5 = lengthInBytes5; + uint32_t lenInBytes6 = lengthInBytes6; + uint32_t lenInBytes7 = lengthInBytes7; + uint32_t lenInBytes8 = lengthInBytes8; + uint8_t *pBufOut1 = pBufferOut1; + uint8_t *pBufOut2 = pBufferOut2; + uint8_t *pBufOut3 = pBufferOut3; + uint8_t *pBufOut4 = pBufferOut4; + uint8_t *pBufOut5 = pBufferOut5; + uint8_t *pBufOut6 = pBufferOut6; + uint8_t *pBufOut7 = pBufferOut7; + uint8_t *pBufOut8 = pBufferOut8; + const uint8_t *pBufIn1 = pBufferIn1; + const uint8_t *pBufIn2 = pBufferIn2; + const uint8_t *pBufIn3 = pBufferIn3; + const uint8_t *pBufIn4 = pBufferIn4; + const uint8_t *pBufIn5 = pBufferIn5; + const uint8_t *pBufIn6 = pBufferIn6; + const uint8_t *pBufIn7 = pBufferIn7; + const uint8_t *pBufIn8 = pBufferIn8; + + uint32_t blocks = bytes / 32; + + bytes = blocks * 32; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8(&ctx, pKey, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6, + pIV7, pIV8); + + /* Clock FSM and LFSR once, ignore the keystream */ + __m256i ks[8]; + + snow3g_keystream_8_4(&ctx, ks); + + lenInBytes1 -= bytes; + lenInBytes2 -= bytes; + lenInBytes3 -= bytes; + lenInBytes4 -= bytes; + lenInBytes5 -= bytes; + lenInBytes6 -= bytes; + lenInBytes7 -= bytes; + lenInBytes8 -= bytes; + + __m256i in[8]; + + /* generates 8 sets at a time on all streams */ + for (i = 0; i < blocks; i++) { + + in[0] = _mm256_loadu_si256((const __m256i *)pBufIn1); + in[1] = _mm256_loadu_si256((const __m256i *)pBufIn2); + in[2] = _mm256_loadu_si256((const __m256i *)pBufIn3); + in[3] = _mm256_loadu_si256((const __m256i *)pBufIn4); + in[4] = _mm256_loadu_si256((const __m256i *)pBufIn5); + in[5] = _mm256_loadu_si256((const __m256i *)pBufIn6); + in[6] = _mm256_loadu_si256((const __m256i *)pBufIn7); + in[7] = _mm256_loadu_si256((const __m256i *)pBufIn8); + + snow3g_keystream_8_32(&ctx, ks); + + _mm256_storeu_si256((__m256i *)pBufOut1, + _mm256_xor_si256(in[0], ks[0])); + _mm256_storeu_si256((__m256i *)pBufOut2, + _mm256_xor_si256(in[1], ks[1])); + _mm256_storeu_si256((__m256i *)pBufOut3, + _mm256_xor_si256(in[2], ks[2])); + _mm256_storeu_si256((__m256i *)pBufOut4, + _mm256_xor_si256(in[3], ks[3])); + _mm256_storeu_si256((__m256i *)pBufOut5, + _mm256_xor_si256(in[4], ks[4])); + _mm256_storeu_si256((__m256i *)pBufOut6, + _mm256_xor_si256(in[5], ks[5])); + _mm256_storeu_si256((__m256i *)pBufOut7, + 
_mm256_xor_si256(in[6], ks[6])); + _mm256_storeu_si256((__m256i *)pBufOut8, + _mm256_xor_si256(in[7], ks[7])); + + pBufIn1 += 32; + pBufIn2 += 32; + pBufIn3 += 32; + pBufIn4 += 32; + pBufIn5 += 32; + pBufIn6 += 32; + pBufIn7 += 32; + pBufIn8 += 32; + + pBufOut1 += 32; + pBufOut2 += 32; + pBufOut3 += 32; + pBufOut4 += 32; + pBufOut5 += 32; + pBufOut6 += 32; + pBufOut7 += 32; + pBufOut8 += 32; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (lenInBytes1) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + } + + if (lenInBytes2) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + } + + if (lenInBytes3) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3); + } + + if (lenInBytes4) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4); + } + + if (lenInBytes5) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, pBufIn5, pBufOut5, lenInBytes5); + } + + if (lenInBytes6) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, pBufIn6, pBufOut6, lenInBytes6); + } + + if (lenInBytes7) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + f8_snow3g(&ctx7, pBufIn7, pBufOut7, lenInBytes7); + } + + if (lenInBytes8) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, pBufIn8, pBufOut8, lenInBytes8); + } + +#ifdef SAFE_DATA + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_MEM(&ks, sizeof(ks)); + CLEAR_MEM(&in, sizeof(in)); +#endif /* SAFE_DATA */ +} +#endif /* AVX2 */ + +/*--------------------------------------------------------- + * @description + * Snow3G F8 8 buffer, multi-key: + * Eight packets enc/dec with eight respective key schedules. + * The 8 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. 
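+ * Without AVX2 support the call falls back to eight independent
+ * single-buffer operations.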
+ *---------------------------------------------------------*/ +void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[], + const void * const IV[], + const void * const BufferIn[], + void *BufferOut[], + const uint32_t lengthInBytes[]) +{ + int i; + +#ifdef SAFE_PARAM + if ((pKey == NULL) || (IV == NULL) || (BufferIn == NULL) || + (BufferOut == NULL) || (lengthInBytes == NULL)) + return; + + for (i = 0; i < 8; i++) + if ((pKey[i] == NULL) || (IV[i] == NULL) || + (BufferIn[i] == NULL) || (BufferOut[i] == NULL) || + (lengthInBytes[i] == 0) || + (lengthInBytes[i] > SNOW3G_MAX_BYTELEN)) + return; +#endif + +#ifndef AVX2 + /* basic C workaround for lack of non AVX2 implementation */ + for (i = 0; i < 8; i++) + SNOW3G_F8_1_BUFFER(pKey[i], IV[i], BufferIn[i], BufferOut[i], + lengthInBytes[i]); +#else + uint32_t bytes = lengthInBytes[0]; + + /* find min byte lenght */ + for (i = 1; i < 8; i++) + if (lengthInBytes[i] < bytes) + bytes = lengthInBytes[i]; + + if (bytes % 32) { + snow3g_8_buffer_ks_8_multi(bytes, pKey, IV, BufferIn, BufferOut, + lengthInBytes); + } else { + snow3g_8_buffer_ks_32_multi(bytes, pKey, IV, BufferIn, + BufferOut, lengthInBytes); + } +#ifdef SAFE_DATA + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#endif /* AVX2 */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 8 buffer: + * Eight packets enc/dec with the same key schedule. + * The 8 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. + * Uses AVX instructions. + *---------------------------------------------------------*/ +void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufIn1, + void *pBufOut1, + const uint32_t lenInBytes1, + const void *pBufIn2, + void *pBufOut2, + const uint32_t lenInBytes2, + const void *pBufIn3, + void *pBufOut3, + const uint32_t lenInBytes3, + const void *pBufIn4, + void *pBufOut4, + const uint32_t lenInBytes4, + const void *pBufIn5, + void *pBufOut5, + const uint32_t lenInBytes5, + const void *pBufIn6, + void *pBufOut6, + const uint32_t lenInBytes6, + const void *pBufIn7, + void *pBufOut7, + const uint32_t lenInBytes7, + const void *pBufIn8, + void *pBufOut8, + const uint32_t lenInBytes8) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || + (pIV1 == NULL) || (pIV2 == NULL) || + (pIV3 == NULL) || (pIV4 == NULL) || + (pIV5 == NULL) || (pIV6 == NULL) || + (pIV7 == NULL) || (pIV8 == NULL) || + (pBufIn1 == NULL) || (pBufOut1 == NULL) || + (pBufIn2 == NULL) || (pBufOut2 == NULL) || + (pBufIn3 == NULL) || (pBufOut3 == NULL) || + (pBufIn4 == NULL) || (pBufOut4 == NULL) || + (pBufIn5 == NULL) || (pBufOut5 == NULL) || + (pBufIn6 == NULL) || (pBufOut6 == NULL) || + (pBufIn7 == NULL) || (pBufOut7 == NULL) || + (pBufIn8 == NULL) || (pBufOut8 == NULL) || + (lenInBytes1 == 0) || (lenInBytes1 > SNOW3G_MAX_BYTELEN) || + (lenInBytes2 == 0) || (lenInBytes2 > SNOW3G_MAX_BYTELEN) || + (lenInBytes3 == 0) || (lenInBytes3 > SNOW3G_MAX_BYTELEN) || + (lenInBytes4 == 0) || (lenInBytes4 > SNOW3G_MAX_BYTELEN) || + (lenInBytes5 == 0) || (lenInBytes5 > SNOW3G_MAX_BYTELEN) || + (lenInBytes6 == 0) || (lenInBytes6 > SNOW3G_MAX_BYTELEN) || + (lenInBytes7 == 0) || (lenInBytes7 > SNOW3G_MAX_BYTELEN) || + (lenInBytes8 == 0) || (lenInBytes8 > SNOW3G_MAX_BYTELEN)) + return; +#endif + +#ifdef AVX2 + uint32_t bytes1 = + 
(lenInBytes1 < lenInBytes2 ? lenInBytes1 + : lenInBytes2); /* number of bytes */ + uint32_t bytes2 = + (lenInBytes3 < lenInBytes4 ? lenInBytes3 + : lenInBytes4); /* number of bytes */ + uint32_t bytes3 = + (lenInBytes5 < lenInBytes6 ? lenInBytes5 + : lenInBytes6); /* number of bytes */ + uint32_t bytes4 = + (lenInBytes7 < lenInBytes8 ? lenInBytes7 + : lenInBytes8); /* number of bytes */ + uint32_t bytesq1 = + (bytes1 < bytes2) ? bytes1 : bytes2; /* min number of bytes */ + uint32_t bytesq2 = (bytes3 < bytes4) ? bytes3 : bytes4; + uint32_t bytes = (bytesq1 < bytesq2) ? bytesq1 : bytesq2; + + if (bytes % 32) { + snow3g_8_buffer_ks_8( + bytes, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6, + pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2, + pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3, + pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5, + lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7, + pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8); + } else { + snow3g_8_buffer_ks_32( + bytes, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6, + pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2, + pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3, + pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5, + lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7, + pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8); + } +#ifdef SAFE_DATA + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#else /* ~AVX2 */ + SNOW3G_F8_2_BUFFER(pHandle, pIV1, pIV2, pBufIn1, pBufOut1, lenInBytes1, + pBufIn2, pBufOut2, lenInBytes2); + + SNOW3G_F8_2_BUFFER(pHandle, pIV3, pIV4, pBufIn3, pBufOut3, lenInBytes3, + pBufIn4, pBufOut4, lenInBytes4); + + SNOW3G_F8_2_BUFFER(pHandle, pIV5, pIV6, pBufIn5, pBufOut5, lenInBytes5, + pBufIn6, pBufOut6, lenInBytes6); + + SNOW3G_F8_2_BUFFER(pHandle, pIV7, pIV8, pBufIn7, pBufOut7, lenInBytes7, + pBufIn8, pBufOut8, lenInBytes8); +#endif /* AVX */ +} + +/****************************************************************************** + * @description + * Snow3G F8 multi packet: + * Performs F8 enc/dec on [n] packets. The operation is performed in-place. + * The input IV's are passed in Little Endian format. + * The KeySchedule is in Little Endian format. 
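+ * Up to 16 packets are supported; buffers are sorted by decreasing
+ * length and processed in multi-buffer groups before the remainder
+ * is handled one packet at a time.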
+ ******************************************************************************/ +void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufLenInBytes[], + const uint32_t packetCount) +{ +#ifdef SAFE_PARAM + uint32_t i; + + if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) || + (pBufferOut == NULL) || (bufLenInBytes == NULL)) + return; + + for (i = 0; i < packetCount; i++) + if ((IV[i] == NULL) || (pBufferIn[i] == NULL) || + (pBufferOut[i] == NULL) || (bufLenInBytes[i] == 0) || + (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN)) + return; +#endif + if (packetCount > 16) { + pBufferOut[0] = NULL; + printf("packetCount too high (%d)\n", packetCount); + return; + } + + uint32_t packet_index, inner_index, pktCnt = packetCount; + int sortNeeded = 0, tempLen = 0; + uint8_t *srctempbuff; + uint8_t *dsttempbuff; + uint8_t *ivtempbuff; + uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_16] = {0}; + + memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); + memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *)); + memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); + memcpy((void *)pIV, IV, packetCount * sizeof(void *)); + + packet_index = packetCount; + + while (packet_index--) { + + /* check if all packets are sorted by decreasing length */ + if (packet_index > 0 && lensBuf[packet_index - 1] < + lensBuf[packet_index]) { + /* this packet array is not correctly sorted */ + sortNeeded = 1; + } + } + + if (sortNeeded) { + + /* sort packets in decreasing buffer size from [0] to + [n]th packet, ** where buffer[0] will contain longest + buffer and buffer[n] will contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths */ + packet_index = packetCount; + while (packet_index--) { + + inner_index = packet_index; + while (inner_index--) { + + if (lensBuf[packet_index] > + lensBuf[inner_index]) { + + /* swap buffers to arrange in + descending order from [0]. 
*/ + srctempbuff = pSrcBuf[packet_index]; + dsttempbuff = pDstBuf[packet_index]; + ivtempbuff = pIV[packet_index]; + tempLen = lensBuf[packet_index]; + + pSrcBuf[packet_index] = + pSrcBuf[inner_index]; + pDstBuf[packet_index] = + pDstBuf[inner_index]; + pIV[packet_index] = pIV[inner_index]; + lensBuf[packet_index] = + lensBuf[inner_index]; + + pSrcBuf[inner_index] = srctempbuff; + pDstBuf[inner_index] = dsttempbuff; + pIV[inner_index] = ivtempbuff; + lensBuf[inner_index] = tempLen; + } + } /* for inner packet index (inner bubble-sort) */ + } /* for outer packet index (outer bubble-sort) */ + } /* if sortNeeded */ + + packet_index = 0; + /* process 8 buffers at-a-time */ +#ifdef AVX2 + while (pktCnt >= 8) { + pktCnt -= 8; + SNOW3G_F8_8_BUFFER(pCtx, pIV[packet_index], + pIV[packet_index + 1], + pIV[packet_index + 2], + pIV[packet_index + 3], + pIV[packet_index + 4], + pIV[packet_index + 5], + pIV[packet_index + 6], + pIV[packet_index + 7], + pSrcBuf[packet_index], + pDstBuf[packet_index], + lensBuf[packet_index], + pSrcBuf[packet_index + 1], + pDstBuf[packet_index + 1], + lensBuf[packet_index + 1], + pSrcBuf[packet_index + 2], + pDstBuf[packet_index + 2], + lensBuf[packet_index + 2], + pSrcBuf[packet_index + 3], + pDstBuf[packet_index + 3], + lensBuf[packet_index + 3], + pSrcBuf[packet_index + 4], + pDstBuf[packet_index + 4], + lensBuf[packet_index + 4], + pSrcBuf[packet_index + 5], + pDstBuf[packet_index + 5], + lensBuf[packet_index + 5], + pSrcBuf[packet_index + 6], + pDstBuf[packet_index + 6], + lensBuf[packet_index + 6], + pSrcBuf[packet_index + 7], + pDstBuf[packet_index + 7], + lensBuf[packet_index + 7]); + packet_index += 8; + } +#endif + /* process 4 buffers at-a-time */ + while (pktCnt >= 4) { + pktCnt -= 4; + SNOW3G_F8_4_BUFFER(pCtx, pIV[packet_index + 0], + pIV[packet_index + 1], + pIV[packet_index + 2], + pIV[packet_index + 3], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0], + pSrcBuf[packet_index + 1], + pDstBuf[packet_index + 1], + lensBuf[packet_index + 1], + pSrcBuf[packet_index + 2], + pDstBuf[packet_index + 2], + lensBuf[packet_index + 2], + pSrcBuf[packet_index + 3], + pDstBuf[packet_index + 3], + lensBuf[packet_index + 3]); + packet_index += 4; + } + + /* process 2 packets at-a-time */ + while (pktCnt >= 2) { + pktCnt -= 2; + SNOW3G_F8_2_BUFFER(pCtx, pIV[packet_index + 0], + pIV[packet_index + 1], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0], + pSrcBuf[packet_index + 1], + pDstBuf[packet_index + 1], + lensBuf[packet_index + 1]); + packet_index += 2; + } + + /* remaining packets are processed 1 at a time */ + while (pktCnt--) { + SNOW3G_F8_1_BUFFER(pCtx, pIV[packet_index + 0], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0]); + packet_index++; + } +} + +void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufLenInBytes[], + const uint32_t packetCount) +{ +#ifdef SAFE_PARAM + uint32_t i; + + if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) || + (pBufferOut == NULL) || (bufLenInBytes == NULL)) + return; + + for (i = 0; i < packetCount; i++) + if ((pCtx[i] == NULL) || (IV[i] == NULL) || + (pBufferIn[i] == NULL) || (pBufferOut[i] == NULL) || + (bufLenInBytes[i] == 0) || + (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN)) + return; +#endif + if (packetCount > 16) { + pBufferOut[0] = NULL; + printf("packetCount too high (%d)\n", 
packetCount); + return; + } + + uint32_t packet_index, inner_index, pktCnt = packetCount; + int sortNeeded = 0, tempLen = 0; + uint8_t *srctempbuff; + uint8_t *dsttempbuff; + uint8_t *ivtempbuff; + snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_16] = {0}; + snow3g_key_schedule_t *tempCtx; + + memcpy((void *)pCtxBuf, pCtx, packetCount * sizeof(void *)); + memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); + memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *)); + memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); + memcpy((void *)pIV, IV, packetCount * sizeof(void *)); + + packet_index = packetCount; + + while (packet_index--) { + + /* check if all packets are sorted by decreasing length */ + if (packet_index > 0 && lensBuf[packet_index - 1] < + lensBuf[packet_index]) { + /* this packet array is not correctly sorted */ + sortNeeded = 1; + } + } + + if (sortNeeded) { + /* sort packets in decreasing buffer size from [0] to [n]th + packet, where buffer[0] will contain longest buffer and + buffer[n] will contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths */ + packet_index = packetCount; + while (packet_index--) { + inner_index = packet_index; + while (inner_index--) { + if (lensBuf[packet_index] > + lensBuf[inner_index]) { + /* swap buffers to arrange in + descending order from [0]. */ + srctempbuff = pSrcBuf[packet_index]; + dsttempbuff = pDstBuf[packet_index]; + ivtempbuff = pIV[packet_index]; + tempLen = lensBuf[packet_index]; + tempCtx = pCtxBuf[packet_index]; + + pSrcBuf[packet_index] = + pSrcBuf[inner_index]; + pDstBuf[packet_index] = + pDstBuf[inner_index]; + pIV[packet_index] = pIV[inner_index]; + lensBuf[packet_index] = + lensBuf[inner_index]; + pCtxBuf[packet_index] = + pCtxBuf[inner_index]; + + pSrcBuf[inner_index] = srctempbuff; + pDstBuf[inner_index] = dsttempbuff; + pIV[inner_index] = ivtempbuff; + lensBuf[inner_index] = tempLen; + pCtxBuf[inner_index] = tempCtx; + } + } /* for inner packet index (inner bubble-sort) */ + } /* for outer packet index (outer bubble-sort) */ + } /* if sortNeeded */ + + packet_index = 0; + /* process 8 buffers at-a-time */ +#ifdef AVX2 + while (pktCnt >= 8) { + pktCnt -= 8; + SNOW3G_F8_8_BUFFER_MULTIKEY( + (const snow3g_key_schedule_t * const *) + &pCtxBuf[packet_index], + (const void * const *)&pIV[packet_index], + (const void * const *)&pSrcBuf[packet_index], + (void **)&pDstBuf[packet_index], + &lensBuf[packet_index]); + packet_index += 8; + } +#endif + /* TODO process 4 buffers at-a-time */ + /* TODO process 2 packets at-a-time */ + /* remaining packets are processed 1 at a time */ + while (pktCnt--) { + SNOW3G_F8_1_BUFFER(pCtxBuf[packet_index + 0], + pIV[packet_index + 0], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0]); + packet_index++; + } +} + +/*--------------------------------------------------------- + * @description + * Snow3G F9 1 buffer + * Single buffer digest with IV and precomputed key schedule + *---------------------------------------------------------*/ +void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV == 
NULL) || + (pBufferIn == NULL) || (pDigest == NULL) || + (lengthInBits == 0) || (lengthInBits > SNOW3G_MAX_BITLEN)) + return; +#endif + snow3gKeyState1_t ctx; + uint32_t z[5]; + uint64_t lengthInQwords, E, V, P; + uint64_t i, rem_bits; + const uint64_t *inputBuffer; + + inputBuffer = (const uint64_t *)pBufferIn; + + /* Initialize the snow3g key schedule */ + snow3gStateInitialize_1(&ctx, pHandle, pIV); + + /*Generate 5 keystream words*/ + snow3g_f9_keystream_words(&ctx, &z[0]); + + P = ((uint64_t)z[0] << 32) | ((uint64_t)z[1]); + + lengthInQwords = lengthInBits / 64; + + E = 0; + /* all blocks except the last one */ + for (i = 0; i < lengthInQwords; i++) { + V = BSWAP64(inputBuffer[i]); + E = multiply_and_reduce64(E ^ V, P); + } + + /* last bits of last block if any left */ + rem_bits = lengthInBits % 64; + if (rem_bits) { + /* last bytes, do not go past end of buffer */ + memcpy(&V, &inputBuffer[i], (rem_bits + 7) / 8); + V = BSWAP64(V); + V &= (((uint64_t)-1) << (64 - rem_bits)); /* mask extra bits */ + E = multiply_and_reduce64(E ^ V, P); + } + + /* Multiply by Q */ + E = multiply_and_reduce64(E ^ lengthInBits, + (((uint64_t)z[2] << 32) | ((uint64_t)z[3]))); + + /* Final MAC */ + *(uint32_t *)pDigest = + (uint32_t)BSWAP64(E ^ ((uint64_t)z[4] << 32)); +#ifdef SAFE_DATA + CLEAR_VAR(&E, sizeof(E)); + CLEAR_VAR(&V, sizeof(V)); + CLEAR_VAR(&P, sizeof(P)); + CLEAR_MEM(&z, sizeof(z)); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} + +#endif /* SNOW3G_COMMON_H */ diff --git a/src/spdk/intel-ipsec-mb/include/snow3g_internal.h b/src/spdk/intel-ipsec-mb/include/snow3g_internal.h new file mode 100644 index 000000000..287d60be1 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/snow3g_internal.h @@ -0,0 +1,638 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#ifndef _SNOW3G_INTERNAL_H_ +#define _SNOW3G_INTERNAL_H_ + +#include "intel-ipsec-mb.h" +#include "wireless_common.h" +#include "constant_lookup.h" + +#define MAX_KEY_LEN (16) +#define SNOW3G_4_BYTES (4) +#define SNOW3G_8_BYTES (8) +#define SNOW3G_8_BITS (8) +#define SNOW3G_16_BYTES (16) +#define SNOW3G_16_BITS (16) + +#define SNOW3G_BLOCK_SIZE (8) + +#define SNOW3G_KEY_LEN_IN_BYTES (16) /* 128b */ +#define SNOW3G_IV_LEN_IN_BYTES (16) /* 128b */ + +#define SNOW3GCONSTANT (0x1b) + +/* Range of input data for SNOW3G is from 1 to 2^32 bits */ +#define SNOW3G_MIN_LEN 1 +#define SNOW3G_MAX_BITLEN (UINT32_MAX) +#define SNOW3G_MAX_BYTELEN (UINT32_MAX / 8) + +#define ComplementaryMask64(x) ((~(x) % 64) + 1) +#define ComplementaryMask32(x) ((~(x) % 32) + 1) + +#ifndef SAFE_LOOKUP +/*standard lookup */ +#define SNOW3G_LOOKUP_W0(table, idx, size) \ + table[idx].w0.v +#define SNOW3G_LOOKUP_W1(table, idx, size) \ + table[idx].w1.v +#define SNOW3G_LOOKUP_W2(table, idx, size) \ + table[idx].w2.v +#define SNOW3G_LOOKUP_W3(table, idx, size) \ + table[idx].w3.v +#else +/* contant time lookup */ +#if defined (AVX) || defined (AVX2) +#define SNOW3G_LOOKUP_W0(table, idx, size) \ + ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 0)) +#define SNOW3G_LOOKUP_W1(table, idx, size) \ + ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 8)) +#define SNOW3G_LOOKUP_W2(table, idx, size) \ + ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 16)) +#define SNOW3G_LOOKUP_W3(table, idx, size) \ + ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 24)) +#else +#define SNOW3G_LOOKUP_W0(table, idx, size) \ + ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 0)) +#define SNOW3G_LOOKUP_W1(table, idx, size) \ + ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 8)) +#define SNOW3G_LOOKUP_W2(table, idx, size) \ + ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 16)) +#define SNOW3G_LOOKUP_W3(table, idx, size) \ + ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 24)) +#endif /* AVX || AVX2 */ +#endif /* SAFE_LOOKUP */ + +typedef union SafeBuffer { + uint64_t b64; + uint32_t b32[2]; + uint8_t b8[SNOW3G_8_BYTES]; +} SafeBuf; + +typedef struct snow3gKeyState1_s { + /* 16 LFSR stages */ + uint32_t LFSR_S[16]; + /* 3 FSM states */ + uint32_t FSM_R3; + uint32_t FSM_R2; + uint32_t FSM_R1; +} DECLARE_ALIGNED(snow3gKeyState1_t, 16); + +typedef struct snow3gKeyState4_s { + /* 16 LFSR stages */ + __m128i LFSR_X[16]; + /* 3 FSM states */ + __m128i FSM_X[3]; + uint32_t iLFSR_X; + +} snow3gKeyState4_t; + + +#ifdef _WIN32 +#pragma pack(push,1) +#define DECLARE_PACKED_UINT32(x) uint32_t x +#else +#define DECLARE_PACKED_UINT32(x) uint32_t x __attribute__((__packed__)) +#endif + +typedef union snow3gTableEntry_u { + uint64_t v; + struct { + uint8_t shift[3]; + DECLARE_PACKED_UINT32(v); + } w3; + struct { + uint8_t shift[2]; + DECLARE_PACKED_UINT32(v); + } w2; + struct { + uint8_t shift[1]; + DECLARE_PACKED_UINT32(v); + } w1; + struct { + uint8_t shift[4]; + DECLARE_PACKED_UINT32(v); + } w0; +} snow3gTableEntry_t; +#ifdef _WIN32 +#pragma pack(pop) +#endif + +#define rotl32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) + +#define rotr32(x, n) (((x) << (32 - (n))) | ((x) >> (n))) + +#define rotl8(x, n) (((x) << (n)) | ((x) >> (8 - (n)))) + +#define rotr8(x, n) (((x) << (8 - (n))) | ((x) >> (n))) + +/************************************************************************* + * @description - snow3g internal tables + *************************************************************************/ + +extern const 
int snow3g_table_A_mul[256]; +extern const int snow3g_table_A_div[256]; +extern snow3gTableEntry_t snow3g_table_S1[256]; +extern snow3gTableEntry_t snow3g_table_S2[256]; +extern const int S1_T0[256]; +extern const int S1_T1[256]; +extern const int S1_T2[256]; +extern const int S1_T3[256]; +extern const int S2_T0[256]; +extern const int S2_T1[256]; +extern const int S2_T2[256]; +extern const int S2_T3[256]; + +/* ------------------------------------------------------------------- + * combined S-Box processing for reduced instruction dependencies + * + * S1_S2_1 : 2 S-Box , 1 packet at a time + * S1_S2_S3_1 : 3 S-Box at the same time + * + * S1_S2_4 : 2 S-Box , 4 packets at a time + * + * ------------------------------------------------------------------ */ +#ifdef AVX2 +#define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \ + _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1) + +#ifndef _mm256_loadu2_m128i +#define _mm256_loadu2_m128i(hi, lo) \ + _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo)), \ + _mm_loadu_si128((const __m128i *)hi), 1) +#endif /* _mm256_loadu2_m128i */ + +typedef struct snow3gKeyState8_s { + /* 16 LFSR stages */ + __m256i LFSR_X[16]; + /* 3 FSM states */ + __m256i FSM_X[3]; + uint32_t iLFSR_X; + +} snow3gKeyState8_t; + +/* Sbox Snow3g_S1 and Snow3g_S2 with dependency unrolling + * for n in [0..3] + * w[n-1] = k; y[n] = Snow3g_S2(w[n]); k = Snow3g_S1(x[n]) + * + * + */ +#define S1_S2_8(y, w, x, k, l, n) \ + do { \ + uint8_t w0, w1, w2, w3; \ + uint8_t x0, x1, x2, x3; \ + uint32_t ty = l; \ + w3 = _mm256_extract_epi8(w, (4 * n + 0)); \ + w2 = _mm256_extract_epi8(w, (4 * n + 1)); \ + w1 = _mm256_extract_epi8(w, (4 * n + 2)); \ + w0 = _mm256_extract_epi8(w, (4 * n + 3)); \ + l = snow3g_table_S2[w3].w3.v ^ snow3g_table_S2[w2].w2.v ^ \ + snow3g_table_S2[w1].w1.v ^ snow3g_table_S2[w0].w0.v; \ + if (n != 0) \ + w = _mm256_insert_epi32(w, k, (n - 1)); \ + if (n != 0) \ + y = _mm256_insert_epi32(y, ty, (n - 1)); \ + x3 = _mm256_extract_epi8(x, (4 * n + 0)); \ + x2 = _mm256_extract_epi8(x, (4 * n + 1)); \ + x1 = _mm256_extract_epi8(x, (4 * n + 2)); \ + x0 = _mm256_extract_epi8(x, (4 * n + 3)); \ + k = snow3g_table_S1[x3].w3.v ^ snow3g_table_S1[x2].w2.v ^ \ + snow3g_table_S1[x1].w1.v ^ snow3g_table_S1[x0].w0.v; \ + if (n == 7) \ + w = _mm256_insert_epi32(w, k, n); \ + if (n == 7) \ + y = _mm256_insert_epi32(y, l, n); \ + } while (0) +#endif /* AVX2 */ + + +#if defined (NO_AESNI) || defined (SAFE_LOOKUP) +/* help compilers to interleave the + * operations and table access latencies + */ + +/* Sbox Snow3g_S1 and Snow3g_S2, simple C code + * y = Snow3g_S2(w); w = Snow3g_S1(x); + */ +#define S1_S2_1(y, w, x) \ + do { \ + uint32_t w0, w1, w2, w3; \ + uint32_t x0, x1, x2, x3; \ + uint32_t tw, tx; \ + w3 = w & 0xff; \ + x3 = x & 0xff; \ + tw = SNOW3G_LOOKUP_W3(snow3g_table_S2, w3, \ + sizeof(snow3g_table_S2)); \ + tx = SNOW3G_LOOKUP_W3(snow3g_table_S1, x3, \ + sizeof(snow3g_table_S1)); \ + w0 = w >> 24; \ + x0 = x >> 24; \ + tw ^= SNOW3G_LOOKUP_W0(snow3g_table_S2, w0, \ + sizeof(snow3g_table_S2)); \ + tx ^= SNOW3G_LOOKUP_W0(snow3g_table_S1, x0, \ + sizeof(snow3g_table_S1)); \ + w1 = (w >> 16) & 0xff; \ + x1 = (x >> 16) & 0xff; \ + tw ^= SNOW3G_LOOKUP_W1(snow3g_table_S2, w1, \ + sizeof(snow3g_table_S2)); \ + tx ^= SNOW3G_LOOKUP_W1(snow3g_table_S1, x1, \ + sizeof(snow3g_table_S1)); \ + w2 = (w >> 8) & 0xff; \ + x2 = (x >> 8) & 0xff; \ + y = tw ^ SNOW3G_LOOKUP_W2(snow3g_table_S2, w2, \ + sizeof(snow3g_table_S2)); \ + w = tx ^ 
SNOW3G_LOOKUP_W2(snow3g_table_S1, x2, \ + sizeof(snow3g_table_S1)); \ + } while (0) + +/* Sbox Snow3g_S1 and Snow3g_S2, simple C code + * y = Snow3g_S2(w); w = Snow3g_S1(x); u = Snow3g_S1(z); + */ +#define S1_S2_S3_1(y, w, x, u, z) \ + do { \ + unsigned w0, w1, w2, w3; \ + unsigned x0, x1, x2, x3; \ + unsigned z0, z1, z2, z3; \ + uint32_t tw, tx, tz; \ + w3 = w & 0xff; \ + x3 = x & 0xff; \ + z3 = z & 0xff; \ + tw = SNOW3G_LOOKUP_W3(snow3g_table_S2, w3, \ + sizeof(snow3g_table_S2)); \ + tx = SNOW3G_LOOKUP_W3(snow3g_table_S1, x3, \ + sizeof(snow3g_table_S1)); \ + tz = SNOW3G_LOOKUP_W3(snow3g_table_S1, z3, \ + sizeof(snow3g_table_S1)); \ + w0 = w >> 24; \ + x0 = x >> 24; \ + z0 = z >> 24; \ + tw ^= SNOW3G_LOOKUP_W0(snow3g_table_S2, w0, \ + sizeof(snow3g_table_S2)); \ + tx ^= SNOW3G_LOOKUP_W0(snow3g_table_S1, x0, \ + sizeof(snow3g_table_S1)); \ + tz ^= SNOW3G_LOOKUP_W0(snow3g_table_S1, z0, \ + sizeof(snow3g_table_S1)); \ + w1 = (w >> 16) & 0xff; \ + x1 = (x >> 16) & 0xff; \ + z1 = (z >> 16) & 0xff; \ + tw ^= SNOW3G_LOOKUP_W1(snow3g_table_S2, w1, \ + sizeof(snow3g_table_S2)); \ + tx ^= SNOW3G_LOOKUP_W1(snow3g_table_S1, x1, \ + sizeof(snow3g_table_S1)); \ + tz ^= SNOW3G_LOOKUP_W1(snow3g_table_S1, z1, \ + sizeof(snow3g_table_S1)); \ + w2 = (w >> 8) & 0xff; \ + x2 = (x >> 8) & 0xff; \ + z2 = (z >> 8) & 0xff; \ + y = tw ^ SNOW3G_LOOKUP_W2(snow3g_table_S2, w2, \ + sizeof(snow3g_table_S2)); \ + w = tx ^ SNOW3G_LOOKUP_W2(snow3g_table_S1, x2, \ + sizeof(snow3g_table_S1)); \ + u = tz ^ SNOW3G_LOOKUP_W2(snow3g_table_S1, z2, \ + sizeof(snow3g_table_S1)); \ + } while (0) + +/* Sbox Snow3g_S1 and Snow3g_S2 with dependency unrolling + * for n in [0..3] + * w[n-1] = k; y[n] = Snow3g_S2(w[n]); k = Snow3g_S1(x[n]) + * + * + */ +#define S1_S2_4(y, w, x, k, l, n) \ + do { \ + unsigned w0, w1, w2, w3; \ + unsigned x0, x1, x2, x3; \ + uint32_t ty = l; \ + w3 = _mm_extract_epi8(w, (4 * n + 0)); \ + w2 = _mm_extract_epi8(w, (4 * n + 1)); \ + w1 = _mm_extract_epi8(w, (4 * n + 2)); \ + w0 = _mm_extract_epi8(w, (4 * n + 3)); \ + l = SNOW3G_LOOKUP_W3(snow3g_table_S2, w3, \ + sizeof(snow3g_table_S2)) ^ \ + SNOW3G_LOOKUP_W2(snow3g_table_S2, w2, \ + sizeof(snow3g_table_S2)) ^ \ + SNOW3G_LOOKUP_W1(snow3g_table_S2, w1, \ + sizeof(snow3g_table_S2)) ^ \ + SNOW3G_LOOKUP_W0(snow3g_table_S2, w0, \ + sizeof(snow3g_table_S2)); \ + if (n != 0) \ + w = _mm_insert_epi32(w, k, (n - 1)); \ + if (n != 0) \ + y = _mm_insert_epi32(y, ty, (n - 1)); \ + x3 = _mm_extract_epi8(x, (4 * n + 0)); \ + x2 = _mm_extract_epi8(x, (4 * n + 1)); \ + x1 = _mm_extract_epi8(x, (4 * n + 2)); \ + x0 = _mm_extract_epi8(x, (4 * n + 3)); \ + k = SNOW3G_LOOKUP_W3(snow3g_table_S1, x3, \ + sizeof(snow3g_table_S1)) ^ \ + SNOW3G_LOOKUP_W2(snow3g_table_S1, x2, \ + sizeof(snow3g_table_S1)) ^ \ + SNOW3G_LOOKUP_W1(snow3g_table_S1, x1, \ + sizeof(snow3g_table_S1)) ^ \ + SNOW3G_LOOKUP_W0(snow3g_table_S1, x0, \ + sizeof(snow3g_table_S1)); \ + if (n == 3) \ + w = _mm_insert_epi32(w, k, n); \ + if (n == 3) \ + y = _mm_insert_epi32(y, l, n); \ + } while (0) + +#else /* SSE/AVX */ + +/* use AES-NI Rijndael for Snow3G Sbox, overlap the latency + * of AESENC with Snow3g_S2 sbox calculations + */ + +/* Sbox Snow3g_S1 and Snow3g_S2, simple C code + * y = Snow3g_S2(w); w = rijndael Snow3g_S1(x); + */ +#define S1_S2_1(y, w, x) \ + do { \ + __m128i m10, m11; \ + m11 = _mm_cvtsi32_si128(x); \ + m10 = _mm_setzero_si128(); \ + m11 = _mm_shuffle_epi32(m11, 0x0); \ + m11 = _mm_aesenc_si128(m11, m10); \ + y = Snow3g_S2(w); \ + w = _mm_cvtsi128_si32(m11); \ + } while (0) + +/* Sbox Snow3g_S1 
and Snow3g_S2 + * y = Snow3g_S2(w); w = rijndael Snow3g_S1(x); u = rijndael Snow3g_S1(z); + */ +#define S1_S2_S3_1(y, w, x, v, z) \ + do { \ + __m128i m10, m11, m12; \ + m11 = _mm_cvtsi32_si128(x); \ + m10 = _mm_setzero_si128(); \ + m11 = _mm_shuffle_epi32(m11, 0x0); \ + m11 = _mm_aesenc_si128(m11, m10); \ + m12 = _mm_cvtsi32_si128(z); \ + m12 = _mm_shuffle_epi32(m12, 0x0); \ + m12 = _mm_aesenc_si128(m12, m10); \ + y = Snow3g_S2(w); \ + w = _mm_cvtsi128_si32(m11); \ + v = _mm_cvtsi128_si32(m12); \ + } while (0) +/* Sbox Snow3g_S1 and Snow3g_S2 + * for n in [0..3] + * extract packet data + * y = Snow3g_S2(w); w = rijndael Snow3g_S1(x) + * insert the result data + */ +#define S1_S2_4(y, w, x, k, n) \ + do { \ + uint32_t ty; \ + unsigned w0, w1, w2, w3; \ + __m128i m10, m11; \ + m10 = _mm_setzero_si128(); \ + m11 = _mm_shuffle_epi32( \ + x, ((n << 6) | (n << 4) | (n << 2) | (n << 0))); \ + m11 = _mm_aesenc_si128(m11, m10); \ + w3 = _mm_extract_epi8(w, (4 * n + 0)); \ + w2 = _mm_extract_epi8(w, (4 * n + 1)); \ + w1 = _mm_extract_epi8(w, (4 * n + 2)); \ + w0 = _mm_extract_epi8(w, (4 * n + 3)); \ + ty = snow3g_table_S2[w3].w3.v ^ snow3g_table_S2[w1].w1.v ^ \ + snow3g_table_S2[w2].w2.v ^ snow3g_table_S2[w0].w0.v; \ + if (n != 0) \ + w = _mm_insert_epi32(w, k, (n - 1)); \ + k = _mm_cvtsi128_si32(m11); \ + if (n == 3) \ + w = _mm_insert_epi32(w, k, n); \ + y = _mm_insert_epi32(y, ty, n); \ + } while (0) + +#endif /* NO_AESNI || SAFE_LOOKUP */ + +/* ------------------------------------------------------------------- + * Sbox Snow3g_S1 maps a 32bit input to a 32bit output + * ------------------------------------------------------------------ */ +static inline uint32_t Snow3g_S1(uint32_t w) +{ + uint32_t w0, w1, w2, w3; + + w3 = w & 0xff; + w1 = (w >> 16) & 0xff; + w2 = (w >> 8) & 0xff; + w0 = w >> 24; + return snow3g_table_S1[w3].w3.v ^ snow3g_table_S1[w1].w1.v ^ + snow3g_table_S1[w2].w2.v ^ snow3g_table_S1[w0].w0.v; +} + +/* ------------------------------------------------------------------- + * Sbox Snow3g_S2 maps a 32bit input to a 32bit output + * ------------------------------------------------------------------ */ +static inline uint32_t Snow3g_S2(uint32_t w) +{ + uint32_t w0, w1, w2, w3; + + w3 = w & 0xff; + w1 = (w >> 16) & 0xff; + w2 = (w >> 8) & 0xff; + w0 = w >> 24; + + return snow3g_table_S2[w3].w3.v ^ snow3g_table_S2[w1].w1.v ^ + snow3g_table_S2[w2].w2.v ^ snow3g_table_S2[w0].w0.v; +} + +/* ------------------------------------------------------------------- + * LFSR array shift by 1 position + * ------------------------------------------------------------------ */ +static inline void ShiftLFSR_1(snow3gKeyState1_t *pCtx) +{ + uint32_t i; + + for (i = 0; i < 15; i++) + pCtx->LFSR_S[i] = pCtx->LFSR_S[i + 1]; +} + +/* ------------------------------------------------------------------- + * LFSR array shift by 2 positions + * ------------------------------------------------------------------ */ +static inline void ShiftTwiceLFSR_1(snow3gKeyState1_t *pCtx) +{ + int i; + + for (i = 0; i < 14; i++) + pCtx->LFSR_S[i] = pCtx->LFSR_S[i + 2]; +} + +/* ------------------------------------------------------------------- + * ClockFSM function as defined in snow3g standard + * The FSM has 2 input words S5 and S15 from the LFSR + * produces a 32 bit output word F + * ------------------------------------------------------------------ */ +static inline void ClockFSM_1(snow3gKeyState1_t *pCtx, uint32_t *data) +{ + uint32_t F, R; + + F = pCtx->LFSR_S[15] + pCtx->FSM_R1; + R = pCtx->FSM_R3 ^ pCtx->LFSR_S[5]; + 
*data = F ^ pCtx->FSM_R2; + R += pCtx->FSM_R2; + S1_S2_1(pCtx->FSM_R3, pCtx->FSM_R2, pCtx->FSM_R1); + pCtx->FSM_R1 = R; +} + +/* ------------------------------------------------------------------- + * ClockLFSR functin as defined in snow3g standard + * ------------------------------------------------------------------ */ +static inline void ClockLFSR_1(snow3gKeyState1_t *pCtx) +{ + uint32_t V = pCtx->LFSR_S[2]; + uint32_t S0 = pCtx->LFSR_S[0]; + uint32_t S11 = pCtx->LFSR_S[11]; + + V ^= snow3g_table_A_mul[S0 >> 24]; + V ^= snow3g_table_A_div[S11 & 0xff]; + V ^= S0 << 8; + V ^= S11 >> 8; + + ShiftLFSR_1(pCtx); + + pCtx->LFSR_S[15] = V; +} + +/** + ******************************************************************************* + * @description + * This function initializes the key schedule for 1 buffer for snow3g f8/f9. + * + * @param[in] pCtx Context where the scheduled keys are stored + * @param [in] pKeySched Key schedule + * @param [in] pIV IV + * + ******************************************************************************/ +static inline void +snow3gStateInitialize_1(snow3gKeyState1_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV) +{ + uint32_t K, L; + int i; + uint32_t V0, V1; + uint32_t F0, F1; + uint32_t L0, L1, L11, L12; + uint32_t R0, R1; + uint32_t FSM2, FSM3, FSM4; + const uint32_t *pIV32 = pIV; + + /* LFSR initialisation */ + for (i = 0; i < 4; i++) { + K = pKeySched->k[i]; + L = ~K; + pCtx->LFSR_S[i + 4] = K; + pCtx->LFSR_S[i + 12] = K; + pCtx->LFSR_S[i + 0] = L; + pCtx->LFSR_S[i + 8] = L; + } + + pCtx->LFSR_S[15] ^= BSWAP32(pIV32[3]); + pCtx->LFSR_S[12] ^= BSWAP32(pIV32[2]); + pCtx->LFSR_S[10] ^= BSWAP32(pIV32[1]); + pCtx->LFSR_S[9] ^= BSWAP32(pIV32[0]); + + /* FSM initialialization */ + FSM2 = 0x0; + FSM3 = 0x0; + FSM4 = 0x0; + R1 = 0x0; + V1 = pCtx->LFSR_S[15]; + + for (i = 0; i < 16; i++) { + /* clock FSM + clock LFSR + clockFSM + clock LFSR */ + L0 = pCtx->LFSR_S[0]; + L1 = pCtx->LFSR_S[1]; + V0 = pCtx->LFSR_S[2]; + F0 = V1 + R1; /** (s15 + R1) **/ + V1 = pCtx->LFSR_S[3]; + V0 ^= snow3g_table_A_mul[L0 >> 24]; /* MUL(s0,0 ) */ + F0 ^= FSM2; /** (s15 + R1) ^ R2 **/ + V1 ^= snow3g_table_A_mul[L1 >> 24]; + L11 = pCtx->LFSR_S[11]; + L12 = pCtx->LFSR_S[12]; + R0 = FSM3 ^ pCtx->LFSR_S[5]; /*** (R3 ^ s5 ) ***/ + V0 ^= snow3g_table_A_div[L11 & 0xff]; /* DIV(s11,3 )*/ + R0 += FSM2; /*** R2 + (R3 ^ s5 ) ***/ + V1 ^= snow3g_table_A_div[L12 & 0xff]; + V0 ^= L0 << 8; /* (s0,1 || s0,2 || s0,3 || 0x00) */ + V1 ^= L1 << 8; + V0 ^= L11 >> 8; /* (0x00 || s11,0 || s11,1 || s11,2 ) */ + V1 ^= L12 >> 8; + S1_S2_S3_1(FSM3, FSM2, R1, FSM4, R0); + V0 ^= F0; /* ^F */ + R1 = FSM3 ^ pCtx->LFSR_S[6]; + F1 = V0 + R0; + F1 ^= FSM2; + R1 += FSM2; + FSM3 = Snow3g_S2(FSM2); + FSM2 = FSM4; + V1 ^= F1; + + /* shift LFSR twice */ + ShiftTwiceLFSR_1(pCtx); + + pCtx->LFSR_S[14] = V0; + pCtx->LFSR_S[15] = V1; + } + + /* set FSM into scheduling structure */ + pCtx->FSM_R3 = FSM3; + pCtx->FSM_R2 = FSM2; + pCtx->FSM_R1 = R1; +} + +/** + ******************************************************************************* + * @description + * This function generates 5 words of keystream used in the initial stages + * of snow3g F9. 
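+ * The five 32-bit words correspond to the keystream words z1..z5 consumed by
+ * the SNOW 3G f9 (UIA2) integrity algorithm; the first FSM output generated
+ * after state initialization (XX below) is computed only to clock the cipher
+ * and is discarded.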
+ * + * @param[in] pCtx Context where the scheduled + *keys are stored + * @param[in/out] pKeyStream Pointer to the generated keystream + * + ******************************************************************************/ +static inline void snow3g_f9_keystream_words(snow3gKeyState1_t *pCtx, + uint32_t *pKeyStream) +{ + uint32_t F, XX; + int i; + + ClockFSM_1(pCtx, &XX); + ClockLFSR_1(pCtx); + + for (i = 0; i < 5; i++) { + ClockFSM_1(pCtx, &F); + pKeyStream[i] = F ^ pCtx->LFSR_S[0]; + ClockLFSR_1(pCtx); + } +} + +#endif /* _SNOW3G_INTERNAL_H_ */ diff --git a/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm b/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm new file mode 100644 index 000000000..fed12cf4b --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm @@ -0,0 +1,218 @@ +;; +;; Copyright (c) 2012-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%ifndef _TRANSPOSE_AVX2_ASM_ +%define _TRANSPOSE_AVX2_ASM_ + +%include "include/reg_sizes.asm" + +; LOAD ALL 8 LANES FOR 8x8 32-BIT TRANSPOSE +; +; r0-r7 [out] ymm registers which will contain the data to be transposed +; addr0-addr7 [in] pointers to the next 32-byte block of data to be fetch for all 8 lanes +; ptr_offset [in] offset to be applied on all pointers (addr0-addr7) +%macro TRANSPOSE8_U32_LOAD8 17 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%addr0 %9 +%define %%addr1 %10 +%define %%addr2 %11 +%define %%addr3 %12 +%define %%addr4 %13 +%define %%addr5 %14 +%define %%addr6 %15 +%define %%addr7 %16 +%define %%ptr_offset %17 + +; Expected output data +; +; r0 = {e3 e2 e1 e0 a3 a2 a1 a0} +; r1 = {f3 f2 f1 f0 b3 b2 b1 b0} +; r2 = {g3 g2 g1 g0 c3 c2 c1 c0} +; r3 = {h3 h2 h1 h0 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 a7 a6 a5 a4} +; r5 = {f7 f6 f5 f4 b7 b6 b5 b4} +; r6 = {g7 g6 g5 g4 c7 c6 c5 c4} +; r7 = {h7 h6 h5 h4 d7 d6 d5 d4} + + vmovups XWORD(%%r0),[%%addr0+%%ptr_offset] + vmovups XWORD(%%r1),[%%addr1+%%ptr_offset] + vmovups XWORD(%%r2),[%%addr2+%%ptr_offset] + vmovups XWORD(%%r3),[%%addr3+%%ptr_offset] + vmovups XWORD(%%r4),[%%addr0+%%ptr_offset+16] + vmovups XWORD(%%r5),[%%addr1+%%ptr_offset+16] + vmovups XWORD(%%r6),[%%addr2+%%ptr_offset+16] + vmovups XWORD(%%r7),[%%addr3+%%ptr_offset+16] + + vinserti128 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01 + vinserti128 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01 + vinserti128 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01 + vinserti128 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01 + vinserti128 %%r4, %%r4, [%%addr4+%%ptr_offset+16], 0x01 + vinserti128 %%r5, %%r5, [%%addr5+%%ptr_offset+16], 0x01 + vinserti128 %%r6, %%r6, [%%addr6+%%ptr_offset+16], 0x01 + vinserti128 %%r7, %%r7, [%%addr7+%%ptr_offset+16], 0x01 + +%endmacro + +; 8x8 32-BIT TRANSPOSE +; +; Before calling this macro, TRANSPOSE8_U32_LOAD8 must be called. +; +; r0-r3 [in/out] ymm registers containing bytes 0-15 of each 32B block (e.g. ymm0 = [e3-e0 a3-a0]) +; r4-r7 [in/out] ymm registers containing bytes 16-31 of each 32B block (e.g. 
ymm4 = [e4-e7 a4-a7]) +; t0-t1 [clobbered] ymm temporary registers +%macro TRANSPOSE8_U32 10 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 +; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {e3 e2 e1 e0 a3 a2 a1 a0} +; r1 = {f3 f2 f1 f0 b3 b2 b1 b0} +; r2 = {g3 g2 g1 g0 c3 c2 c1 c0} +; r3 = {h3 h2 h1 h0 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 a7 a6 a5 a4} +; r5 = {f7 f6 f5 f4 b7 b6 b5 b4} +; r6 = {g7 g6 g5 g4 c7 c6 c5 c4} +; r7 = {h7 h6 h5 h4 d7 d6 d5 d4} +; +; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} +; + ; process top half (r0..r3) + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {f1 f0 e1 e0 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {f3 f2 e3 e2 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {h1 h0 g1 g0 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {h3 h2 g3 g2 d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} + vshufps %%r2, %%r0, %%r2, 0x88 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} + vshufps %%r0, %%t0, %%t1, 0x88 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} + + ;; process bottom half (r4..r7) + vshufps %%t0, %%r4, %%r5, 0x44 ; t0 = {f5 f4 e5 e4 b5 b4 a5 a4} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 b7 b6 a7 a6} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 d5 d4 c5 c4} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 d7 d6 c7 c6} + + vshufps %%r5, %%t0, %%t1, 0xDD ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} + vshufps %%r7, %%r4, %%r6, 0xDD ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} + vshufps %%r6, %%r4, %%r6, 0x88 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} + vshufps %%r4, %%t0, %%t1, 0x88 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +%endmacro + +; LOAD ALL 4 LANES FOR 4x4 64-BIT TRANSPOSE +; +; r0-r3 [out] ymm registers which will contain the data to be transposed +; addr0-addr3 [in] pointers to the next 32-byte block of data to be fetch for the 4 lanes +; ptr_offset [in] offset to be applied on all pointers (addr0-addr3) +%macro TRANSPOSE4_U64_LOAD4 9 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%addr0 %5 +%define %%addr1 %6 +%define %%addr2 %7 +%define %%addr3 %8 +%define %%ptr_offset %9 + +; Expected output data +; +; r0 = {c1 c0 a1 a0} +; r1 = {d1 d0 b1 b0} +; r2 = {c3 c2 a3 a2} +; r3 = {d3 d2 b3 b2} + + vmovupd XWORD(%%r0),[%%addr0+%%ptr_offset] + vmovupd XWORD(%%r1),[%%addr1+%%ptr_offset] + vmovupd XWORD(%%r2),[%%addr0+%%ptr_offset+16] + vmovupd XWORD(%%r3),[%%addr1+%%ptr_offset+16] + + vinserti128 %%r0, %%r0, [%%addr2+%%ptr_offset], 0x01 + vinserti128 %%r1, %%r1, [%%addr3+%%ptr_offset], 0x01 + vinserti128 %%r2, %%r2, [%%addr2+%%ptr_offset+16], 0x1 + vinserti128 %%r3, %%r3, [%%addr3+%%ptr_offset+16], 0x01 + +%endmacro + +; 4x4 64-BIT TRANSPOSE +; +; Before calling this macro, TRANSPOSE4_U64_LOAD4 must be called. 
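+;
+; Illustrative usage sketch (the register choices and zero offset below are
+; only an example, not taken from any particular caller in this library):
+;      TRANSPOSE4_U64_LOAD4 ymm0, ymm1, ymm2, ymm3, rax, rbx, rcx, rdx, 0
+;      TRANSPOSE4_U64       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
+;      ; transposed row 0 is now in ymm4, row 1 in ymm1, row 2 in ymm5, row 3 in ymm3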
+;
+; This macro takes 4 registers as input (r0-r3)
+; and transposes their content (64-bit elements),
+; outputting the data in registers (o0,r1,o2,r3),
+; using two additional registers
+%macro TRANSPOSE4_U64 6
+%define %%r0 %1 ; [in] ymm register for row 0 input (c0-c1 a1-a0)
+%define %%r1 %2 ; [in/out] ymm register for row 1 input (d0-d1 b1-b0) and output
+%define %%r2 %3 ; [in] ymm register for row 2 input (c3-c2 a3-a2)
+%define %%r3 %4 ; [in/out] ymm register for row 3 input (d3-d2 b3-b2) and output
+%define %%o0 %5 ; [out] ymm register for row 0 output
+%define %%o2 %6 ; [out] ymm register for row 2 output
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {c1 c0 a1 a0}
+; r1 = {d1 d0 b1 b0}
+; r2 = {c3 c2 a3 a2}
+; r3 = {d3 d2 b3 b2}
+;
+; output looks like: {o0 r1 o2 r3}
+; o0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; o2 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+        ; vshufps does not cross the mid-way boundary and hence is cheaper
+        vshufps %%o0, %%r0, %%r1, 0x44 ; o0 = {d0 c0 b0 a0}
+        vshufps %%r1, %%r0, %%r1, 0xEE ; r1 = {d1 c1 b1 a1}
+
+        vshufps %%o2, %%r2, %%r3, 0x44 ; o2 = {d2 c2 b2 a2}
+        vshufps %%r3, %%r2, %%r3, 0xEE ; r3 = {d3 c3 b3 a3}
+%endmacro
+
+%endif ;; _TRANSPOSE_AVX2_ASM_
diff --git a/src/spdk/intel-ipsec-mb/include/transpose_avx512.asm b/src/spdk/intel-ipsec-mb/include/transpose_avx512.asm
new file mode 100644
index 000000000..6937ceb00
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/transpose_avx512.asm
@@ -0,0 +1,497 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;; + +%ifndef _TRANSPOSE_AVX512_ASM_ +%define _TRANSPOSE_AVX512_ASM_ + +%include "include/reg_sizes.asm" + +section .data +default rel +align 64 +PSHUFFLE_TRANSPOSE_MASK1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 0x000000000000000D + +align 64 +PSHUFFLE_TRANSPOSE_MASK2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + + +; LOAD FIRST 8 LANES FOR 16x16 32-BIT TRANSPOSE +; +; r0-r15 [out] zmm registers which will contain the data to be transposed +; addr0-addr7 [in] pointers to the next 64-byte block of data to be fetch for the first 8 lanes +; ptr_offset [in] offset to be applied on all pointers (addr0-addr7) +%macro TRANSPOSE16_U32_LOAD_FIRST8 25 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define %%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%addr0 %17 +%define %%addr1 %18 +%define %%addr2 %19 +%define %%addr3 %20 +%define %%addr4 %21 +%define %%addr5 %22 +%define %%addr6 %23 +%define %%addr7 %24 +%define %%ptr_offset %25 + +; Expected output data +; +; r0 = {X X X X X X X X a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {X X X X X X X X b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {X X X X X X X X c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {X X X X X X X X d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {X X X X X X X X e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {X X X X X X X X f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {X X X X X X X X g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {X X X X X X X X h7 h6 h5 h4 h3 h2 h1 h0} +; r8 = {X X X X X X X X a15 a14 a13 a12 a11 a10 a9 a8} +; r9 = {X X X X X X X X b15 b14 b13 b12 b11 b10 b9 b8} +; r10 = {X X X X X X X X c15 c14 c13 c12 c11 c10 c9 c8} +; r11 = {X X X X X X X X d15 d14 d13 d12 d11 d10 d9 d8} +; r12 = {X X X X X X X X e15 e14 e13 e12 e11 e10 e9 e8} +; r13 = {X X X X X X X X f15 f14 f13 f12 f11 f10 f9 f8} +; r14 = {X X X X X X X X g15 g14 g13 g12 g11 g10 g9 g8} +; r15 = {X X X X X X X X h15 h14 h13 h12 h11 h10 h9 h8} + vmovups YWORD(%%r0),[%%addr0+%%ptr_offset] + vmovups YWORD(%%r1),[%%addr1+%%ptr_offset] + vmovups YWORD(%%r2),[%%addr2+%%ptr_offset] + vmovups YWORD(%%r3),[%%addr3+%%ptr_offset] + vmovups YWORD(%%r4),[%%addr4+%%ptr_offset] + vmovups YWORD(%%r5),[%%addr5+%%ptr_offset] + vmovups YWORD(%%r6),[%%addr6+%%ptr_offset] + vmovups YWORD(%%r7),[%%addr7+%%ptr_offset] + vmovups YWORD(%%r8),[%%addr0+%%ptr_offset+32] + vmovups YWORD(%%r9),[%%addr1+%%ptr_offset+32] + vmovups YWORD(%%r10),[%%addr2+%%ptr_offset+32] + vmovups YWORD(%%r11),[%%addr3+%%ptr_offset+32] + vmovups YWORD(%%r12),[%%addr4+%%ptr_offset+32] + vmovups YWORD(%%r13),[%%addr5+%%ptr_offset+32] + vmovups YWORD(%%r14),[%%addr6+%%ptr_offset+32] + vmovups YWORD(%%r15),[%%addr7+%%ptr_offset+32] + +%endmacro + +; LOAD LAST 8 LANES FOR 16x16 32-BIT TRANSPOSE +; +; r0-r15 [in/out] zmm registers which will contain the data to be transposed +; addr0-addr7 [in] pointers to the next 64-byte block of data to be fetch for the last 8 lanes +; ptr_offset [in] offset to be applied on all pointers (addr0-addr7) +%macro TRANSPOSE16_U32_LOAD_LAST8 25 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define 
%%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%addr0 %17 +%define %%addr1 %18 +%define %%addr2 %19 +%define %%addr3 %20 +%define %%addr4 %21 +%define %%addr5 %22 +%define %%addr6 %23 +%define %%addr7 %24 +%define %%ptr_offset %25 + +; Expected output data +; +; r0 = {i7 i6 i5 i4 i3 i2 i1 i0 a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {j7 j6 j5 j4 j3 j2 j1 j0 b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {k7 k6 k5 k4 k3 k2 k1 k0 c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {l7 l6 l5 l4 l3 l2 l1 l0 d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {m7 m6 m5 m4 m3 m2 m1 m0 e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {n7 n6 n5 n4 n3 n2 n1 n0 f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {o7 o6 o5 o4 o3 o2 o1 o0 g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {p7 p6 p5 p4 p3 p2 p1 p0 h7 h6 h5 h4 h3 h2 h1 h0} +; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 a15 a14 a13 a12 a11 a10 a9 a8} +; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 b15 b14 b13 b12 b11 b10 b9 b8} +; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 c15 c14 c13 c12 c11 c10 c9 c8} +; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 d15 d14 d13 d12 d11 d10 d9 d8} +; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 e15 e14 e13 e12 e11 e10 e9 e8} +; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 f15 f14 f13 f12 f11 f10 f9 f8} +; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 g15 g14 g13 g12 g11 g10 g9 g8} +; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 h15 h14 h13 h12 h11 h10 h9 h8} + + vinserti64x4 %%r0, %%r0, [%%addr0+%%ptr_offset], 0x01 + vinserti64x4 %%r1, %%r1, [%%addr1+%%ptr_offset], 0x01 + vinserti64x4 %%r2, %%r2, [%%addr2+%%ptr_offset], 0x01 + vinserti64x4 %%r3, %%r3, [%%addr3+%%ptr_offset], 0x01 + vinserti64x4 %%r4, %%r4, [%%addr4+%%ptr_offset], 0x01 + vinserti64x4 %%r5, %%r5, [%%addr5+%%ptr_offset], 0x01 + vinserti64x4 %%r6, %%r6, [%%addr6+%%ptr_offset], 0x01 + vinserti64x4 %%r7, %%r7, [%%addr7+%%ptr_offset], 0x01 + vinserti64x4 %%r8, %%r8, [%%addr0+%%ptr_offset+32], 0x01 + vinserti64x4 %%r9, %%r9, [%%addr1+%%ptr_offset+32], 0x01 + vinserti64x4 %%r10, %%r10, [%%addr2+%%ptr_offset+32], 0x01 + vinserti64x4 %%r11, %%r11, [%%addr3+%%ptr_offset+32], 0x01 + vinserti64x4 %%r12, %%r12, [%%addr4+%%ptr_offset+32], 0x01 + vinserti64x4 %%r13, %%r13, [%%addr5+%%ptr_offset+32], 0x01 + vinserti64x4 %%r14, %%r14, [%%addr6+%%ptr_offset+32], 0x01 + vinserti64x4 %%r15, %%r15, [%%addr7+%%ptr_offset+32], 0x01 + +%endmacro + +; 16x16 32-BIT TRANSPOSE +; +; Before calling this macro, TRANSPOSE16_U32_LOAD_FIRST8 and TRANSPOSE16_U32_LOAD_LAST8 +; must be called. +; +; r0-r7 [in/out] zmm registers containing bytes 0-31 of each 64B block (e.g. zmm0 = [i7-i0 a7-a0]) +; r8-r15 [in/out] zmm registers containing bytes 32-63 of each 64B block (e.g. 
zmm8 = [i15-i8 a15-a8]) +; t0-t1 [clobbered] zmm temporary registers +; m0-m1 [clobbered] zmm registers for shuffle mask storing +%macro TRANSPOSE16_U32 20 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define %%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%t0 %17 +%define %%t1 %18 +%define %%m0 %19 +%define %%m1 %20 + +; Input data +; +; r0 = {i7 i6 i5 i4 i3 i2 i1 i0 a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {j7 j6 j5 j4 j3 j2 j1 j0 b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {k7 k6 k5 k4 k3 k2 k1 k0 c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {l7 l6 l5 l4 l3 l2 l1 l0 d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {m7 m6 m5 m4 m3 m2 m1 m0 e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {n7 n6 n5 n4 n3 n2 n1 n0 f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {o7 o6 o5 o4 o3 o2 o1 o0 g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {p7 p6 p5 p4 p3 p2 p1 p0 h7 h6 h5 h4 h3 h2 h1 h0} +; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 a15 a14 a13 a12 a11 a10 a9 a8} +; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 b15 b14 b13 b12 b11 b10 b9 b8} +; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 c15 c14 c13 c12 c11 c10 c9 c8} +; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 d15 d14 d13 d12 d11 d10 d9 d8} +; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 e15 e14 e13 e12 e11 e10 e9 e8} +; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 f15 f14 f13 f12 f11 f10 f9 f8} +; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 g15 g14 g13 g12 g11 g10 g9 g8} +; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 h15 h14 h13 h12 h11 h10 h9 h8} + +; Expected output data +; +; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} +; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} +; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} +; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} +; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} +; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} +; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} +; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} +; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + + + ; process first 4 rows (r0..r3) + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {j5 j4 i5 i4 j1 j0 i1 i0 b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {j7 j6 i7 i6 j3 j2 i3 i2 b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {l5 l4 k5 k4 l1 l0 k1 k0 d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {l7 l6 k7 k6 l3 l2 k3 k2 d7 d6 c7 c6 d3 d2 c3 c2} + + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {l5 k5 j5 i5 l1 k1 j1 i1 d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {l6 k6 j6 i6 l2 k2 j2 i2 d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {l7 k7 j7 i7 l3 k3 j3 i3 d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {l4 k4 j4 i4 l0 k0 j0 i0 d4 c4 b4 a4 d0 c0 b0 a0} + + ; Load permute masks + vmovdqa64 %%m0, [PSHUFFLE_TRANSPOSE_MASK1] + vmovdqa64 %%m1, [PSHUFFLE_TRANSPOSE_MASK2] + + ; process second 4 rows (r4..r7) + vshufps %%r2, %%r4, 
%%r5, 0x44 ; r2 = {n5 n4 m5 m4 n1 n0 m1 m0 f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {n7 n6 m7 m6 n3 n2 m3 m2 f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {p5 p4 o5 o4 p1 p0 o1 o0 h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {p7 p6 o7 o6 p3 p2 o3 o2 h7 h6 g7 g6 h3 h2 g3 g2} + + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {p5 o5 n5 m5 p1 o1 n1 m1 h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {p6 o6 n6 m6 p2 o2 n2 m2 h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {p7 o7 n7 m7 p3 o3 n3 m3 h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {p4 o4 n4 m4 p0 o0 n0 m0 h4 g4 f4 e4 h0 g0 f0 e0} + + ; process third 4 rows (r8..r11) + vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 b13 b12 a13 a12 b9 b8 a9 a8 } + vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 b15 b14 a15 a14 b11 b10 a11 a10} + vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 d13 d12 c13 c12 d9 d8 c9 c8 } + vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 d15 d14 c15 c14 d11 d10 c11 c10} + + vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 d13 c13 b13 a13 d9 c9 b9 a9 } + vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 d14 c14 b14 a14 d10 c10 b10 a10} + vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 d15 c15 b15 a15 d11 c11 b11 a11} + vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 d12 c12 b12 a12 d8 c8 b8 a8 } + + ; process fourth 4 rows (r12..r15) + vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 f13 f12 e13 e12 f9 f8 e9 e8 } + vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 f15 f14 e15 e14 f11 f10 e11 e10} + vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 h13 h12 g13 g12 h9 h8 g9 g8 } + vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 h15 h14 g15 g14 h11 h10 g11 g10} + + vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 h13 g13 f13 e13 h9 g9 f9 e9 } + vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 h14 g14 f14 e14 h10 g10 f10 e10} + vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 h15 g15 f15 e15 h11 g11 f11 e11} + vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 h12 g12 f12 e12 h8 g8 f8 e8 } + + ; perform final shuffles on bottom half, producing r8-r15 + vmovdqu32 %%t1, %%m0 + vpermi2q %%t1, %%r9, %%r13 ; t1 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} + vmovdqu32 %%r14, %%m1 + vpermi2q %%r14, %%r9, %%r13 ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} + + vmovdqu32 %%r9, %%m0 + vpermi2q %%r9, %%r11, %%r15 ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} + vmovdqu32 %%r13, %%m1 + vpermi2q %%r13, %%r11, %%r15 ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} + + vmovdqu32 %%r11, %%m0 + vpermi2q %%r11, %%r8, %%r12 ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} + vmovdqu32 %%r15, %%m1 + vpermi2q %%r15, %%r8, %%r12 ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + + vmovdqu32 %%r8, %%m0 + vpermi2q %%r8, %%r6, %%r10 ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} + vmovdqu32 %%r12, %%m1 + vpermi2q %%r12, %%r6, %%r10 ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} + + vmovdqu32 %%r10, %%t1 ; r10 = {p10 o10 
n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} + + ; perform final shuffles on top half, producing r0-r7 + vmovdqu32 %%t1, %%m0 + vpermi2q %%t1, %%r1, %%r5 ; t1 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqu32 %%r6, %%m1 + vpermi2q %%r6, %%r1, %%r5 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + + vmovdqu32 %%r1, %%m0 + vpermi2q %%r1, %%r3, %%r7 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} + vmovdqu32 %%r5, %%m1 + vpermi2q %%r5, %%r3, %%r7 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} + + vmovdqu32 %%r3, %%m0 + vpermi2q %%r3, %%r0, %%r4 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} + vmovdqu32 %%r7, %%m1 + vpermi2q %%r7, %%r0, %%r4 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} + + vmovdqu32 %%r0, %%m0 + vpermi2q %%r0, %%t0, %%r2 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} + vmovdqu32 %%r4, %%m1 + vpermi2q %%r4, %%t0, %%r2 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} + + vmovdqu32 %%r2, %%t1 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + +%endmacro + +; LOAD ALL 8 LANES FOR 8x8 64-BIT TRANSPOSE +; +; r0-r7 [out] zmm registers which will contain the data to be transposed +; addr0-addr7 [in] pointers to the next 64-byte block of data to be fetch for all 8 lanes +; ptr_offset [in] offset to be applied on all pointers (addr0-addr7) +%macro TRANSPOSE8_U64_LOAD8 17 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%addr0 %9 +%define %%addr1 %10 +%define %%addr2 %11 +%define %%addr3 %12 +%define %%addr4 %13 +%define %%addr5 %14 +%define %%addr6 %15 +%define %%addr7 %16 +%define %%ptr_offset %17 + +; Expected output data +; +; r0 = {e3 e2 e1 e0 a3 a2 a1 a0} +; r1 = {f3 f2 f1 f0 b3 b2 b1 b0} +; r2 = {g3 g2 g1 g0 c3 c2 c1 c0} +; r3 = {h3 h2 h1 h0 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 a7 a6 a5 a4} +; r5 = {f7 f6 f5 f4 b7 b6 b5 b4} +; r6 = {g7 g6 g5 g4 c7 c6 c5 c4} +; r7 = {h7 h6 h5 h4 d7 d6 d5 d4} + + vmovups YWORD(%%r0),[%%addr0+%%ptr_offset] + vmovups YWORD(%%r1),[%%addr1+%%ptr_offset] + vmovups YWORD(%%r2),[%%addr2+%%ptr_offset] + vmovups YWORD(%%r3),[%%addr3+%%ptr_offset] + vmovups YWORD(%%r4),[%%addr0+%%ptr_offset+32] + vmovups YWORD(%%r5),[%%addr1+%%ptr_offset+32] + vmovups YWORD(%%r6),[%%addr2+%%ptr_offset+32] + vmovups YWORD(%%r7),[%%addr3+%%ptr_offset+32] + + vinserti64x4 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01 + vinserti64x4 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01 + vinserti64x4 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01 + vinserti64x4 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01 + vinserti64x4 %%r4, %%r4, [%%addr4+%%ptr_offset+32], 0x01 + vinserti64x4 %%r5, %%r5, [%%addr5+%%ptr_offset+32], 0x01 + vinserti64x4 %%r6, %%r6, [%%addr6+%%ptr_offset+32], 0x01 + vinserti64x4 %%r7, %%r7, [%%addr7+%%ptr_offset+32], 0x01 + +%endmacro + +; 8x8 64-BIT TRANSPOSE +; +; Before calling this macro, TRANSPOSE8_U64_LOAD8 must be called. +; +; r0-r3 [in/out] zmm registers containing bytes 0-31 of each 64B block (e.g. zmm0 = [e3-e0 a3-a0]) +; r4-r7 [in/out] zmm registers containing bytes 32-63 of each 64B block (e.g. 
zmm4 = [e4-e7 a4-a7]) +; t0-t1 [clobbered] zmm temporary registers +; PERM_INDEX1-2 [clobbered] zmm registers for shuffle mask storing +%macro TRANSPOSE8_U64 12 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 +%define %%PERM_INDEX1 %11 +%define %%PERM_INDEX2 %12 + +; each x(i) is 64 bits, 8 * 64 = 512 ==> a full digest length, 64-bit double precision quantities + +; Input data +; +; r0 = {e3 e2 e1 e0 a3 a2 a1 a0} +; r1 = {f3 f2 f1 f0 b3 b2 b1 b0} +; r2 = {g3 g2 g1 g0 c3 c2 c1 c0} +; r3 = {h3 h2 h1 h0 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 a7 a6 a5 a4} +; r5 = {f7 f6 f5 f4 b7 b6 b5 b4} +; r6 = {g7 g6 g5 g4 c7 c6 c5 c4} +; r7 = {h7 h6 h5 h4 d7 d6 d5 d4} +; +; Expected output data +; +; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} + + ;; ;;; will not get clobbered + vmovdqa32 %%PERM_INDEX1, [PSHUFFLE_TRANSPOSE_MASK1] ; temp + vmovdqa32 %%PERM_INDEX2, [PSHUFFLE_TRANSPOSE_MASK2] ; temp + + ; process top half (r0..r3) + vshufpd %%t0, %%r0, %%r1, 0x00 ; t0 = {f2 e2 f0 e0 b2 a2 b0 a0} + vshufpd %%r1, %%r0, %%r1, 0xFF ; r0 = {f3 e3 f1 e1 b3 a3 b1 a1} + vshufpd %%t1, %%r2, %%r3, 0x00 ; t1 = {h2 g2 h0 g0 d2 c2 d0 c0} + vshufpd %%r2, %%r2, %%r3, 0xFF ; r2 = {h3 g3 h1 g1 d3 c3 d1 c1} + + vmovdqa32 %%r3, %%r1 + vpermt2q %%r1, %%PERM_INDEX1,%%r2 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} + vpermt2q %%r3, %%PERM_INDEX2,%%r2 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} + + vmovdqa32 %%r0, %%t0 + vmovdqa32 %%r2, %%t0 + vpermt2q %%r0, %%PERM_INDEX1,%%t1 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} + vpermt2q %%r2, %%PERM_INDEX2,%%t1 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} + + ; process top bottom (r4..r7) + vshufpd %%t0, %%r4, %%r5, 0x00 ; t0 = {f6 e6 f4 e4 b6 a6 b4 a4} + vshufpd %%r5, %%r4, %%r5, 0xFF ; r0 = {f7 e7 f5 e5 b7 a7 b5 a5} + vshufpd %%t1, %%r6, %%r7, 0x00 ; t1 = {h6 g6 h4 g4 d6 c6 d4 c4} + vshufpd %%r6, %%r6, %%r7, 0xFF ; r2 = {h7 g7 h5 g5 d7 c7 d5 c5} + + vmovdqa32 %%r7, %%r5 + vpermt2q %%r5, %%PERM_INDEX1,%%r6 ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} + vpermt2q %%r7, %%PERM_INDEX2,%%r6 ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} + + vmovdqa32 %%r4, %%t0 + vmovdqa32 %%r6, %%t0 + vpermt2q %%r4, %%PERM_INDEX1,%%t1 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} + vpermt2q %%r6, %%PERM_INDEX2,%%t1 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +%endmacro + +%endif ;; _TRANSPOSE_AVX512_ASM_ diff --git a/src/spdk/intel-ipsec-mb/include/wireless_common.asm b/src/spdk/intel-ipsec-mb/include/wireless_common.asm new file mode 100644 index 000000000..811c2c256 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/wireless_common.asm @@ -0,0 +1,128 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" + +section .data +default rel +align 16 +swap_mask: +db 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04 +db 0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c + +section .text + +; Function which XOR's 64 bytes of the input buffer with 64 bytes of the +; KeyStream, placing the result in the output buffer. +; KeyStream bytes must be swapped on 32 bit boundary before this operation +%macro xor_keystream 1 +%define %%SIMDTYPE %1 ; "SSE" or "AVX" + +%ifidn %%SIMDTYPE, AVX + %define %%MOVDQU vmovdqu + %define %%MOVDQA vmovdqa + %define %%PXOR vpxor + %define %%PSHUFB vpshufb +%else + %define %%MOVDQU movdqu + %define %%MOVDQA movdqa + %define %%PXOR pxor + %define %%PSHUFB pshufb +%endif +%ifdef LINUX + %define %%pIn rdi + %define %%pOut rsi + %define %%pKS rdx +%else + %define %%pIn rcx + %define %%pOut rdx + %define %%pKS r8 + + mov rax, rsp + sub rsp, 48 + and rsp, ~15 + %%MOVDQA [rsp], xmm6 + %%MOVDQA [rsp + 16], xmm7 + %%MOVDQA [rsp + 32], xmm8 +%endif + %define XKEY0 xmm0 + %define XKEY1 xmm1 + %define XKEY2 xmm2 + %define XKEY3 xmm3 + %define XIN0 xmm4 + %define XIN1 xmm5 + %define XIN2 xmm6 + %define XIN3 xmm7 + %define XSHUF xmm8 + + %%MOVDQA XSHUF, [rel swap_mask] + %%MOVDQA XKEY0, [%%pKS] + %%MOVDQA XKEY1, [%%pKS + 16] + %%MOVDQA XKEY2, [%%pKS + 32] + %%MOVDQA XKEY3, [%%pKS + 48] + + %%PSHUFB XKEY0, XSHUF + %%PSHUFB XKEY1, XSHUF + %%PSHUFB XKEY2, XSHUF + %%PSHUFB XKEY3, XSHUF + + %%MOVDQU XIN0, [%%pIn] + %%MOVDQU XIN1, [%%pIn + 16] + %%MOVDQU XIN2, [%%pIn + 32] + %%MOVDQU XIN3, [%%pIn + 48] + + %%PXOR XKEY0, XIN0 + %%PXOR XKEY1, XIN1 + %%PXOR XKEY2, XIN2 + %%PXOR XKEY3, XIN3 + + %%MOVDQU [%%pOut], XKEY0 + %%MOVDQU [%%pOut + 16], XKEY1 + %%MOVDQU [%%pOut + 32], XKEY2 + %%MOVDQU [%%pOut + 48], XKEY3 + +%ifndef LINUX + %%MOVDQA xmm6, [rsp] + %%MOVDQA xmm7, [rsp + 16] + %%MOVDQA xmm8, [rsp + 32] + mov rsp,rax +%endif +%endmacro + +MKGLOBAL(asm_XorKeyStream64B_avx,function,internal) +asm_XorKeyStream64B_avx: + xor_keystream AVX + ret + +MKGLOBAL(asm_XorKeyStream64B_sse,function,internal) +asm_XorKeyStream64B_sse: + xor_keystream SSE + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/include/wireless_common.h b/src/spdk/intel-ipsec-mb/include/wireless_common.h new file mode 100644 index 000000000..a0ba60019 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/wireless_common.h @@ -0,0 +1,216 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel 
Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef _WIRELESS_COMMON_H_ +#define _WIRELESS_COMMON_H_ + +#include +#ifdef LINUX +#include +#else +#include +#endif + +#define NUM_PACKETS_1 1 +#define NUM_PACKETS_2 2 +#define NUM_PACKETS_3 3 +#define NUM_PACKETS_4 4 +#define NUM_PACKETS_8 8 +#define NUM_PACKETS_16 16 + +#ifdef LINUX +#define BSWAP32 __builtin_bswap32 +#define BSWAP64 __builtin_bswap64 +#else +#define BSWAP32 _byteswap_ulong +#define BSWAP64 _byteswap_uint64 +#endif + +typedef union _m128_u { + uint8_t byte[16]; + uint16_t word[8]; + uint32_t dword[4]; + uint64_t qword[2]; + __m128i m; +} m128_t; + +typedef union _m64_u { + uint8_t byte[8]; + uint16_t word[4]; + uint32_t dword[2]; + uint64_t m; +} m64_t; + +static inline uint32_t bswap4(const uint32_t val) +{ + return ((val >> 24) | /**< A*/ + ((val & 0xff0000) >> 8) | /**< B*/ + ((val & 0xff00) << 8) | /**< C*/ + (val << 24)); /**< D*/ +} + +/************************************************************************* +* @description - this function is used to copy the right number of bytes +* from the source to destination buffer +* +* @param pSrc [IN] - pointer to an input Byte array (at least len bytes +* available) +* @param pDst [IN] - pointer to the output buffer (at least len bytes available) +* @param len [IN] - length in bytes to copy (0 to 4) +* +*************************************************************************/ +static inline void memcpy_keystream_32(uint8_t *pDst, + const uint8_t *pSrc, + const uint32_t len) +{ + switch (len) { + case 4: + *(uint32_t *)pDst = *(const uint32_t *)pSrc; + break; + case 3: + pDst[2] = pSrc[2]; + /* fall-through */ + case 2: + pDst[1] = pSrc[1]; + /* fall-through */ + case 1: + pDst[0] = pSrc[0]; + /* fall-through */ + } +} + +/************************************************************************* +* @description - this function is used to XOR the right number of bytes +* from a keystrea and a source into a destination buffer +* +* @param pSrc [IN] - pointer to an input Byte array (at least 
4 bytes available) +* @param pDst [IN] - pointer to the output buffer (at least 4 bytes available) +* @param KS [IN] - 4 bytes of keystream number, must be reversed +* into network byte order before XOR +* +*************************************************************************/ +static inline void xor_keystream_reverse_32(uint8_t *pDst, + const uint8_t *pSrc, + const uint32_t KS) +{ + *(uint32_t *)pDst = (*(const uint32_t *)pSrc) ^ BSWAP32(KS); +} + +/****************************************************************************** + * @description - this function is used to do a keystream operation + * @param pSrc [IN] - pointer to an input Byte array (at least 8 bytes + * available) + * @param pDst [IN] - pointer to the output buffer (at least 8 bytes available) + * @param keyStream [IN] - the Keystream value (8 bytes) + ******************************************************************************/ +static inline const uint8_t * +xor_keystrm_rev(uint8_t *pDst, const uint8_t *pSrc, uint64_t keyStream) +{ + /* default: XOR ONLY, read the input buffer, update the output buffer */ + const uint64_t *pSrc64 = (const uint64_t *)pSrc; + uint64_t *pDst64 = (uint64_t *)pDst; + *pDst64 = *pSrc64 ^ BSWAP64(keyStream); + return (const uint8_t *)(pSrc64 + 1); +} + +/****************************************************************************** + * @description - this function is used to copy the right number of bytes + * from the source to destination buffer + * @param pSrc [IN] - pointer to an input Byte array (at least len bytes + * available) + * @param pDst [IN] - pointer to the output buffer (at least len bytes + * available) + * @param len [IN] - length in bytes to copy + ******************************************************************************/ +static inline void +memcpy_keystrm(uint8_t *pDst, const uint8_t *pSrc, const uint32_t len) +{ + switch (len) { + case 8: + *(uint64_t *)pDst = *(const uint64_t *)pSrc; + break; + case 7: + pDst[6] = pSrc[6]; + /* fall-through */ + case 6: + pDst[5] = pSrc[5]; + /* fall-through */ + case 5: + pDst[4] = pSrc[4]; + /* fall-through */ + case 4: + *(uint32_t *)pDst = *(const uint32_t *)pSrc; + break; + case 3: + pDst[2] = pSrc[2]; + /* fall-through */ + case 2: + pDst[1] = pSrc[1]; + /* fall-through */ + case 1: + pDst[0] = pSrc[0]; + /* fall-through */ + } +} + +/** + ****************************************************************************** + * + * @description + * Definition of the external SSE function that XOR's 64 bytes of input + * with 64 bytes of keystream, swapping keystream bytes every 4 bytes. + * + * @param[in] pIn Pointer to the input buffer + * @param[out] pOut Pointer to the output buffer + * @param[in] pKey Pointer to the new 64 byte keystream + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_XorKeyStream64B_sse(const void *pIn, void *pOut, + const void *pKey); + +/** + ****************************************************************************** + * + * @description + * Definition of the external AVX function that XOR's 64 bytes of input + * with 64 bytes of keystream, swapping keystream bytes every 4 bytes. 
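+ * Internally, each 32-bit word of the keystream is byte-reversed (see the
+ * swap_mask constant in wireless_common.asm) before it is XOR'ed with the
+ * corresponding input word.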
+ * + * @param[in] pIn Pointer to the input buffer + * @param[out] pOut Pointer to the output buffer + * @param[in] pKey Pointer to the new 64 byte keystream + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_XorKeyStream64B_avx(const void *pIn, void *pOut, + const void *pKey); + +#endif /* _WIRELESS_COMMON_H_ */ diff --git a/src/spdk/intel-ipsec-mb/include/zuc_common.asm b/src/spdk/intel-ipsec-mb/include/zuc_common.asm new file mode 100644 index 000000000..4b9cdd3ec --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/zuc_common.asm @@ -0,0 +1,740 @@ +;; +;; Copyright (c) 2009-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "include/os.asm" +%include "include/reg_sizes.asm" + +extern lookup_8bit_sse + + +section .data +default rel +align 64 +S0: +db 0x3e,0x72,0x5b,0x47,0xca,0xe0,0x00,0x33,0x04,0xd1,0x54,0x98,0x09,0xb9,0x6d,0xcb +db 0x7b,0x1b,0xf9,0x32,0xaf,0x9d,0x6a,0xa5,0xb8,0x2d,0xfc,0x1d,0x08,0x53,0x03,0x90 +db 0x4d,0x4e,0x84,0x99,0xe4,0xce,0xd9,0x91,0xdd,0xb6,0x85,0x48,0x8b,0x29,0x6e,0xac +db 0xcd,0xc1,0xf8,0x1e,0x73,0x43,0x69,0xc6,0xb5,0xbd,0xfd,0x39,0x63,0x20,0xd4,0x38 +db 0x76,0x7d,0xb2,0xa7,0xcf,0xed,0x57,0xc5,0xf3,0x2c,0xbb,0x14,0x21,0x06,0x55,0x9b +db 0xe3,0xef,0x5e,0x31,0x4f,0x7f,0x5a,0xa4,0x0d,0x82,0x51,0x49,0x5f,0xba,0x58,0x1c +db 0x4a,0x16,0xd5,0x17,0xa8,0x92,0x24,0x1f,0x8c,0xff,0xd8,0xae,0x2e,0x01,0xd3,0xad +db 0x3b,0x4b,0xda,0x46,0xeb,0xc9,0xde,0x9a,0x8f,0x87,0xd7,0x3a,0x80,0x6f,0x2f,0xc8 +db 0xb1,0xb4,0x37,0xf7,0x0a,0x22,0x13,0x28,0x7c,0xcc,0x3c,0x89,0xc7,0xc3,0x96,0x56 +db 0x07,0xbf,0x7e,0xf0,0x0b,0x2b,0x97,0x52,0x35,0x41,0x79,0x61,0xa6,0x4c,0x10,0xfe +db 0xbc,0x26,0x95,0x88,0x8a,0xb0,0xa3,0xfb,0xc0,0x18,0x94,0xf2,0xe1,0xe5,0xe9,0x5d +db 0xd0,0xdc,0x11,0x66,0x64,0x5c,0xec,0x59,0x42,0x75,0x12,0xf5,0x74,0x9c,0xaa,0x23 +db 0x0e,0x86,0xab,0xbe,0x2a,0x02,0xe7,0x67,0xe6,0x44,0xa2,0x6c,0xc2,0x93,0x9f,0xf1 +db 0xf6,0xfa,0x36,0xd2,0x50,0x68,0x9e,0x62,0x71,0x15,0x3d,0xd6,0x40,0xc4,0xe2,0x0f +db 0x8e,0x83,0x77,0x6b,0x25,0x05,0x3f,0x0c,0x30,0xea,0x70,0xb7,0xa1,0xe8,0xa9,0x65 +db 0x8d,0x27,0x1a,0xdb,0x81,0xb3,0xa0,0xf4,0x45,0x7a,0x19,0xdf,0xee,0x78,0x34,0x60 + +S1: +db 0x55,0xc2,0x63,0x71,0x3b,0xc8,0x47,0x86,0x9f,0x3c,0xda,0x5b,0x29,0xaa,0xfd,0x77 +db 0x8c,0xc5,0x94,0x0c,0xa6,0x1a,0x13,0x00,0xe3,0xa8,0x16,0x72,0x40,0xf9,0xf8,0x42 +db 0x44,0x26,0x68,0x96,0x81,0xd9,0x45,0x3e,0x10,0x76,0xc6,0xa7,0x8b,0x39,0x43,0xe1 +db 0x3a,0xb5,0x56,0x2a,0xc0,0x6d,0xb3,0x05,0x22,0x66,0xbf,0xdc,0x0b,0xfa,0x62,0x48 +db 0xdd,0x20,0x11,0x06,0x36,0xc9,0xc1,0xcf,0xf6,0x27,0x52,0xbb,0x69,0xf5,0xd4,0x87 +db 0x7f,0x84,0x4c,0xd2,0x9c,0x57,0xa4,0xbc,0x4f,0x9a,0xdf,0xfe,0xd6,0x8d,0x7a,0xeb +db 0x2b,0x53,0xd8,0x5c,0xa1,0x14,0x17,0xfb,0x23,0xd5,0x7d,0x30,0x67,0x73,0x08,0x09 +db 0xee,0xb7,0x70,0x3f,0x61,0xb2,0x19,0x8e,0x4e,0xe5,0x4b,0x93,0x8f,0x5d,0xdb,0xa9 +db 0xad,0xf1,0xae,0x2e,0xcb,0x0d,0xfc,0xf4,0x2d,0x46,0x6e,0x1d,0x97,0xe8,0xd1,0xe9 +db 0x4d,0x37,0xa5,0x75,0x5e,0x83,0x9e,0xab,0x82,0x9d,0xb9,0x1c,0xe0,0xcd,0x49,0x89 +db 0x01,0xb6,0xbd,0x58,0x24,0xa2,0x5f,0x38,0x78,0x99,0x15,0x90,0x50,0xb8,0x95,0xe4 +db 0xd0,0x91,0xc7,0xce,0xed,0x0f,0xb4,0x6f,0xa0,0xcc,0xf0,0x02,0x4a,0x79,0xc3,0xde +db 0xa3,0xef,0xea,0x51,0xe6,0x6b,0x18,0xec,0x1b,0x2c,0x80,0xf7,0x74,0xe7,0xff,0x21 +db 0x5a,0x6a,0x54,0x1e,0x41,0x31,0x92,0x35,0xc4,0x33,0x07,0x0a,0xba,0x7e,0x0e,0x34 +db 0x88,0xb1,0x98,0x7c,0xf3,0x3d,0x60,0x6c,0x7b,0xca,0xd3,0x1f,0x32,0x65,0x04,0x28 +db 0x64,0xbe,0x85,0x9b,0x2f,0x59,0x8a,0xd7,0xb0,0x25,0xac,0xaf,0x12,0x03,0xe2,0xf2 + +EK_d: +dw 0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF, +dw 0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +section .text + +%define OFFSET_FR1 (16*4) +%define OFFSET_FR2 (17*4) +%define OFFSET_BRC_X0 (18*4) +%define OFFSET_BRC_X1 (19*4) +%define OFFSET_BRC_X2 (20*4) +%define OFFSET_BRC_X3 (21*4) + +; +; BITS_REORG() +; +; params +; %1 - round number +; uses +; eax, ebx, ecx, edx +; return +; updates r12d, r13d, r14d, r15d +; +%macro BITS_REORG 1 + ; + ; r12d = LFSR_S15 + ; eax = LFSR_S14 + ; r13d = LFSR_S11 + ; ebx = LFSR_S9 + ; r14d = LFSR_S7 + ; ecx = LFSR_S5 + ; r15d = LFSR_S2 + ; edx = 
LFSR_S0 + + mov r12d, [rsi + ((15 + %1) % 16)*4] + mov eax, [rsi + ((14 + %1) % 16)*4] + mov r13d, [rsi + ((11 + %1) % 16)*4] + mov ebx, [rsi + (( 9 + %1) % 16)*4] + mov r14d, [rsi + (( 7 + %1) % 16)*4] + mov ecx, [rsi + (( 5 + %1) % 16)*4] + mov r15d, [rsi + (( 2 + %1) % 16)*4] + mov edx, [rsi + (( 0 + %1) % 16)*4] + + shr r12d, 15 + shl eax, 16 + shl ebx, 1 + shl ecx, 1 + shl edx, 1 + shld r12d, eax, 16 ; BRC_X0 + shld r13d, ebx, 16 ; BRC_X1 + shld r14d, ecx, 16 ; BRC_X2 + shld r15d, edx, 16 ; BRC_X3 +%endmacro + +%macro lookup_single_sbox 3 +%define %%table %1 ; [in] Pointer to table to look up +%define %%idx %2 ; [in] Index to look up +%define %%value %3 ; [out] Returned value from lookup function (rcx, rdx, r8, r9) + +%ifdef SAFE_LOOKUP + ;; Save all registers used in lookup_8bit (xmm0-5, r9,r10) + ;; and registers for param passing and return (4 regs, OS dependent) + ;; (6*16 + 6*8 = 144 bytes) + sub rsp, 144 + + movdqu [rsp], xmm0 + movdqu [rsp + 16], xmm1 + movdqu [rsp + 32], xmm2 + movdqu [rsp + 48], xmm3 + movdqu [rsp + 64], xmm4 + movdqu [rsp + 80], xmm5 + mov [rsp + 96], r9 + mov [rsp + 104], r10 + +%ifdef LINUX + mov [rsp + 112], rdi + mov [rsp + 120], rsi + mov [rsp + 128], rdx + + mov rdi, %%table + mov rsi, %%idx + mov rdx, 256 +%else + mov [rsp + 112], rcx + mov [rsp + 120], rdx + mov [rsp + 128], r8 + mov rcx, %%table + mov rdx, %%idx + mov r8, 256 +%endif + mov [rsp + 136], rax + + call lookup_8bit_sse + + ;; Restore all registers + movdqu xmm0, [rsp] + movdqu xmm1, [rsp + 16] + movdqu xmm2, [rsp + 32] + movdqu xmm3, [rsp + 48] + movdqu xmm4, [rsp + 64] + movdqu xmm5, [rsp + 80] + mov r9, [rsp + 96] + mov r10, [rsp + 104] + +%ifdef LINUX + mov rdi, [rsp + 112] + mov rsi, [rsp + 120] + mov rdx, [rsp + 128] +%else + mov rcx, [rsp + 112] + mov rdx, [rsp + 120] + mov r8, [rsp + 128] +%endif + + ;; Move returned value from lookup function, before restoring rax + mov DWORD(%%value), eax + mov rax, [rsp + 136] + + add rsp, 144 + +%else ;; SAFE_LOOKUP + + movzx DWORD(%%value), BYTE [%%table + %%idx] + +%endif ;; SAFE_LOOKUP +%endmacro + +; +; NONLIN_FUN() +; +; params +; %1 == 1, then calculate W +; uses +; rdi rsi eax rdx edx +; r8d r9d ebx +; return +; eax = W value +; r10d = F_R1 +; r11d = F_R2 +; +%macro NONLIN_FUN 1 + +%if (%1 == 1) + mov eax, r12d + xor eax, r10d + add eax, r11d ; W = (BRC_X0 ^ F_R1) + F_R2 +%endif + lea rdi, [rel S0] + lea rsi, [rel S1] + + add r10d, r13d ; W1= F_R1 + BRC_X1 + xor r11d, r14d ; W2= F_R2 ^ BRC_X2 + + mov rdx, r10 + shld edx, r11d, 16 ; P = (W1 << 16) | (W2 >> 16) + shld r11d, r10d, 16 ; Q = (W2 << 16) | (W1 >> 16) + + mov ebx, edx + mov ecx, edx + mov r8d, edx + mov r9d, edx + + rol ebx, 2 + rol ecx, 10 + rol r8d, 18 + rol r9d, 24 + xor edx, ebx + xor edx, ecx + xor edx, r8d + xor edx, r9d ; U = L1(P) = EDX, hi(RDX)=0 + ; + xor r10, r10 + shld ebx, edx, 24 + shld r8d, edx, 16 + shld r9d, edx, 8 + and rdx, 0xFF + lookup_single_sbox rsi, rdx, rdx + and rbx, 0xFF + lookup_single_sbox rdi, rbx, rbx + and r8, 0xFF + lookup_single_sbox rsi, r8, r8 + and r9, 0xFF + lookup_single_sbox rdi, r9, r9 + shrd r10d, edx, 8 + shrd r10d, ebx, 8 + shrd r10d, r8d, 8 + shrd r10d, r9d, 8 + ; + mov ebx, r11d + mov ecx, r11d + mov r8d, r11d + mov r9d, r11d + rol ebx, 8 + rol ecx, 14 + rol r8d, 22 + rol r9d, 30 + xor r11d, ebx + xor r11d, ecx + xor r11d, r8d + xor r11d, r9d ; V = L2(Q) = ECX, hi(RCX)=0 + ; + shld ebx, r11d, 24 + shld r8d, r11d, 16 + shld r9d, r11d, 8 + and r11, 0xFF + + lookup_single_sbox rsi, r11, r11 + and rbx, 0xFF + lookup_single_sbox rdi, 
rbx, rbx + and r8, 0xFF + lookup_single_sbox rsi, r8, r8 + and r9, 0xFF + lookup_single_sbox rdi, r9, r9 + + shrd r11d, r11d, 8 + + shrd r11d, ebx, 8 + shrd r11d, r8d, 8 + shrd r11d, r9d, 8 +%endmacro + + +; +; LFSR_UPDT() +; +; params +; %1 - round number +; uses +; rax as input (ZERO or W) +; return +; +%macro LFSR_UPDT 1 + ; + ; ebx = LFSR_S0 + ; ecx = LFSR_S4 + ; edx = LFSR_S10 + ; r8d = LFSR_S13 + ; r9d = LFSR_S15 + ;lea rsi, [LFSR_STA] ; moved to calling function + + mov ebx, [rsi + (( 0 + %1) % 16)*4] + mov ecx, [rsi + (( 4 + %1) % 16)*4] + mov edx, [rsi + ((10 + %1) % 16)*4] + mov r8d, [rsi + ((13 + %1) % 16)*4] + mov r9d, [rsi + ((15 + %1) % 16)*4] + + ; Calculate 64-bit LFSR feedback + add rax, rbx + shl rbx, 8 + shl rcx, 20 + shl rdx, 21 + shl r8, 17 + shl r9, 15 + add rax, rbx + add rax, rcx + add rax, rdx + add rax, r8 + add rax, r9 + + ; Reduce it to 31-bit value + mov rbx, rax + and rax, 0x7FFFFFFF + shr rbx, 31 + add rax, rbx + + mov rbx, rax + sub rbx, 0x7FFFFFFF + cmovns rax, rbx + + + ; LFSR_S16 = (LFSR_S15++) = eax + mov [rsi + (( 0 + %1) % 16)*4], eax +%endmacro + + +; +; make_u31() +; +%macro make_u31 4 + +%define %%Rt %1 +%define %%Ke %2 +%define %%Ek %3 +%define %%Iv %4 + xor %%Rt, %%Rt + shrd %%Rt, %%Iv, 8 + shrd %%Rt, %%Ek, 15 + shrd %%Rt, %%Ke, 9 +%endmacro + + +; +; key_expand() +; +%macro key_expand 1 + movzx r8d, byte [pKe + (%1 + 0)] + movzx r9d, word [rbx + ((%1 + 0)*2)] + movzx r10d, byte [pIv + (%1 + 0)] + make_u31 r11d, r8d, r9d, r10d + mov [rax + ((%1 + 0)*4)], r11d + + movzx r12d, byte [pKe + (%1 + 1)] + movzx r13d, word [rbx + ((%1 + 1)*2)] + movzx r14d, byte [pIv + (%1 + 1)] + make_u31 r15d, r12d, r13d, r14d + mov [rax + ((%1 + 1)*4)], r15d +%endmacro + + + +;---------------------------------------------------------------------------------------- +;; +;;extern void Zuc_Initialization(uint8_t* pKey, uint8_t* pIV, uint32_t * pState) +;; +;; WIN64 +;; RCX - pKey +;; RDX - pIV +;; R8 - pState +;; LIN64 +;; RDI - pKey +;; RSI - pIV +;; RDX - pState +;; +align 16 +MKGLOBAL(asm_ZucInitialization,function,internal) +asm_ZucInitialization: + +%ifdef LINUX + %define pKe rdi + %define pIv rsi + %define pState rdx +%else + %define pKe rcx + %define pIv rdx + %define pState r8 +%endif + + ; save the base pointer + push rbp + + ;load stack pointer to rbp and reserve memory in the red zone + mov rbp, rsp + sub rsp, 196 + + ; Save non-volatile registers + mov [rbp - 8], rbx + mov [rbp - 32], r12 + mov [rbp - 40], r13 + mov [rbp - 48], r14 + mov [rbp - 56], r15 +%ifndef LINUX + mov [rbp - 64], rdi + mov [rbp - 72], rsi +%endif + + lea rbx, [rel EK_d] ; load pointer to D + lea rax, [pState] ; load pointer to pState + mov [rbp - 88], pState ; save pointer to pState + + ; Expand key + key_expand 0 + key_expand 2 + key_expand 4 + key_expand 6 + key_expand 8 + key_expand 10 + key_expand 12 + key_expand 14 + + ; Set R1 and R2 to zero + xor r10, r10 + xor r11, r11 + + ; Shift LFSR 32-times, update state variables +%assign N 0 +%rep 32 + mov rdx, [rbp - 88] ; load pointer to pState + lea rsi, [rdx] + + BITS_REORG N + + NONLIN_FUN 1 + shr eax, 1 + + mov rdx, [rbp - 88] ; re-load pointer to pState + lea rsi, [rdx] + + LFSR_UPDT N + +%assign N N+1 +%endrep + + ; And once more, initial round from keygen phase = 33 times + mov rdx, [rbp - 88] ; load pointer to pState + lea rsi, [rdx] + + + BITS_REORG 0 + NONLIN_FUN 0 + xor rax, rax + + mov rdx, [rbp - 88] ; load pointer to pState + lea rsi, [rdx] + + LFSR_UPDT 0 + + mov rdx, [rbp - 88] ; load pointer to pState + lea rsi, [rdx] + + 
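+        ;; Editorial note: each of the 33 rounds above follows the ZUC spec's
+        ;; initialisation procedure. LFSR_UPDT computes
+        ;;   s16 = (2^15*s15 + 2^17*s13 + 2^21*s10 + 2^20*s4 + (1 + 2^8)*s0 + u)
+        ;;         mod (2^31 - 1)
+        ;; with u = W >> 1 during the 32 key-loading rounds, followed by one
+        ;; working-mode round (u = 0) whose output is discarded.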
; Save ZUC's state variables + mov [rsi + (16*4)],r10d ;F_R1 + mov [rsi + (17*4)],r11d ;F_R2 + mov [rsi + (18*4)],r12d ;BRC_X0 + mov [rsi + (19*4)],r13d ;BRC_X1 + mov [rsi + (20*4)],r14d ;BRC_X2 + mov [rsi + (21*4)],r15d ;BRC_X3 + + + ; Restore non-volatile registers + mov rbx, [rbp - 8] + mov r12, [rbp - 32] + mov r13, [rbp - 40] + mov r14, [rbp - 48] + mov r15, [rbp - 56] +%ifndef LINUX + mov rdi, [rbp - 64] + mov rsi, [rbp - 72] +%endif + + ; restore base pointer + mov rsp, rbp + pop rbp + + ret + + +;; +;; void asm_ZucGenKeystream8B(void *pKeystream, ZucState_t *pState); +;; +;; WIN64 +;; RCX - KS (key stream pointer) +;; RDX - STATE (state pointer) +;; LIN64 +;; RDI - KS (key stream pointer) +;; RSI - STATE (state pointer) +;; +align 16 +MKGLOBAL(asm_ZucGenKeystream8B,function,internal) +asm_ZucGenKeystream8B: + +%ifdef LINUX + %define pKS rdi + %define pState rsi +%else + %define pKS rcx + %define pState rdx +%endif + ; save the base pointer + push rbp + + ;load stack pointer to rbp and reserve memory in the red zone + mov rbp, rsp + sub rsp, 196 + + ; Save non-volatile registers + mov [rbp - 8], rbx + mov [rbp - 32], r12 + mov [rbp - 40], r13 + mov [rbp - 48], r14 + mov [rbp - 56], r15 +%ifndef LINUX + mov [rbp - 64], rdi + mov [rbp - 72], rsi +%endif + + + ; Load input keystream pointer parameter in RAX + mov rax, pKS + + ; Restore ZUC's state variables + xor r10, r10 + xor r11, r11 + mov r10d, [pState + OFFSET_FR1] + mov r11d, [pState + OFFSET_FR2] + mov r12d, [pState + OFFSET_BRC_X0] + mov r13d, [pState + OFFSET_BRC_X1] + mov r14d, [pState + OFFSET_BRC_X2] + mov r15d, [pState + OFFSET_BRC_X3] + + ; Store keystream pointer + mov [rbp - 80], rax + + ; Store ZUC State Pointer + mov [rbp - 88], pState + + ; Generate 8B of keystream in 2 rounds +%assign N 1 +%rep 2 + + mov rdx, [rbp - 88] ; load *pState + lea rsi, [rdx] + + BITS_REORG N + NONLIN_FUN 1 + + ;Store the keystream + mov rbx, [rbp - 80] ; load *pkeystream + xor eax, r15d + mov [rbx], eax + add rbx, 4 ; increment the pointer + mov [rbp - 80], rbx ; save pkeystream + + xor rax, rax + + mov rdx, [rbp - 88] ; load *pState + lea rsi, [rdx] + + LFSR_UPDT N + +%assign N N+1 +%endrep + + mov rsi, [rbp - 88] ; load pState + + + ; Save ZUC's state variables + mov [rsi + OFFSET_FR1], r10d + mov [rsi + OFFSET_FR2], r11d + mov [rsi + OFFSET_BRC_X0], r12d + mov [rsi + OFFSET_BRC_X1], r13d + mov [rsi + OFFSET_BRC_X2], r14d + mov [rsi + OFFSET_BRC_X3], r15d + + ; Restore non-volatile registers + mov rbx, [rbp - 8] + mov r12, [rbp - 32] + mov r13, [rbp - 40] + mov r14, [rbp - 48] + mov r15, [rbp - 56] +%ifndef LINUX + mov rdi, [rbp - 64] + mov rsi, [rbp - 72] +%endif + + mov rsp, rbp + pop rbp + + ret + + +;; +;; void asm_ZucGenKeystream64B(uint32_t * pKeystream, uint32_t * pState); +;; +;; WIN64 +;; RCX - KS (key stream pointer) +;; RDX - STATE (state pointer) +;; LIN64 +;; RDI - KS (key stream pointer) +;; RSI - STATE (state pointer) +;; +align 16 +MKGLOBAL(asm_ZucGenKeystream64B,function,internal) +asm_ZucGenKeystream64B: + +%ifdef LINUX + %define pKS rdi + %define pState rsi +%else + %define pKS rcx + %define pState rdx +%endif + ; save the base pointer + push rbp + + ;load stack pointer to rbp and reserve memory in the red zone + mov rbp, rsp + sub rsp, 196 + + ; Save non-volatile registers + mov [rbp - 8], rbx + mov [rbp - 32], r12 + mov [rbp - 40], r13 + mov [rbp - 48], r14 + mov [rbp - 56], r15 +%ifndef LINUX + mov [rbp - 64], rdi + mov [rbp - 72], rsi +%endif + + + ; Load input keystream pointer parameter in RAX + mov rax, pKS + + 
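+        ;; Editorial note: each round below reorganises the LFSR into
+        ;; BRC_X0..BRC_X3 (BITS_REORG), runs the nonlinear function F
+        ;; (NONLIN_FUN) to produce W, and emits one 32-bit keystream word
+        ;;   z = W ^ BRC_X3          (the "xor eax, r15d" in the loop below),
+        ;; so the 16 rounds here yield 64 bytes; asm_ZucGenKeystream8B above
+        ;; uses the same round twice to produce 8 bytes.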
; Restore ZUC's state variables + xor r10, r10 + xor r11, r11 + mov r10d, [pState + OFFSET_FR1] + mov r11d, [pState + OFFSET_FR2] + mov r12d, [pState + OFFSET_BRC_X0] + mov r13d, [pState + OFFSET_BRC_X1] + mov r14d, [pState + OFFSET_BRC_X2] + mov r15d, [pState + OFFSET_BRC_X3] + + ; Store keystream pointer + mov [rbp - 80], rax + + ; Store ZUC State Pointer + mov [rbp - 88], pState + + ; Generate 64B of keystream in 16 rounds +%assign N 1 +%rep 16 + + mov rdx, [rbp - 88] ; load *pState + lea rsi, [rdx] + + BITS_REORG N + NONLIN_FUN 1 + + ;Store the keystream + mov rbx, [rbp - 80] ; load *pkeystream + xor eax, r15d + mov [rbx], eax + add rbx, 4 ; increment the pointer + mov [rbp - 80], rbx ; save pkeystream + + xor rax, rax + + mov rdx, [rbp - 88] ; load *pState + lea rsi, [rdx] + + LFSR_UPDT N + +%assign N N+1 +%endrep + + mov rsi, [rbp - 88] ; load pState + + + ; Save ZUC's state variables + mov [rsi + OFFSET_FR1], r10d + mov [rsi + OFFSET_FR2], r11d + mov [rsi + OFFSET_BRC_X0], r12d + mov [rsi + OFFSET_BRC_X1], r13d + mov [rsi + OFFSET_BRC_X2], r14d + mov [rsi + OFFSET_BRC_X3], r15d + + ; Restore non-volatile registers + mov rbx, [rbp - 8] + mov r12, [rbp - 32] + mov r13, [rbp - 40] + mov r14, [rbp - 48] + mov r15, [rbp - 56] +%ifndef LINUX + mov rdi, [rbp - 64] + mov rsi, [rbp - 72] +%endif + + mov rsp, rbp + pop rbp + + ret + + diff --git a/src/spdk/intel-ipsec-mb/include/zuc_internal.h b/src/spdk/intel-ipsec-mb/include/zuc_internal.h new file mode 100755 index 000000000..525a1604c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/zuc_internal.h @@ -0,0 +1,432 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/** + ****************************************************************************** + * @file zuc_internal.h + * + * @description + * This header file defines the internal API's and data types for the + * 3GPP algorithm ZUC. 
+ *
+ *****************************************************************************/
+
+#ifndef ZUC_INTERNAL_H_
+#define ZUC_INTERNAL_H_
+
+#include <stdio.h>
+#include <stdint.h>
+
+#include "intel-ipsec-mb.h"
+#include "immintrin.h"
+#include "include/wireless_common.h"
+
+/* 64 bytes of Keystream will be generated */
+#define ZUC_KEYSTR_LEN (64)
+#define NUM_LFSR_STATES (16)
+#define ZUC_WORD (32)
+
+/* Range of input data for ZUC is from 1 to 65504 bits */
+#define ZUC_MIN_LEN 1
+#define ZUC_MAX_LEN 65504
+
+#ifdef DEBUG
+#ifdef _WIN32
+#define DEBUG_PRINT(_fmt, ...) \
+        fprintf(stderr, "%s()::%d " _fmt , __FUNCTION__, __LINE__, __VA_ARGS__)
+#else
+#define DEBUG_PRINT(_fmt, ...) \
+        fprintf(stderr, "%s()::%d " _fmt , __func__, __LINE__, __VA_ARGS__)
+#endif
+#else
+#define DEBUG_PRINT(_fmt, ...)
+#endif
+
+/**
+ ******************************************************************************
+ * @description
+ *      Macro that loops through a 64-byte keystream and XORs it with the
+ *      input buffer, placing the result in the output buffer.
+ *      Keystream bytes must be swapped on a 32-bit boundary before this
+ *      operation.
+ *
+ *****************************************************************************/
+#define ZUC_XOR_KEYSTREAM(pIn64, pOut64, pKeyStream64) \
+{ \
+        int i = 0; \
+        union SwapBytes_t { \
+                uint64_t l64; \
+                uint32_t w32[2]; \
+        } swapBytes; \
+        /* loop through the key stream and xor 64 bits at a time */ \
+        for (i = 0; i < ZUC_KEYSTR_LEN/8; i++) { \
+                swapBytes.l64 = *pKeyStream64++; \
+                swapBytes.w32[0] = bswap4(swapBytes.w32[0]); \
+                swapBytes.w32[1] = bswap4(swapBytes.w32[1]); \
+                *pOut64++ = *pIn64++ ^ swapBytes.l64; \
+        } \
+}
+
+/**
+ *****************************************************************************
+ * @description
+ *      Packed structure to store the ZUC state for a single packet. *
+ *****************************************************************************/
+typedef struct zuc_state_s {
+        uint32_t lfsrState[16];
+        /**< State registers of the LFSR */
+        uint32_t fR1;
+        /**< register of F */
+        uint32_t fR2;
+        /**< register of F */
+        uint32_t bX0;
+        /**< Output X0 of the bit reorganization */
+        uint32_t bX1;
+        /**< Output X1 of the bit reorganization */
+        uint32_t bX2;
+        /**< Output X2 of the bit reorganization */
+        uint32_t bX3;
+        /**< Output X3 of the bit reorganization */
+} ZucState_t;
+
+/**
+ *****************************************************************************
+ * @description
+ *      Packed structure to store the ZUC state for four packets.
* + *****************************************************************************/ +typedef struct zuc_state_4_s { + uint32_t lfsrState[16][4]; + /**< State registers of the LFSR */ + uint32_t fR1[4]; + /**< register of F */ + uint32_t fR2[4]; + /**< register of F */ + uint32_t bX0[4]; + /**< Output X0 of the bit reorganization for 4 packets */ + uint32_t bX1[4]; + /**< Output X1 of the bit reorganization for 4 packets */ + uint32_t bX2[4]; + /**< Output X2 of the bit reorganization for 4 packets */ + uint32_t bX3[4]; + /**< Output X3 of the bit reorganization for 4 packets */ +} ZucState4_t; + +/** + ***************************************************************************** + * @description + * Structure to store pointers to the 4 keys to be used as input to + * @ref asm_ZucInitialization_4 and @ref asm_ZucGenKeystream64B_4 + *****************************************************************************/ +typedef struct zuc_key_4_s { + const uint8_t *pKey1; + /**< Pointer to 128-bit key for packet 1 */ + const uint8_t *pKey2; + /**< Pointer to 128-bit key for packet 2 */ + const uint8_t *pKey3; + /**< Pointer to 128-bit key for packet 3 */ + const uint8_t *pKey4; + /**< Pointer to 128-bit key for packet 4 */ +} ZucKey4_t; + +/** + ***************************************************************************** + * @description + * Structure to store pointers to the 4 IV's to be used as input to + * @ref asm_ZucInitialization_4 and @ref asm_ZucGenKeystream64B_4 + *****************************************************************************/ +typedef struct zuc_iv_4_s { + const uint8_t *pIv1; + /**< Pointer to 128-bit initialization vector for packet 1 */ + const uint8_t *pIv2; + /**< Pointer to 128-bit initialization vector for packet 2 */ + const uint8_t *pIv3; + /**< Pointer to 128-bit initialization vector for packet 3 */ + const uint8_t *pIv4; + /**< Pointer to 128-bit initialization vector for packet 4 */ +} ZucIv4_t; + +/** + ****************************************************************************** + * + * @description + * Definition of the external function that implements the initialization + * stage of the ZUC algorithm. The function will initialize the state + * for a single packet operation. + * + * @param[in] pKey Pointer to the 128-bit initial key that + * will be used when initializing the ZUC + * state. + * @param[in] pIv Pointer to the 128-bit initial vector that + * will be used when initializing the ZUC + * state. + * @param[in,out] pState Pointer to a ZUC state structure of type + * @ref ZucState_t that will be populated + * with the initialized ZUC state. + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_ZucInitialization(const void *pKey, + const void *pIv, + ZucState_t *pState); + +/** + ****************************************************************************** + * @description + * Definition of the external function that implements the initialization + * stage of the ZUC algorithm for 4 packets. The function will initialize + * the state for 4 individual packets. + * + * @param[in] pKey Pointer to an array of 128-bit initial keys + * that will be used when initializing the ZUC + * state. + * @param[in] pIv Pointer to an array of 128-bit initial + * vectors that will be used when initializing + * the ZUC state. + * @param[in,out] pState Pointer to a ZUC state structure of type + * @ref ZucState4_t that will be populated + * with the initialized ZUC state. 
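+ *
+ * @note Editorial note: initialization loads each 31-bit LFSR cell as
+ *       s[i] = k[i] || d[i] || iv[i] (8-bit key byte, 15-bit constant from
+ *       the EK_d table, 8-bit IV byte) and then runs 32 rounds in which the
+ *       nonlinear output W, shifted right by one bit, is fed back into the
+ *       LFSR (see make_u31/key_expand in zuc_common.asm).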
+ * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_ZucInitialization_4_sse(ZucKey4_t *pKeys, + ZucIv4_t *pIvs, + ZucState4_t *pState); + +IMB_DLL_LOCAL void asm_ZucInitialization_4_avx(ZucKey4_t *pKeys, + ZucIv4_t *pIvs, + ZucState4_t *pState); + +/** + ****************************************************************************** + * + * @description + * Definition of the external function that implements the working + * stage of the ZUC algorithm. The function will generate 64 bytes of + * keystream. + * + * @param[in,out] pKeystream Pointer to an input buffer that will + * contain the generated keystream. + + * @param[in] pState Pointer to a ZUC state structure of type + * @ref ZucState_t + * + * @pre + * A successful call to @ref asm_ZucInitialization to initialize the ZUC + * state. + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_ZucGenKeystream64B(uint32_t *pKeystream, + ZucState_t *pState); + +/** + ****************************************************************************** + * + * @description + * Definition of the external function that implements the working + * stage of the ZUC algorithm. The function will generate 8 bytes of + * keystream. + * + * @param[in,out] pKeystream Pointer to an input buffer that will + * contain the generated keystream. + + * @param[in] pState Pointer to a ZUC state structure of type + * @ref ZucState_t + * + * @pre + * A successful call to @ref asm_ZucInitialization to initialize the ZUC + * state. + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_ZucGenKeystream8B(void *pKeystream, + ZucState_t *pState); + +/** + ****************************************************************************** + * + * @description + * Definition of the external function that implements the working + * stage of the ZUC algorithm. The function will generate 64 bytes of + * keystream for four packets in parallel. + * + * @param[in] pState Pointer to a ZUC state structure of type + * @ref ZucState4_t + * + * @param[in,out] pKeyStr1 Pointer to an input buffer that will + * contain the generated keystream for packet + * one. + * @param[in,out] pKeyStr2 Pointer to an input buffer that will + * contain the generated keystream for packet + * two. + * @param[in,out] pKeyStr3 Pointer to an input buffer that will + * contain the generated keystream for packet + * three. + * @param[in,out] pKeyStr4 Pointer to an input buffer that will + * contain the generated keystream for packet + * four. + * + * @pre + * A successful call to @ref asm_ZucInitialization_4 to initialize the ZUC + * state. 
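+ *
+ * Illustrative call sequence (editorial sketch; the key, IV and keystream
+ * buffer names below are assumptions, not part of this header):
+ *
+ *      ZucState4_t st4;
+ *      ZucKey4_t keys = { key1, key2, key3, key4 };
+ *      ZucIv4_t ivs = { iv1, iv2, iv3, iv4 };
+ *      uint32_t ks1[16], ks2[16], ks3[16], ks4[16];
+ *
+ *      asm_ZucInitialization_4_sse(&keys, &ivs, &st4);
+ *      asm_ZucGenKeystream64B_4_sse(&st4, ks1, ks2, ks3, ks4);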
+ * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_ZucGenKeystream64B_4_sse(ZucState4_t *pState, + uint32_t *pKeyStr1, + uint32_t *pKeyStr2, + uint32_t *pKeyStr3, + uint32_t *pKeyStr4); + +IMB_DLL_LOCAL void asm_ZucGenKeystream64B_4_avx(ZucState4_t *pState, + uint32_t *pKeyStr1, + uint32_t *pKeyStr2, + uint32_t *pKeyStr3, + uint32_t *pKeyStr4); + +/** + ****************************************************************************** + * @description + * Definition of the external function to update the authentication tag + * based on keystream and data (SSE varient) + * + * @param[in] T Authentication tag + * + * @param[in] ks Pointer to key stream + * + * @param[in] data Pointer to the data + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL uint32_t asm_Eia3Round64BSSE(uint32_t T, const void *ks, + const void *data); + +/** + ****************************************************************************** + * @description + * Definition of the external function to return the authentication + * update value to be XOR'ed with current authentication tag (SSE variant) + * + * @param[in] ks Pointer to key stream + * + * @param[in] data Pointer to the data + * + * @param[in] n_words Number of data bits to be processed + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL uint32_t asm_Eia3RemainderSSE(const void *ks, const void *data, + const uint64_t n_words); + +/** + ****************************************************************************** + * @description + * Definition of the external function to update the authentication tag + * based on keystream and data (AVX variant) + * + * @param[in] T Authentication tag + * + * @param[in] ks Pointer to key stream + * + * @param[in] data Pointer to the data + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL uint32_t asm_Eia3Round64BAVX(uint32_t T, const void *ks, + const void *data); + +/** + ****************************************************************************** + * @description + * Definition of the external function to return the authentication + * update value to be XOR'ed with current authentication tag (AVX variant) + * + * @param[in] ks Pointer to key stream + * + * @param[in] data Pointer to the data + * + * @param[in] n_words Number of data bits to be processed + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL uint32_t asm_Eia3RemainderAVX(const void *ks, const void *data, + const uint64_t n_words); + + +/* the s-boxes */ +extern const uint8_t S0[256]; +extern const uint8_t S1[256]; + +void zuc_eea3_1_buffer_sse(const void *pKey, const void *pIv, + const void *pBufferIn, void *pBufferOut, + const uint32_t lengthInBytes); + +void zuc_eea3_4_buffer_sse(const void * const pKey[4], + const void * const pIv[4], + const void * const pBufferIn[4], + void *pBufferOut[4], + const uint32_t lengthInBytes[4]); + +void zuc_eea3_n_buffer_sse(const void * const pKey[], const void * const pIv[], + const void * const pBufferIn[], void *pBufferOut[], + const uint32_t lengthInBytes[], + const uint32_t numBuffers); + +void zuc_eia3_1_buffer_sse(const void *pKey, const void *pIv, + const void *pBufferIn, const uint32_t lengthInBits, + uint32_t *pMacI); + +void zuc_eea3_1_buffer_avx(const void *pKey, const void *pIv, + 
const void *pBufferIn, void *pBufferOut, + const uint32_t lengthInBytes); + +void zuc_eea3_4_buffer_avx(const void * const pKey[4], + const void * const pIv[4], + const void * const pBufferIn[4], + void *pBufferOut[4], + const uint32_t lengthInBytes[4]); + +void zuc_eea3_n_buffer_avx(const void * const pKey[], const void * const pIv[], + const void * const pBufferIn[], void *pBufferOut[], + const uint32_t lengthInBytes[], + const uint32_t numBuffers); + +void zuc_eia3_1_buffer_avx(const void *pKey, const void *pIv, + const void *pBufferIn, const uint32_t lengthInBits, + uint32_t *pMacI); + + +#endif /* ZUC_INTERNAL_H_ */ + -- cgit v1.2.3
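Editorial appendix: to help cross-check the assembly in zuc_common.asm against the ZUC specification, the sketch below is a plain-C model of one working-mode keystream round. It is not part of the patch: the helper names (mod31, sbox32, bits_reorg, nonlin_fun, lfsr_update, keystream_word) are invented for illustration, it assumes it is built and linked inside the intel-ipsec-mb tree so that zuc_internal.h and the S0/S1 tables it declares resolve, and it shifts the LFSR physically where the assembly indexes it circularly.

        #include <stdint.h>
        #include <string.h>
        #include "include/zuc_internal.h"      /* ZucState_t, extern S0[], S1[] */

        static uint32_t rotl32(uint32_t x, unsigned n)
        { return (x << n) | (x >> (32 - n)); }

        /* linear transforms L1/L2 (the rol/xor chains in NONLIN_FUN) */
        static uint32_t zuc_L1(uint32_t x)
        { return x ^ rotl32(x, 2) ^ rotl32(x, 10) ^ rotl32(x, 18) ^ rotl32(x, 24); }
        static uint32_t zuc_L2(uint32_t x)
        { return x ^ rotl32(x, 8) ^ rotl32(x, 14) ^ rotl32(x, 22) ^ rotl32(x, 30); }

        /* byte-wise S-box layer S = (S0, S1, S0, S1), most significant byte first */
        static uint32_t sbox32(uint32_t x)
        {
                return ((uint32_t)S0[(x >> 24) & 0xFF] << 24) |
                       ((uint32_t)S1[(x >> 16) & 0xFF] << 16) |
                       ((uint32_t)S0[(x >>  8) & 0xFF] <<  8) |
                        (uint32_t)S1[x & 0xFF];
        }

        /* reduce a sum modulo (2^31 - 1), like the fold + conditional subtract
         * in LFSR_UPDT */
        static uint32_t mod31(uint64_t sum)
        {
                while (sum >> 31)
                        sum = (sum & 0x7FFFFFFFULL) + (sum >> 31);
                return (sum == 0x7FFFFFFFULL) ? 0 : (uint32_t)sum;
        }

        /* BITS_REORG: form the 32-bit words X0..X3 from the 31-bit LFSR cells */
        static void bits_reorg(ZucState_t *st)
        {
                const uint32_t *s = st->lfsrState;

                st->bX0 = ((s[15] >> 15) << 16) | (s[14] & 0xFFFF);
                st->bX1 = ((s[11] & 0xFFFF) << 16) | (s[9] >> 15);
                st->bX2 = ((s[7] & 0xFFFF) << 16) | (s[5] >> 15);
                st->bX3 = ((s[2] & 0xFFFF) << 16) | (s[0] >> 15);
        }

        /* NONLIN_FUN: nonlinear function F, returns W and updates fR1/fR2 */
        static uint32_t nonlin_fun(ZucState_t *st)
        {
                uint32_t w  = (st->bX0 ^ st->fR1) + st->fR2;   /* mod 2^32 */
                uint32_t w1 = st->fR1 + st->bX1;
                uint32_t w2 = st->fR2 ^ st->bX2;

                st->fR1 = sbox32(zuc_L1((w1 << 16) | (w2 >> 16)));
                st->fR2 = sbox32(zuc_L2((w2 << 16) | (w1 >> 16)));
                return w;
        }

        /* LFSR_UPDT: feedback mod (2^31 - 1); u = 0 in working mode */
        static void lfsr_update(ZucState_t *st, uint32_t u)
        {
                uint32_t *s = st->lfsrState;
                uint64_t f = (uint64_t)u + s[0] +
                             ((uint64_t)s[0] << 8)  + ((uint64_t)s[4] << 20) +
                             ((uint64_t)s[10] << 21) + ((uint64_t)s[13] << 17) +
                             ((uint64_t)s[15] << 15);
                uint32_t s16 = mod31(f);

                memmove(&s[0], &s[1], 15 * sizeof(s[0]));      /* shift the register */
                s[15] = s16;
        }

        /* one working-mode round: emits a single 32-bit keystream word */
        static uint32_t keystream_word(ZucState_t *st)
        {
                bits_reorg(st);
                uint32_t z = nonlin_fun(st) ^ st->bX3;
                lfsr_update(st, 0);
                return z;
        }

Calling keystream_word() 16 times on a state produced by asm_ZucInitialization() should reproduce the 64 bytes written by asm_ZucGenKeystream64B(), up to the per-word byte swapping later applied by ZUC_XOR_KEYSTREAM.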