Diffstat (limited to 'src/spdk/intel-ipsec-mb/include')
-rw-r--r--  src/spdk/intel-ipsec-mb/include/aes_common.asm            |  375
-rw-r--r--  src/spdk/intel-ipsec-mb/include/aesni_emu.h               |  120
-rw-r--r--  src/spdk/intel-ipsec-mb/include/aesni_emu.inc             |  247
-rw-r--r--  src/spdk/intel-ipsec-mb/include/clear_regs.asm            |  196
-rw-r--r--  src/spdk/intel-ipsec-mb/include/clear_regs_mem.h          |   53
-rw-r--r--  src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm    |  124
-rw-r--r--  src/spdk/intel-ipsec-mb/include/const.inc                 |  163
-rw-r--r--  src/spdk/intel-ipsec-mb/include/constant_lookup.asm       |  561
-rw-r--r--  src/spdk/intel-ipsec-mb/include/constant_lookup.h         |  173
-rw-r--r--  src/spdk/intel-ipsec-mb/include/cpu_feature.h             |   52
-rw-r--r--  src/spdk/intel-ipsec-mb/include/datastruct.asm            |  235
-rw-r--r--  src/spdk/intel-ipsec-mb/include/dbgprint.asm              |  413
-rw-r--r--  src/spdk/intel-ipsec-mb/include/des_utils.h               |  134
-rw-r--r--  src/spdk/intel-ipsec-mb/include/gcm.h                     |  428
-rw-r--r--  src/spdk/intel-ipsec-mb/include/gcm_defines.asm           |  272
-rw-r--r--  src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm  |   52
-rw-r--r--  src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm      |   73
-rw-r--r--  src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm  |  231
-rwxr-xr-x  src/spdk/intel-ipsec-mb/include/kasumi_internal.h         | 1853
-rw-r--r--  src/spdk/intel-ipsec-mb/include/memcpy.asm                |  613
-rw-r--r--  src/spdk/intel-ipsec-mb/include/noaesni.h                 |   65
-rw-r--r--  src/spdk/intel-ipsec-mb/include/os.asm                    |   58
-rw-r--r--  src/spdk/intel-ipsec-mb/include/reg_sizes.asm             |  300
-rw-r--r--  src/spdk/intel-ipsec-mb/include/save_xmms.asm             |  132
-rw-r--r--  src/spdk/intel-ipsec-mb/include/save_xmms.h               |   39
-rw-r--r--  src/spdk/intel-ipsec-mb/include/snow3g.h                  |  511
-rw-r--r--  src/spdk/intel-ipsec-mb/include/snow3g_common.h           | 2840
-rw-r--r--  src/spdk/intel-ipsec-mb/include/snow3g_internal.h         |  638
-rw-r--r--  src/spdk/intel-ipsec-mb/include/transpose_avx2.asm        |  218
-rw-r--r--  src/spdk/intel-ipsec-mb/include/transpose_avx512.asm      |  497
-rw-r--r--  src/spdk/intel-ipsec-mb/include/wireless_common.asm       |  128
-rw-r--r--  src/spdk/intel-ipsec-mb/include/wireless_common.h         |  216
-rw-r--r--  src/spdk/intel-ipsec-mb/include/zuc_common.asm            |  740
-rwxr-xr-x  src/spdk/intel-ipsec-mb/include/zuc_internal.h            |  432
34 files changed, 13182 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/include/aes_common.asm b/src/spdk/intel-ipsec-mb/include/aes_common.asm
new file mode 100644
index 000000000..5c8cbb48c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/aes_common.asm
@@ -0,0 +1,375 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef _AES_COMMON_ASM_
+%define _AES_COMMON_ASM_
+
+%include "include/reg_sizes.asm"
+
+;; =============================================================================
+;; Generic macro to produce code that executes the %%OPCODE instruction
+;; on a selected number of AES blocks (16 bytes each), between 0 and 16.
+;; All three operands of the instruction come from registers.
+;; Note: if 3 blocks are left at the end, the instruction is produced to operate
+;; on all 4 blocks (full width of ZMM)
+
+%macro ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 14
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%OPCODE %2 ; [in] instruction name
+%define %%DST0 %3 ; [out] destination ZMM register
+%define %%DST1 %4 ; [out] destination ZMM register
+%define %%DST2 %5 ; [out] destination ZMM register
+%define %%DST3 %6 ; [out] destination ZMM register
+%define %%SRC1_0 %7 ; [in] source 1 ZMM register
+%define %%SRC1_1 %8 ; [in] source 1 ZMM register
+%define %%SRC1_2 %9 ; [in] source 1 ZMM register
+%define %%SRC1_3 %10 ; [in] source 1 ZMM register
+%define %%SRC2_0 %11 ; [in] source 2 ZMM register
+%define %%SRC2_1 %12 ; [in] source 2 ZMM register
+%define %%SRC2_2 %13 ; [in] source 2 ZMM register
+%define %%SRC2_3 %14 ; [in] source 2 ZMM register
+
+%assign reg_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+%rep (%%NUM_BLOCKS / 4)
+%xdefine %%DSTREG %%DST %+ reg_idx
+%xdefine %%SRC1REG %%SRC1_ %+ reg_idx
+%xdefine %%SRC2REG %%SRC2_ %+ reg_idx
+ %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG
+%undef %%DSTREG
+%undef %%SRC1REG
+%undef %%SRC2REG
+%assign reg_idx (reg_idx + 1)
+%assign blocks_left (blocks_left - 4)
+%endrep
+
+%xdefine %%DSTREG %%DST %+ reg_idx
+%xdefine %%SRC1REG %%SRC1_ %+ reg_idx
+%xdefine %%SRC2REG %%SRC2_ %+ reg_idx
+
+%if blocks_left == 1
+ %%OPCODE XWORD(%%DSTREG), XWORD(%%SRC1REG), XWORD(%%SRC2REG)
+%elif blocks_left == 2
+ %%OPCODE YWORD(%%DSTREG), YWORD(%%SRC1REG), YWORD(%%SRC2REG)
+%elif blocks_left == 3
+ %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG
+%endif
+
+%endmacro
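Illustrative usage (editor's sketch; the register choices and block count are hypothetical, not part of the original file): emit vpxorq for 7 blocks held in zmm0-zmm1, with the round key broadcast in zmm30.

        ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 7, vpxorq, \
                        zmm0, zmm1, zmm2, zmm3, \
                        zmm0, zmm1, zmm2, zmm3, \
                        zmm30, zmm30, zmm30, zmm30
        ;; expands to:  vpxorq zmm0, zmm0, zmm30    ; blocks 0-3
        ;;              vpxorq zmm1, zmm1, zmm30    ; blocks 4-6 (full ZMM, per the note above)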
+
+;; =============================================================================
+;; Loads specified number of AES blocks into ZMM registers
+;; %%FLAGS is optional and only affects behavior when 3 trailing blocks are left
+;; - if %%FLAGS is not provided then exactly 3 blocks are loaded (move and insert)
+;; - if the "load_4_instead_of_3" option is passed then 4 blocks are loaded
+%macro ZMM_LOAD_BLOCKS_0_16 7-8
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%INP %2 ; [in] input data pointer to read from
+%define %%DATA_OFFSET %3 ; [in] offset to the input pointer (GP or numerical)
+%define %%DST0 %4 ; [out] ZMM register with loaded data
+%define %%DST1 %5 ; [out] ZMM register with loaded data
+%define %%DST2 %6 ; [out] ZMM register with loaded data
+%define %%DST3 %7 ; [out] ZMM register with loaded data
+%define %%FLAGS %8 ; [in] optional "load_4_instead_of_3"
+
+%assign src_offset 0
+%assign dst_idx 0
+
+%rep (%%NUM_BLOCKS / 4)
+%xdefine %%DSTREG %%DST %+ dst_idx
+ vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset]
+%undef %%DSTREG
+%assign src_offset (src_offset + 64)
+%assign dst_idx (dst_idx + 1)
+%endrep
+
+%assign blocks_left (%%NUM_BLOCKS % 4)
+%xdefine %%DSTREG %%DST %+ dst_idx
+
+%if blocks_left == 1
+ vmovdqu8 XWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset]
+%elif blocks_left == 2
+ vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset]
+%elif blocks_left == 3
+%ifidn %%FLAGS, load_4_instead_of_3
+ vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset]
+%else
+ vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset]
+ vinserti64x2 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset + 32], 2
+%endif
+%endif
+
+%endmacro
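Illustrative usage (editor's sketch; the registers and the buffer pointer in r10 are hypothetical): load 11 blocks (176 bytes) starting at [r10].

        ZMM_LOAD_BLOCKS_0_16 11, r10, 0, zmm0, zmm1, zmm2, zmm3
        ;; zmm0 and zmm1 are loaded in full (blocks 0-7); the 3 trailing blocks
        ;; go into zmm2 via a YMM load plus a 128-bit insert, unless the
        ;; optional load_4_instead_of_3 flag is passed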
+
+;; =============================================================================
+;; Loads specified number of AES blocks into ZMM registers using mask register
+;; for the last loaded register (xmm, ymm or zmm).
+;; Loads take place at 1 byte granularity.
+%macro ZMM_LOAD_MASKED_BLOCKS_0_16 8
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%INP %2 ; [in] input data pointer to read from
+%define %%DATA_OFFSET %3 ; [in] offset to the input pointer (GP or numerical)
+%define %%DST0 %4 ; [out] ZMM register with loaded data
+%define %%DST1 %5 ; [out] ZMM register with loaded data
+%define %%DST2 %6 ; [out] ZMM register with loaded data
+%define %%DST3 %7 ; [out] ZMM register with loaded data
+%define %%MASK %8 ; [in] mask register
+
+%assign src_offset 0
+%assign dst_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+%if %%NUM_BLOCKS > 0
+%rep (((%%NUM_BLOCKS + 3) / 4) - 1)
+%xdefine %%DSTREG %%DST %+ dst_idx
+ vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset]
+%undef %%DSTREG
+%assign src_offset (src_offset + 64)
+%assign dst_idx (dst_idx + 1)
+%assign blocks_left (blocks_left - 4)
+%endrep
+%endif ; %if %%NUM_BLOCKS > 0
+
+%xdefine %%DSTREG %%DST %+ dst_idx
+
+%if blocks_left == 1
+ vmovdqu8 XWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset]
+%elif blocks_left == 2
+ vmovdqu8 YWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset]
+%elif (blocks_left == 3 || blocks_left == 4)
+ vmovdqu8 %%DSTREG{%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset]
+%endif
+
+%endmacro
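Editor's sketch (mask value and registers are hypothetical): load 9 blocks where the 9th block holds only 10 valid bytes, zeroing the rest via mask register k1.

        mov     r11d, 0x3ff                 ; 10 low bits set = 10 valid bytes
        kmovd   k1, r11d
        ZMM_LOAD_MASKED_BLOCKS_0_16 9, r10, 0, zmm0, zmm1, zmm2, zmm3, k1
        ;; zmm0/zmm1 are full 64-byte loads; the final block is loaded into
        ;; XWORD(zmm2) under byte mask k1 with the unselected bytes zeroed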
+
+;; =============================================================================
+;; Stores specified number of AES blocks from ZMM registers
+%macro ZMM_STORE_BLOCKS_0_16 7
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%OUTP %2 ; [in] output data pointer to write to
+%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical)
+%define %%SRC0 %4 ; [in] ZMM register with data to store
+%define %%SRC1 %5 ; [in] ZMM register with data to store
+%define %%SRC2 %6 ; [in] ZMM register with data to store
+%define %%SRC3 %7 ; [in] ZMM register with data to store
+
+%assign dst_offset 0
+%assign src_idx 0
+
+%rep (%%NUM_BLOCKS / 4)
+%xdefine %%SRCREG %%SRC %+ src_idx
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG
+%undef %%SRCREG
+%assign dst_offset (dst_offset + 64)
+%assign src_idx (src_idx + 1)
+%endrep
+
+%assign blocks_left (%%NUM_BLOCKS % 4)
+%xdefine %%SRCREG %%SRC %+ src_idx
+
+%if blocks_left == 1
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], XWORD(%%SRCREG)
+%elif blocks_left == 2
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG)
+%elif blocks_left == 3
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG)
+ vextracti32x4 [%%OUTP + %%DATA_OFFSET + dst_offset + 32], %%SRCREG, 2
+%endif
+
+%endmacro
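Illustrative counterpart of the load macro (editor's sketch, hypothetical registers): write 6 blocks (96 bytes) to [r11 + rax].

        ZMM_STORE_BLOCKS_0_16 6, r11, rax, zmm0, zmm1, zmm2, zmm3
        ;; one full 64-byte store from zmm0, then a 32-byte store of YWORD(zmm1)
        ;; for the 2 remaining blocks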
+
+;; =============================================================================
+;; Stores specified number of AES blocks from ZMM registers with mask register
+;; for the last stored register (xmm, ymm or zmm).
+;; Stores take place at 1 byte granularity.
+%macro ZMM_STORE_MASKED_BLOCKS_0_16 8
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%OUTP %2 ; [in] output data pointer to write to
+%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical)
+%define %%SRC0 %4 ; [in] ZMM register with data to store
+%define %%SRC1 %5 ; [in] ZMM register with data to store
+%define %%SRC2 %6 ; [in] ZMM register with data to store
+%define %%SRC3 %7 ; [in] ZMM register with data to store
+%define %%MASK %8 ; [in] mask register
+
+%assign dst_offset 0
+%assign src_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+%if %%NUM_BLOCKS > 0
+%rep (((%%NUM_BLOCKS + 3) / 4) - 1)
+%xdefine %%SRCREG %%SRC %+ src_idx
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG
+%undef %%SRCREG
+%assign dst_offset (dst_offset + 64)
+%assign src_idx (src_idx + 1)
+%assign blocks_left (blocks_left - 4)
+%endrep
+%endif ; %if %%NUM_BLOCKS > 0
+
+%xdefine %%SRCREG %%SRC %+ src_idx
+
+%if blocks_left == 1
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, XWORD(%%SRCREG)
+%elif blocks_left == 2
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, YWORD(%%SRCREG)
+%elif (blocks_left == 3 || blocks_left == 4)
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, %%SRCREG
+%endif
+
+%endmacro
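Editor's sketch (hypothetical mask and registers): the masked store mirrors the masked load above, e.g. flushing the same 9-blocks-plus-partial layout.

        ZMM_STORE_MASKED_BLOCKS_0_16 9, r11, 0, zmm0, zmm1, zmm2, zmm3, k1
        ;; zmm0/zmm1 are stored in full; only the bytes selected by k1 of
        ;; XWORD(zmm2) are written for the final partial block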
+
+;;; ===========================================================================
+;;; Handles AES encryption rounds
+;;; It handles special cases: the last and first rounds
+;;; Optionally, it performs XOR with data after the last AES round.
+;;; Uses the NROUNDS parameter to check what needs to be done for the current round.
+;;; If 3 blocks are trailing then the operation is performed on the whole ZMM (4 blocks).
+%macro ZMM_AESENC_ROUND_BLOCKS_0_16 12
+%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3
+%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7
+%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11
+%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15
+%define %%KEY %5 ; [in] zmm containing round key
+%define %%ROUND %6 ; [in] round number
+%define %%D0_3 %7 ; [in] zmm or no_data; plain/cipher text blocks 0-3
+%define %%D4_7 %8 ; [in] zmm or no_data; plain/cipher text blocks 4-7
+%define %%D8_11 %9 ; [in] zmm or no_data; plain/cipher text blocks 8-11
+%define %%D12_15 %10 ; [in] zmm or no_data; plain/cipher text blocks 12-15
+%define %%NUMBL %11 ; [in] number of blocks; numerical value
+%define %%NROUNDS %12 ; [in] number of rounds; numerical value
+
+;;; === first AES round
+%if (%%ROUND < 1)
+ ;; round 0
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; ROUND 0
+
+;;; === middle AES rounds
+%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS)
+ ;; rounds 1 to 9/11/13
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenc, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; rounds 1 to 9/11/13
+
+;;; === last AES round
+%if (%%ROUND > %%NROUNDS)
+ ;; the last round - mix enclast with text xor's
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenclast, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+
+;;; === XOR with data
+%ifnidn %%D0_3, no_data
+%ifnidn %%D4_7, no_data
+%ifnidn %%D8_11, no_data
+%ifnidn %%D12_15, no_data
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%D0_3, %%D4_7, %%D8_11, %%D12_15
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+
+%endif ; The last round
+
+%endmacro
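Editor's sketch of how this macro is typically driven (the expanded-key pointer in rdx and the register choices are hypothetical): a full AES-128 pass over 16 blocks iterates ROUND from 0 to NROUNDS+1, broadcasting each round key into a ZMM first.

%assign round 0
%rep 11                                         ; AES-128: rounds 0..10, NROUNDS = 9
        vbroadcasti64x2 zmm30, [rdx + round*16] ; round key for this round
        ZMM_AESENC_ROUND_BLOCKS_0_16 zmm0, zmm1, zmm2, zmm3, zmm30, round, \
                        no_data, no_data, no_data, no_data, 16, 9
%assign round (round + 1)
%endrep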
+
+;;; ===========================================================================
+;;; Handles AES decryption rounds
+;;; It handles special cases: the last and first rounds
+;;; Optionally, it performs XOR with data after the last AES round.
+;;; Uses NROUNDS parameter to check what needs to be done for the current round.
+;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
+%macro ZMM_AESDEC_ROUND_BLOCKS_0_16 12
+%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3
+%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7
+%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11
+%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15
+%define %%KEY %5 ; [in] zmm containing round key
+%define %%ROUND %6 ; [in] round number
+%define %%D0_3 %7 ; [in] zmm or no_data; cipher text blocks 0-3
+%define %%D4_7 %8 ; [in] zmm or no_data; cipher text blocks 4-7
+%define %%D8_11 %9 ; [in] zmm or no_data; cipher text blocks 8-11
+%define %%D12_15 %10 ; [in] zmm or no_data; cipher text blocks 12-15
+%define %%NUMBL %11 ; [in] number of blocks; numerical value
+%define %%NROUNDS %12 ; [in] number of rounds; numerical value
+
+;;; === first AES round
+%if (%%ROUND < 1)
+ ;; round 0
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; ROUND 0
+
+;;; === middle AES rounds
+%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS)
+ ;; rounds 1 to 9/11/13
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdec, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; rounds 1 to 9/11/13
+
+;;; === last AES round
+%if (%%ROUND > %%NROUNDS)
+ ;; the last round - mix declast with text xor's
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdeclast, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+
+;;; === XOR with data
+%ifnidn %%D0_3, no_data
+%ifnidn %%D4_7, no_data
+%ifnidn %%D8_11, no_data
+%ifnidn %%D12_15, no_data
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%D0_3, %%D4_7, %%D8_11, %%D12_15
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+
+%endif ; The last round
+
+%endmacro
+
+%endif ;; _AES_COMMON_ASM_
diff --git a/src/spdk/intel-ipsec-mb/include/aesni_emu.h b/src/spdk/intel-ipsec-mb/include/aesni_emu.h
new file mode 100644
index 000000000..575fada22
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/aesni_emu.h
@@ -0,0 +1,120 @@
+/*******************************************************************************
+ Copyright (c) 2018, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#ifndef _AESNI_EMU_H_
+#define _AESNI_EMU_H_
+#include <stdint.h>
+
+/* Interface to AESNI emulation routines */
+
+/* XMM type definitions and constants */
+
+#define MAX_BYTES_PER_XMM 16
+#define MAX_WORDS_PER_XMM 8
+#define MAX_DWORDS_PER_XMM 4
+#define MAX_QWORDS_PER_XMM 2
+
+union xmm_reg {
+ uint8_t byte[MAX_BYTES_PER_XMM];
+ uint16_t word[MAX_WORDS_PER_XMM];
+ uint32_t dword[MAX_DWORDS_PER_XMM];
+ uint64_t qword[MAX_QWORDS_PER_XMM];
+};
+
+/* AESNI emulation API */
+
+/**
+ * @brief AESKEYGENASSIST instruction emulation function
+ *
+ * Assist in AES round key generation using an 8-bit Round Constant
+ * (RCON) specified in \a imm8, operating on 128 bits of data
+ *
+ * @param dst pointer to 128 bit buffer to store generated key
+ * @param src pointer to 128 bit src key
+ * @param imm8 round constant used to generate key
+ */
+IMB_DLL_LOCAL void emulate_AESKEYGENASSIST(union xmm_reg *dst,
+ const union xmm_reg *src,
+ const uint32_t imm8);
+
+/**
+ * @brief AESENC instruction emulation function
+ *
+ * Perform one round of an AES encryption flow
+ *
+ * @param dst pointer to 128 bit data (state) to operate on
+ * @param src pointer to 128 bit round key
+ */
+IMB_DLL_LOCAL void emulate_AESENC(union xmm_reg *dst,
+ const union xmm_reg *src);
+
+/**
+ * @brief AESENCLAST instruction emulation function
+ *
+ * Perform last round of an AES encryption flow
+ *
+ * @param dst pointer to 128 bit data (state) to operate on
+ * @param src pointer to 128 bit round key
+ */
+IMB_DLL_LOCAL void emulate_AESENCLAST(union xmm_reg *dst,
+ const union xmm_reg *src);
+
+/**
+ * @brief AESDEC instruction emulation function
+ *
+ * Perform one round of an AES decryption flow
+ *
+ * @param dst pointer to 128 bit data (state) to operate on
+ * @param src pointer to 128 bit round key
+ */
+IMB_DLL_LOCAL void emulate_AESDEC(union xmm_reg *dst,
+ const union xmm_reg *src);
+
+/**
+ * @brief AESDECLAST instruction emulation function
+ *
+ * Perform last round of an AES decryption flow
+ *
+ * @param dst pointer to 128 bit data (state) to operate on
+ * @param src pointer to 128 bit round key
+ */
+IMB_DLL_LOCAL void emulate_AESDECLAST(union xmm_reg *dst,
+ const union xmm_reg *src);
+
+/**
+ * @brief AESIMC instruction emulation function
+ *
+ * Perform the InvMixColumn transformation on
+ * a 128 bit round key
+ *
+ * @param dst pointer to 128 bit buffer to store result
+ * @param src pointer to 128 bit round key
+ */
+IMB_DLL_LOCAL void emulate_AESIMC(union xmm_reg *dst,
+ const union xmm_reg *src);
+
+#endif /* _AESNI_EMU_H_ */
diff --git a/src/spdk/intel-ipsec-mb/include/aesni_emu.inc b/src/spdk/intel-ipsec-mb/include/aesni_emu.inc
new file mode 100644
index 000000000..5a40180c8
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/aesni_emu.inc
@@ -0,0 +1,247 @@
+;;
+;; Copyright (c) 2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef _AESNI_EMU_INC_
+%define _AESNI_EMU_INC_
+
+%include "include/reg_sizes.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Utility macros and defines to assist AESNI translation macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GP0 rax
+%define GP1 rbx
+%define GP2 rcx
+%define GP3 rdx
+%define GP4 rbp
+%define GP5 rsi
+%define GP6 rdi
+%define GP7 r8
+%define GP8 r9
+%define GP9 r10
+%define GP10 r11
+%define GP11 r12
+%define GP12 r13
+%define GP13 r14
+%define GP14 r15
+%define NUM_GP_REGS 15
+%define NUM_XMM_REGS 16
+
+%define GP_SZ 8
+%define XMM_SZ 16
+%define ARG_SZ 16
+
+;; 8 extra bytes added to align to 16 bytes
+%define XMM_OFFSET ((NUM_GP_REGS + 1) * GP_SZ)
+;; ARG1 placed on the stack after all GP and XMM registers
+%define ARG1_OFFSET (XMM_OFFSET + (NUM_XMM_REGS * XMM_SZ))
+;; ARG2 placed on the stack after all GP and XMM registers and ARG1
+%define ARG2_OFFSET (ARG1_OFFSET + ARG_SZ)
+
+%define GP(x) GP %+ x
+%define XMM(x) xmm %+ x
+
+;; Reserve enough stack space to store all GP and XMM
+;; registers and emulation function arguments
+;; e.g. void emulate_AESXXX(xmm_reg *dst, xmm_reg *src);
+%define RES_STACK_SZ (ARG2_OFFSET + ARG_SZ)
+
+;; Allocate stack space and save GP registers
+%macro SAVE_GP_REGS 0
+ push rax
+ mov rax, rsp
+ sub rsp, RES_STACK_SZ
+ and rsp, -16
+%assign gp_regs_i 0
+%rep NUM_GP_REGS
+ mov [rsp + 8*gp_regs_i], GP(gp_regs_i)
+%assign gp_regs_i gp_regs_i+1
+%endrep
+%endmacro
+
+;; Restore GP registers and stack pointer
+%macro RESTORE_GP_REGS 0
+%assign gp_regs_i 0
+%rep NUM_GP_REGS
+ mov GP(gp_regs_i), [rsp + 8*gp_regs_i]
+%assign gp_regs_i gp_regs_i+1
+%endrep
+ mov rsp, rax
+ pop rax
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Generic macro to translate AESNI instructions to AESNI emulation functions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro EMULATE_AESNI 4
+%define %%func %1
+%define %%src_dst %2
+%define %%key %3
+%define %%imm %4
+
+%ifdef LINUX
+%define %%arg1 rdi
+%define %%arg2 rsi
+%define %%arg3 rdx
+%else
+%define %%arg1 rcx
+%define %%arg2 rdx
+%define %%arg3 r8
+%endif
+
+;; Check if key is reg or ptr
+%assign IS_REG 0
+%assign x 0
+%rep NUM_XMM_REGS
+%ifidni %%key, XMM(x)
+ %assign IS_REG 1
+ %exitrep
+%endif
+%assign x x+1
+%endrep
+ ;; save GP registers to stack
+ SAVE_GP_REGS
+
+ ;; move function args onto stack before function call
+ movdqa [rsp + ARG1_OFFSET], %%src_dst
+%if IS_REG
+ movdqa [rsp + ARG2_OFFSET], %%key
+%else
+ movdqu %%src_dst, %%key
+ movdqa [rsp + ARG2_OFFSET], %%src_dst
+%endif
+ lea %%arg1, [rsp + ARG1_OFFSET]
+ lea %%arg2, [rsp + ARG2_OFFSET]
+
+ ;; move 8 bit imm rcon for aeskeygenassist
+%ifnum %%imm
+ mov BYTE(%%arg3), %%imm
+%endif
+
+;; save XMM registers to stack, as some compilers may use them in "func"
+%assign reg_idx 0
+%rep NUM_XMM_REGS
+ movdqa [rsp + XMM_OFFSET + (reg_idx * XMM_SZ)], XMM(reg_idx)
+%assign reg_idx reg_idx + 1
+%endrep
+
+;; reserve space on the stack for up to 4 arguments (Windows only)
+%ifndef LINUX
+ sub rsp, 32
+%endif
+ ;; call emulation function
+ call %%func
+%ifndef LINUX
+ add rsp, 32
+%endif
+
+;; restore XMM registers from stack
+%assign reg_idx 0
+%rep NUM_XMM_REGS
+ movdqa XMM(reg_idx), [rsp + XMM_OFFSET + (reg_idx * XMM_SZ)]
+%assign reg_idx reg_idx + 1
+%endrep
+
+ ;; Destination XMM gets overwritten with result from func
+ movdqa %%src_dst, [rsp + ARG1_OFFSET]
+
+ RESTORE_GP_REGS
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Macros to translate AESNI instructions to AESNI emulation functions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; AESENC translation macro
+%macro EMULATE_AESENC 2
+%define %%src_dst %1
+%define %%key %2
+ EMULATE_AESNI emulate_AESENC, %%src_dst, %%key, ""
+%endmacro
+
+;; AESENCLAST translation macro
+%macro EMULATE_AESENCLAST 2
+%define %%src_dst %1
+%define %%key %2
+ EMULATE_AESNI emulate_AESENCLAST, %%src_dst, %%key, ""
+%endmacro
+
+;; AESDEC translation macro
+%macro EMULATE_AESDEC 2
+%define %%src_dst %1
+%define %%key %2
+ EMULATE_AESNI emulate_AESDEC, %%src_dst, %%key, ""
+%endmacro
+
+;; AESDECLAST translation macro
+%macro EMULATE_AESDECLAST 2
+%define %%src_dst %1
+%define %%key %2
+ EMULATE_AESNI emulate_AESDECLAST, %%src_dst, %%key, ""
+%endmacro
+
+;; AESIMC translation macro
+%macro EMULATE_AESIMC 2
+%define %%src_dst %1
+%define %%key %2
+ EMULATE_AESNI emulate_AESIMC, %%src_dst, %%key, ""
+%endmacro
+
+;; AESKEYGENASSIST translation macro
+%macro EMULATE_AESKEYGENASSIST 3
+%define %%src_dst %1
+%define %%key %2
+%define %%imm %3
+ EMULATE_AESNI emulate_AESKEYGENASSIST, %%src_dst, %%key, %%imm
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; AESNI defines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef NO_AESNI_RENAME
+%define aesenc EMULATE_AESENC
+%define aesenclast EMULATE_AESENCLAST
+%define aesdec EMULATE_AESDEC
+%define aesdeclast EMULATE_AESDECLAST
+%define aesimc EMULATE_AESIMC
+%define aeskeygenassist EMULATE_AESKEYGENASSIST
+%endif
+
+extern emulate_AESENC
+extern emulate_AESENCLAST
+extern emulate_AESDEC
+extern emulate_AESDECLAST
+extern emulate_AESIMC
+extern emulate_AESKEYGENASSIST
+
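Editor's note with an illustrative expansion (not part of the original file): with the renames above in effect, plain AES-NI mnemonics written elsewhere in the library are redirected to the C emulation routines, e.g.

        ;; as written                         ;; what the preprocessor produces
        aesenc          xmm4, xmm5            ;; EMULATE_AESNI emulate_AESENC, xmm4, xmm5, ""
        aeskeygenassist xmm1, xmm3, 0x1       ;; EMULATE_AESNI emulate_AESKEYGENASSIST, xmm1, xmm3, 0x1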
+%endif ; end ifndef _AESNI_EMU_INC_
diff --git a/src/spdk/intel-ipsec-mb/include/clear_regs.asm b/src/spdk/intel-ipsec-mb/include/clear_regs.asm
new file mode 100644
index 000000000..6cb48c49e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/clear_regs.asm
@@ -0,0 +1,196 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef _CLEAR_REGS_ASM_
+%define _CLEAR_REGS_ASM_
+
+%include "include/os.asm"
+
+;
+; This macro clears any GP registers passed
+;
+%macro clear_gps 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ xor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any XMM registers passed on SSE
+;
+%macro clear_xmms_sse 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ pxor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any XMM registers passed on AVX
+;
+%macro clear_xmms_avx 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ vpxor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any YMM registers passed
+;
+%macro clear_ymms 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ vpxor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any ZMM registers passed
+;
+%macro clear_zmms 1-32
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ vpxorq %1, %1
+%rotate 1
+%endrep
+%endmacro
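Editor's illustration (hypothetical register lists): the variadic clear macros expand to one XOR per register passed, e.g.

        clear_gps   rax, r12, r13       ; xor rax, rax / xor r12, r12 / xor r13, r13
        clear_zmms  zmm7, zmm8          ; vpxorq zmm7, zmm7 / vpxorq zmm8, zmm8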
+
+;
+; This macro clears all scratch GP registers
+; for Windows or Linux
+;
+%macro clear_scratch_gps_asm 0
+ clear_gps rax, rcx, rdx, r8, r9, r10, r11
+%ifdef LINUX
+ clear_gps rdi, rsi
+%endif
+%endmacro
+
+;
+; This macro clears all scratch XMM registers on SSE
+;
+%macro clear_scratch_xmms_sse_asm 0
+%ifdef LINUX
+%assign i 0
+%rep 16
+ pxor xmm %+ i, xmm %+ i
+%assign i (i+1)
+%endrep
+; On Windows, XMM0-XMM5 registers are scratch registers
+%else
+%assign i 0
+%rep 6
+ pxor xmm %+ i, xmm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+;
+; This macro clears all scratch XMM registers on AVX
+;
+%macro clear_scratch_xmms_avx_asm 0
+%ifdef LINUX
+ vzeroall
+; On Windows, XMM0-XMM5 registers are scratch registers
+%else
+%assign i 0
+%rep 6
+ vpxor xmm %+ i, xmm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+;
+; This macro clears all scratch YMM registers
+;
+; It should be called before restoring the XMM registers
+; for Windows (XMM6-XMM15)
+;
+%macro clear_scratch_ymms_asm 0
+; On Linux, all YMM registers are scratch registers
+%ifdef LINUX
+ vzeroall
+; On Windows, YMM0-YMM5 registers are scratch registers.
+; YMM6-YMM15 upper 128 bits are scratch registers too, but
+; the lower 128 bits are to be restored after calling this macro;
+; that restore clears the upper bits as well.
+%else
+%assign i 0
+%rep 6
+ vpxor ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+;
+; This macro clears all scratch ZMM registers
+;
+; It should be called before restoring the XMM registers
+; for Windows (XMM6-XMM15). YMM registers are used
+; on purpose, since XOR'ing YMM registers is faster
+; than XOR'ing ZMM registers, and the operation also clears
+; the upper 256 bits
+;
+%macro clear_scratch_zmms_asm 0
+; On Linux, all ZMM registers are scratch registers
+%ifdef LINUX
+ vzeroall
+ ;; vzeroall only clears the first 16 ZMM registers
+%assign i 16
+%rep 16
+ vpxorq ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+; On Windows, ZMM0-ZMM5 and ZMM16-ZMM31 registers are scratch registers.
+; ZMM6-ZMM15 upper 384 bits are scratch registers too, but
+; the lower 128 bits are to be restored after calling this macro;
+; that restore clears the upper bits as well.
+%else
+%assign i 0
+%rep 6
+ vpxorq ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+
+%assign i 16
+%rep 16
+ vpxorq ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+%endif ;; _CLEAR_REGS_ASM
diff --git a/src/spdk/intel-ipsec-mb/include/clear_regs_mem.h b/src/spdk/intel-ipsec-mb/include/clear_regs_mem.h
new file mode 100644
index 000000000..40f888ec4
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/clear_regs_mem.h
@@ -0,0 +1,53 @@
+/*******************************************************************************
+ Copyright (c) 2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#ifndef CLEAR_REGS_H
+#define CLEAR_REGS_H
+
+#define CLEAR_SCRATCH_GPS clear_scratch_gps
+
+void force_memset_zero(void *mem, const size_t size);
+
+static inline void
+clear_mem(void *mem, const size_t size)
+{
+ force_memset_zero(mem, size);
+}
+
+static inline void
+clear_var(void *var, const size_t size)
+{
+ force_memset_zero(var, size);
+}
+
+void clear_scratch_gps(void);
+void clear_scratch_xmms_sse(void);
+void clear_scratch_xmms_avx(void);
+void clear_scratch_ymms(void);
+void clear_scratch_zmms(void);
+
+#endif /* CLEAR_REGS_H */
diff --git a/src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm b/src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm
new file mode 100644
index 000000000..4fd6f7edb
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm
@@ -0,0 +1,124 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/clear_regs.asm"
+
+section .text
+;
+; This function clears all scratch GP registers
+;
+; void clear_scratch_gps(void)
+MKGLOBAL(clear_scratch_gps,function,internal)
+clear_scratch_gps:
+
+ clear_scratch_gps_asm
+
+ ret
+
+;
+; This function clears all scratch XMM registers
+;
+; void clear_scratch_xmms_sse(void)
+MKGLOBAL(clear_scratch_xmms_sse,function,internal)
+clear_scratch_xmms_sse:
+
+ clear_scratch_xmms_sse_asm
+
+ ret
+
+;
+; This function clears all scratch XMM registers
+;
+; It should be called before restoring the XMM registers
+; for Windows (XMM6-XMM15)
+;
+; void clear_scratch_xmms_avx(void)
+MKGLOBAL(clear_scratch_xmms_avx,function,internal)
+clear_scratch_xmms_avx:
+
+ clear_scratch_xmms_avx_asm
+
+ ret
+
+;
+; This function clears all scratch YMM registers
+;
+; It should be called before restoring the XMM registers
+; for Windows (XMM6-XMM15)
+;
+; void clear_scratch_ymms(void)
+MKGLOBAL(clear_scratch_ymms,function,internal)
+clear_scratch_ymms:
+
+ clear_scratch_ymms_asm
+
+ ret
+
+;
+; This function clears all scratch ZMM registers
+;
+; It should be called before restoring the XMM registers
+; for Windows (XMM6-XMM15). YMM registers are used
+; on purpose, since XOR'ing YMM registers is faster
+; than XOR'ing ZMM registers, and the operation also clears
+; the upper 256 bits
+;
+; void clear_scratch_zmms(void)
+MKGLOBAL(clear_scratch_zmms,function,internal)
+clear_scratch_zmms:
+
+ clear_scratch_zmms_asm
+
+ ret
+
+;
+; This function clears all memory passed
+;
+; void force_memset_zero(void *mem, const size_t size)
+MKGLOBAL(force_memset_zero,function,internal)
+force_memset_zero:
+
+%ifdef LINUX
+ mov rcx, rsi
+%else
+ push rdi
+ mov rdi, rcx
+ mov rcx, rdx
+%endif
+ xor eax, eax
+ cld
+ rep stosb
+
+%ifndef LINUX
+ pop rdi
+%endif
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/include/const.inc b/src/spdk/intel-ipsec-mb/include/const.inc
new file mode 100644
index 000000000..e77e80d2e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/const.inc
@@ -0,0 +1,163 @@
+;;
+;; Copyright (c) 2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef _CONST_INC_
+%define _CONST_INC_
+
+;;; Tables used to insert word into a SIMD register
+extern len_shift_tab
+extern len_mask_tab
+extern shift_tab_16
+
+;;; Table to do 0x80 byte shift for padding prefix
+extern padding_0x80_tab16
+
+;;; Size of len_shift_tab defined in const.asm module
+%define len_tab_diff 128
+
+; PINSRW_COMMON insert word into 128 bit SIMD register
+%macro PINSRW_COMMON 7
+
+%define %%type %1 ; instruction type - sse or avx
+%define %%dest %2 ; dest XMM reg to insert word
+%define %%tmp_simd %3 ; XMM reg to clobber
+%define %%tmp_gp %4 ; GP reg to clobber
+%define %%idx %5 ; word index to insert value into XMM
+%define %%val %6 ; word value to insert into idx
+%define %%scale_idx %7 ; flag to set if index is to be scaled x16
+
+%ifidn %%scale_idx, scale_x16
+ shl %%idx, 4 ; scale idx up x16
+%endif
+%ifnum %%val
+ ;; immediate value passed on
+ mov DWORD(%%tmp_gp), %%val
+%ifidn %%type, sse
+ movd %%tmp_simd, DWORD(%%tmp_gp)
+%else
+ vmovd %%tmp_simd, DWORD(%%tmp_gp)
+%endif
+%else
+ ;; register name passed on
+%ifidn %%type, sse
+ movd %%tmp_simd, DWORD(%%val)
+%else
+ vmovd %%tmp_simd, DWORD(%%val)
+%endif
+%endif
+ lea %%tmp_gp, [rel len_shift_tab]
+ ;; check type - SSE or AVX
+%ifidn %%type, sse
+ pshufb %%tmp_simd, [%%tmp_gp + %%idx]
+ pand %%dest, [%%tmp_gp + len_tab_diff + %%idx]
+ por %%dest, %%tmp_simd
+%else
+ vpshufb %%tmp_simd, [%%tmp_gp + %%idx]
+ vpand %%dest, [%%tmp_gp + len_tab_diff + %%idx]
+ vpor %%dest, %%tmp_simd
+%endif
+%ifidn %%scale_idx, scale_x16
+ shr %%idx, 4 ; reset idx
+%endif
+%endmacro
+
+;;; Call SSE macro
+%define XPINSRW PINSRW_COMMON sse,
+
+;;; Call AVX macro
+%define XVPINSRW PINSRW_COMMON avx,
+
+
+;;; VPINSRW_M256 insert word into 32 byte memory range
+%macro VPINSRW_M256 8
+
+%define %%mem_addr %1 ; 16 byte aligned memory address to insert word
+%define %%tmp_simd1 %2 ; XMM reg to clobber
+%define %%tmp_simd2 %3 ; XMM reg to clobber
+%define %%tmp_gp %4 ; GP reg to clobber
+%define %%offset %5 ; GP reg used to store offset
+%define %%idx %6 ; word index to insert value
+%define %%val %7 ; word value to insert into idx
+%define %%scale_idx %8 ; flag to set if index is to be scaled x16
+
+ mov %%offset, %%idx
+ and %%offset, 0x8 ; set offset 0 or 8
+ and %%idx, 0x7 ; remove offset from idx
+ vmovdqa %%tmp_simd1, [%%mem_addr + %%offset*2]
+ XVPINSRW %%tmp_simd1, %%tmp_simd2, %%tmp_gp, %%idx, %%val, %%scale_idx
+ vmovdqa [%%mem_addr + %%offset*2], %%tmp_simd1
+ or %%idx, %%offset ; reset offset
+%endmacro
+
+;;; PSLB_COMMON shift bytes in a 128-bit SIMD register
+%macro PSLB_COMMON 6
+
+%define %%type %1 ; [in] instruction type - sse or avx
+%define %%dir %2 ; [in] shift direction - left or right
+%define %%reg %3 ; [in/out] XMM reg to shift bytes
+%define %%num %4 ; [in] GP reg containing number of bytes to shift
+%define %%shuf_tab %5 ; [out] XMM reg to store shuffle table
+%define %%tmp_gp %6 ; [clobbered] GP reg to clobber
+
+ ;; load shift table into %%shuf_tab
+ lea %%tmp_gp, [rel shift_tab_16 + 16]
+%ifidn %%dir, left
+ sub %%tmp_gp, %%num
+%else
+ add %%tmp_gp, %%num
+%endif
+
+%ifidn %%type, sse
+ movdqu %%shuf_tab, [%%tmp_gp]
+ pshufb %%reg, %%shuf_tab
+%else
+ vmovdqu %%shuf_tab, [%%tmp_gp]
+ vpshufb %%reg, %%shuf_tab
+%endif
+%endmacro
+
+;;; Call SSE left shift macro
+%macro XPSLLB 4
+ PSLB_COMMON sse, left, %1,%2,%3,%4
+%endm
+
+;;; Call SSE right shift macro
+%macro XPSRLB 4
+ PSLB_COMMON sse, right, %1,%2,%3,%4
+%endm
+
+;;; Call AVX left shift macro
+%macro XVPSLLB 4
+ PSLB_COMMON avx, left, %1,%2,%3,%4
+%endm
+
+;;; Call AVX right shift macro
+%macro XVPSRLB 4
+ PSLB_COMMON avx, right, %1,%2,%3,%4
+%endm
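Editor's sketch (hypothetical registers): shift the bytes of xmm1 left by the count held in r12, using xmm5 as the shuffle-table register and r13 as scratch.

        mov     r12, 5                  ; shift by 5 bytes
        XVPSLLB xmm1, r12, xmm5, r13    ; vpshufb with shift_tab_16 + (16 - 5)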
+
+%endif ; end ifndef _CONST_INC_
diff --git a/src/spdk/intel-ipsec-mb/include/constant_lookup.asm b/src/spdk/intel-ipsec-mb/include/constant_lookup.asm
new file mode 100644
index 000000000..a3c81dc75
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/constant_lookup.asm
@@ -0,0 +1,561 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+
+section .data
+default rel
+
+align 16
+idx_tab8:
+ db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+ db 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
+
+align 16
+add_16:
+ db 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+ db 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
+
+align 16
+idx_tab16:
+ dw 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+
+align 16
+add_8:
+ dw 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8
+
+align 16
+idx_tab32:
+ dd 0x0, 0x1, 0x2, 0x3
+
+align 16
+add_4:
+ dd 0x4, 0x4, 0x4, 0x4
+
+align 16
+idx_tab64:
+ dq 0x0, 0x1
+
+add_2:
+ dq 0x2, 0x2
+
+align 16
+bcast_mask:
+ db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01
+
+section .text
+
+%ifdef LINUX
+ %define arg1 rdi
+ %define arg2 rsi
+ %define arg3 rdx
+%else
+ %define arg1 rcx
+ %define arg2 rdx
+ %define arg3 r8
+%endif
+
+%define bcast_idx xmm0
+%define xadd xmm1
+%define accum_val xmm2
+%define xindices xmm3
+%define xtmp xmm4
+%define xtmp2 xmm5
+%define tmp r9
+%define offset r10
+
+%define table arg1
+%define idx arg2
+%define size arg3
+
+; uint8_t lookup_8bit_sse(const void *table, const uint32_t idx, const uint32_t size);
+; arg 1 : pointer to table to look up
+; arg 2 : index to look up
+; arg 3 : size of table to look up (multiple of 16 bytes)
+MKGLOBAL(lookup_8bit_sse,function,internal)
+lookup_8bit_sse:
+
+ ;; Number of loop iters = table size / 16 (number of byte values in XMM)
+ shr size, 4
+ je exit8_sse
+
+ xor offset, offset
+
+ ;; Broadcast idx to look up
+ movd bcast_idx, DWORD(idx)
+ pxor xtmp, xtmp
+ pxor accum_val, accum_val
+ pshufb bcast_idx, xtmp
+
+ movdqa xadd, [rel add_16]
+ movdqa xindices, [rel idx_tab8]
+
+loop8_sse:
+ movdqa xtmp, xindices
+
+ ;; Compare indices with idx
+ ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
+ pcmpeqb xtmp, bcast_idx
+
+ ;; Load next 16 values
+ movdqa xtmp2, [table + offset]
+
+ ;; This generates data with all 0s except the value we are looking for in the index to look up
+ pand xtmp2, xtmp
+
+ por accum_val, xtmp2
+
+ ;; Get next 16 indices
+ paddb xindices, xadd
+
+ add offset, 16
+ dec size
+
+ jne loop8_sse
+
+ ;; Extract value from XMM register
+ movdqa xtmp, accum_val
+ pslldq xtmp, 8 ; shift left by 64 bits
+ por accum_val, xtmp
+
+ movdqa xtmp, accum_val
+ pslldq xtmp, 4 ; shift left by 32 bits
+ por accum_val, xtmp
+
+ movdqa xtmp, accum_val
+ pslldq xtmp, 2 ; shift left by 16 bits
+ por accum_val, xtmp
+
+ movdqa xtmp, accum_val
+ pslldq xtmp, 1 ; shift left by 8 bits
+ por accum_val, xtmp
+
+ pextrb rax, accum_val, 15
+
+exit8_sse:
+ ret
+
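Editor's sketch of a hypothetical caller (the table symbol and index are illustrative): fetch one byte of a 256-entry, 16-byte aligned table in constant time; every 16-byte chunk is read regardless of the index, so timing does not leak it.

        lea     arg1, [rel some_sbox]   ; hypothetical 256-byte lookup table
        mov     DWORD(arg2), 0x53       ; index to look up
        mov     DWORD(arg3), 256        ; table size in bytes (multiple of 16)
        call    lookup_8bit_sse         ; looked-up byte returned in AL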
+; uint8_t lookup_8bit_avx(const void *table, const uint32_t idx, const uint32_t size);
+; arg 1 : pointer to table to look up
+; arg 2 : index to look up
+; arg 3 : size of table to look up (multiple of 16 bytes)
+MKGLOBAL(lookup_8bit_avx,function,internal)
+lookup_8bit_avx:
+ ;; Number of loop iters = table size / 16 (number of byte values in XMM)
+ shr size, 4
+ je exit8_avx
+
+ xor offset, offset
+
+ ;; Broadcast idx to look up
+ vmovd bcast_idx, DWORD(idx)
+ vpxor xtmp, xtmp
+ vpxor accum_val, accum_val
+ vpshufb bcast_idx, xtmp
+
+ vmovdqa xadd, [rel add_16]
+ vmovdqa xindices, [rel idx_tab8]
+
+loop8_avx:
+ ;; Compare indices with idx
+ ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
+ vpcmpeqb xtmp, xindices, bcast_idx
+
+ ;; Load next 16 values
+ vmovdqa xtmp2, [table + offset]
+
+ ;; This generates data with all 0s except the value we are looking for in the index to look up
+ vpand xtmp2, xtmp
+
+ vpor accum_val, xtmp2
+
+ ;; Get next 16 indices
+ vpaddb xindices, xadd
+
+ add offset, 16
+ dec size
+
+ jne loop8_avx
+
+ ;; Extract value from XMM register
+ vpslldq xtmp, accum_val, 8 ; shift left by 64 bits
+ vpor accum_val, xtmp
+
+ vpslldq xtmp, accum_val, 4 ; shift left by 32 bits
+ vpor accum_val, xtmp
+
+ vpslldq xtmp, accum_val, 2 ; shift left by 16 bits
+ vpor accum_val, xtmp
+
+ vpslldq xtmp, accum_val, 1 ; shift left by 8 bits
+ vpor accum_val, xtmp
+
+ vpextrb rax, accum_val, 15
+
+exit8_avx:
+
+ ret
+
+; uint16_t lookup_16bit_sse(const void *table, const uint32_t idx, const uint32_t size);
+; arg 1 : pointer to table to look up
+; arg 2 : index to look up
+; arg 3 : size of table to look up
+MKGLOBAL(lookup_16bit_sse,function,internal)
+lookup_16bit_sse:
+
+ ;; Number of loop iters = matrix size / 8 (number of values in XMM)
+ shr size, 3
+ je exit16_sse
+
+ xor offset, offset
+
+ ;; Broadcast idx to look up
+ movd bcast_idx, DWORD(idx)
+ movdqa xtmp, [rel bcast_mask]
+ pxor accum_val, accum_val
+ pshufb bcast_idx, xtmp
+
+ movdqa xadd, [rel add_8]
+ movdqa xindices, [rel idx_tab16]
+
+loop16_sse:
+
+ movdqa xtmp, xindices
+
+ ;; Compare indices with idx
+ ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
+ pcmpeqw xtmp, bcast_idx
+
+ ;; Load next 8 values
+ movdqa xtmp2, [table + offset]
+
+ ;; This generates data with all 0s except the value we are looking for in the index to look up
+ pand xtmp2, xtmp
+
+ por accum_val, xtmp2
+
+ ;; Get next 8 indices
+ paddw xindices, xadd
+ add offset, 16
+ dec size
+
+ jne loop16_sse
+
+ ;; Extract value from XMM register
+ movdqa xtmp, accum_val
+ pslldq xtmp, 8 ; shift left by 64 bits
+ por accum_val, xtmp
+
+ movdqa xtmp, accum_val
+ pslldq xtmp, 4 ; shift left by 32 bits
+ por accum_val, xtmp
+
+ movdqa xtmp, accum_val
+ pslldq xtmp, 2 ; shift left by 16 bits
+ por accum_val, xtmp
+
+ pextrw rax, accum_val, 7
+
+exit16_sse:
+ ret
+
+; uint16_t lookup_16bit_avx(const void *table, const uint32_t idx, const uint32_t size);
+; arg 1 : pointer to table to look up
+; arg 2 : index to look up
+; arg 3 : size of table to look up
+MKGLOBAL(lookup_16bit_avx,function,internal)
+lookup_16bit_avx:
+
+ ;; Number of loop iters = matrix size / 8 (number of values in XMM)
+ shr size, 3
+ je exit16_avx
+
+ xor offset, offset
+
+ ;; Broadcast idx to look up
+ vmovd bcast_idx, DWORD(idx)
+ vmovdqa xtmp, [rel bcast_mask]
+ vpxor accum_val, accum_val
+ vpshufb bcast_idx, xtmp
+
+ vmovdqa xadd, [rel add_8]
+ vmovdqa xindices, [rel idx_tab16]
+
+loop16_avx:
+
+ ;; Compare indices with idx
+ ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
+ vpcmpeqw xtmp, xindices, bcast_idx
+
+ ;; Load next 8 values
+ vmovdqa xtmp2, [table + offset]
+
+ ;; This generates data with all 0s except the value we are looking for in the index to look up
+ vpand xtmp2, xtmp
+
+ vpor accum_val, xtmp2
+
+ ;; Get next 8 indices
+ vpaddw xindices, xadd
+ add offset, 16
+ dec size
+
+ jne loop16_avx
+
+ ;; Extract value from XMM register
+ vpslldq xtmp, accum_val, 8 ; shift left by 64 bits
+ vpor accum_val, xtmp
+
+ vpslldq xtmp, accum_val, 4 ; shift left by 32 bits
+ vpor accum_val, xtmp
+
+ vpslldq xtmp, accum_val, 2 ; shift left by 16 bits
+ vpor accum_val, xtmp
+
+ vpextrw rax, accum_val, 7
+
+exit16_avx:
+ ret
+
+; uint32_t lookup_32bit_sse(const void *table, const uint32_t idx, const uint32_t size);
+; arg 1 : pointer to table to look up
+; arg 2 : index to look up
+; arg 3 : size of table to look up
+MKGLOBAL(lookup_32bit_sse,function,internal)
+lookup_32bit_sse:
+
+ ;; Number of loop iters = matrix size / 4 (number of values in XMM)
+ shr size, 2
+ je exit32_sse
+
+ xor offset, offset
+
+ ;; Broadcast idx to look up
+ movd bcast_idx, DWORD(idx)
+ pxor accum_val, accum_val
+ pshufd bcast_idx, bcast_idx, 0
+
+ movdqa xadd, [rel add_4]
+ movdqa xindices, [rel idx_tab32]
+
+loop32_sse:
+ movdqa xtmp, xindices
+
+ ;; Compare indices with idx
+ ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
+ pcmpeqd xtmp, bcast_idx
+
+ ;; Load next 4 values
+ movdqa xtmp2, [table + offset]
+
+ ;; This generates data with all 0s except the value we are looking for in the index to look up
+ pand xtmp2, xtmp
+
+ por accum_val, xtmp2
+
+ ;; Get next 4 indices
+ paddd xindices, xadd
+ add offset, 16
+ dec size
+
+ jne loop32_sse
+
+ ;; Extract value from XMM register
+ movdqa xtmp, accum_val
+ psrldq xtmp, 8 ; shift right by 64 bits
+ por accum_val, xtmp
+
+ movdqa xtmp, accum_val
+ psrldq xtmp, 4 ; shift right by 32 bits
+ por accum_val, xtmp
+
+ movd eax, accum_val
+
+exit32_sse:
+ ret
+
+
+; uint32_t lookup_32bit_avx(const void *table, const uint32_t idx, const uint32_t size);
+; arg 1 : pointer to table to look up
+; arg 2 : index to look up
+; arg 3 : size of table to look up
+MKGLOBAL(lookup_32bit_avx,function,internal)
+lookup_32bit_avx:
+ ;; Number of loop iters = matrix size / 4 (number of values in XMM)
+ shr size, 2
+ je exit32_avx
+
+ xor offset, offset
+
+ ;; Broadcast idx to look up
+ vmovd bcast_idx, DWORD(idx)
+ vpxor accum_val, accum_val
+ vpshufd bcast_idx, bcast_idx, 0
+
+ vmovdqa xadd, [rel add_4]
+ vmovdqa xindices, [rel idx_tab32]
+
+loop32_avx:
+ ;; Compare indices with idx
+ ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
+ vpcmpeqd xtmp, xindices, bcast_idx
+
+ ;; Load next 4 values
+ vmovdqa xtmp2, [table + offset]
+
+ ;; This generates data with all 0s except the value we are looking for in the index to look up
+ vpand xtmp2, xtmp
+
+ vpor accum_val, xtmp2
+
+ ;; Get next 4 indices
+ vpaddd xindices, xadd
+ add offset, 16
+ dec size
+
+ jne loop32_avx
+
+ ;; Extract value from XMM register
+ vpsrldq xtmp, accum_val, 8 ; shift right by 64 bits
+ vpor accum_val, xtmp
+
+ vpsrldq xtmp, accum_val, 4 ; shift right by 32 bits
+ vpor accum_val, xtmp
+
+ vmovd eax, accum_val
+
+exit32_avx:
+ ret
+
+
+; uint64_t lookup_64bit_sse(const void *table, const uint32_t idx, const uint32_t size);
+; arg 1 : pointer to table to look up
+; arg 2 : index to look up
+; arg 3 : size of table to look up
+MKGLOBAL(lookup_64bit_sse,function,internal)
+lookup_64bit_sse:
+ ;; Number of loop iters = matrix size / 2 (number of values in XMM)
+ shr size, 1
+ je exit64_sse
+
+ xor offset, offset
+
+ ;; Broadcast idx to look up
+ movq bcast_idx, idx
+ pxor accum_val, accum_val
+ pinsrq bcast_idx, idx, 1
+
+ movdqa xadd, [rel add_2]
+ movdqa xindices, [rel idx_tab64]
+
+loop64_sse:
+ movdqa xtmp, xindices
+
+ ;; Compare indices with idx
+ ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
+ pcmpeqq xtmp, bcast_idx
+
+ ;; Load next 2 values
+ movdqa xtmp2, [table + offset]
+
+ ;; This generates data with all 0s except the value we are looking for in the index to look up
+ pand xtmp2, xtmp
+
+ por accum_val, xtmp2
+
+ ;; Get next 2 indices
+ paddq xindices, xadd
+ add offset, 16
+ dec size
+
+ jne loop64_sse
+
+ ;; Extract value from XMM register
+ movdqa xtmp, accum_val
+ psrldq xtmp, 8 ; shift right by 64 bits
+ por accum_val, xtmp
+
+ movq rax, accum_val
+
+exit64_sse:
+ ret
+
+
+; uint64_t lookup_64bit_avx(const void *table, const uint32_t idx, const uint32_t size);
+; arg 1 : pointer to table to look up
+; arg 2 : index to look up
+; arg 3 : size of table to look up
+MKGLOBAL(lookup_64bit_avx,function,internal)
+lookup_64bit_avx:
+ ;; Number of loop iters = matrix size / 2 (number of values in XMM)
+ shr size, 1
+ je exit64_avx
+
+ xor offset, offset
+
+ vmovq bcast_idx, idx
+ vpxor accum_val, accum_val
+ vpinsrq bcast_idx, idx, 1
+
+ vmovdqa xadd, [rel add_2]
+ vmovdqa xindices, [rel idx_tab64]
+
+loop64_avx:
+ ;; Compare indices with idx
+ ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
+ vpcmpeqq xtmp, xindices, bcast_idx
+
+ ;; Load next 2 values
+ vmovdqa xtmp2, [table + offset]
+
+ ;; This generates data with all 0s except the value we are looking for in the index to look up
+ vpand xtmp2, xtmp
+
+ vpor accum_val, xtmp2
+
+ ;; Get next 2 indices
+ vpaddq xindices, xadd
+ add offset, 16
+ dec size
+
+ jne loop64_avx
+
+ ;; Extract value from XMM register
+ vpsrldq xtmp, accum_val, 8 ; shift right by 64 bits
+ vpor accum_val, xtmp
+
+ vmovq rax, accum_val
+
+exit64_avx:
+ ret
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/include/constant_lookup.h b/src/spdk/intel-ipsec-mb/include/constant_lookup.h
new file mode 100644
index 000000000..bd56a24d2
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/constant_lookup.h
@@ -0,0 +1,173 @@
+/*******************************************************************************
+ Copyright (c) 2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#ifndef CONSTANT_LOOKUP_H
+#define CONSTANT_LOOKUP_H
+
+#include "intel-ipsec-mb.h"
+
+#ifdef SAFE_LOOKUP
+#define LOOKUP8_SSE(_table, _idx, _size) \
+ lookup_8bit_sse(_table, _idx, _size)
+#define LOOKUP8_AVX(_table, _idx, _size) \
+ lookup_8bit_avx(_table, _idx, _size)
+#define LOOKUP16_SSE(_table, _idx, _size) \
+ lookup_16bit_sse(_table, _idx, _size)
+#define LOOKUP16_AVX(_table, _idx, _size) \
+ lookup_16bit_avx(_table, _idx, _size)
+#define LOOKUP32_SSE(_table, _idx, _size) \
+ lookup_32bit_sse(_table, _idx, _size)
+#define LOOKUP32_AVX(_table, _idx, _size) \
+ lookup_32bit_avx(_table, _idx, _size)
+#define LOOKUP64_SSE(_table, _idx, _size) \
+ lookup_64bit_sse(_table, _idx, _size)
+#define LOOKUP64_AVX(_table, _idx, _size) \
+ lookup_64bit_avx(_table, _idx, _size)
+#else
+#define LOOKUP8_SSE(_table, _idx, _size) \
+ _table[_idx]
+#define LOOKUP8_AVX(_table, _idx, _size) \
+ _table[_idx]
+#define LOOKUP16_SSE(_table, _idx, _size) \
+ _table[_idx]
+#define LOOKUP16_AVX(_table, _idx, _size) \
+ _table[_idx]
+#define LOOKUP32_SSE(_table, _idx, _size) \
+ _table[_idx]
+#define LOOKUP32_AVX(_table, _idx, _size) \
+ _table[_idx]
+#define LOOKUP64_SSE(_table, _idx, _size) \
+ _table[_idx]
+#define LOOKUP64_AVX(_table, _idx, _size) \
+ _table[_idx]
+#endif
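+
+/*
+ * Illustrative sketch only (not part of the library): the SAFE_LOOKUP
+ * path implemented by the SIMD routines declared below is, conceptually,
+ * a scan of the whole table where the matching element is selected with
+ * a mask instead of a data-dependent memory access:
+ *
+ *   static uint32_t ct_lookup_32bit(const uint32_t *table,
+ *                                   const uint32_t idx,
+ *                                   const uint32_t size)
+ *   {
+ *           uint32_t i, out = 0;
+ *
+ *           for (i = 0; i < size; i++) {
+ *                   const uint32_t mask =
+ *                           (uint32_t) 0 - (uint32_t) (i == idx);
+ *
+ *                   out |= table[i] & mask;
+ *           }
+ *           return out;
+ *   }
+ *
+ * The assembly versions do the same with 16-byte vectors, which is why
+ * the tables must be 16-byte aligned and sized as documented below.
+ */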
+
+/*
+ * @brief Constant time SSE lookup function on variable size table
+ * with 8-bit values
+ *
+ * @param[in] table Pointer to the table to look up (16-byte aligned)
+ * @param[in] idx Index to look up
+ * @param[in] size Number of 8 bit elements in the table (multiple of 16)
+ *
+ * @return value to lookup
+ */
+uint8_t
+lookup_8bit_sse(const void *table, const uint32_t idx, const uint32_t size);
+
+/*
+ * @brief Constant time AVX lookup function on variable size table
+ * with 8-bit values
+ *
+ * @param[in] table Pointer to the table to look up (16-byte aligned)
+ * @param[in] idx Index to look up
+ * @param[in] size Number of 8 bit elements in the table (multiple of 16)
+ *
+ * @return value to lookup
+ */
+uint8_t
+lookup_8bit_avx(const void *table, const uint32_t idx, const uint32_t size);
+
+/*
+ * @brief Constant time SSE lookup function on variable size table
+ * with 16-bit values
+ *
+ * @param[in] table Pointer to the table to look up (16-byte aligned)
+ * @param[in] idx Index to look up
+ * @param[in] size Number of 16 bit elements in the table (multiple of 8)
+ *
+ * @return value to lookup
+ */
+uint16_t
+lookup_16bit_sse(const void *table, const uint32_t idx, const uint32_t size);
+
+/*
+ * @brief Constant time AVX lookup function on variable size table
+ * with 16-bit values
+ *
+ * @param[in] table Pointer to the table to look up (16-byte aligned)
+ * @param[in] idx Index to look up
+ * @param[in] size Number of 16 bit elements in the table (multiple of 8)
+ *
+ * @return value to lookup
+ */
+uint16_t
+lookup_16bit_avx(const void *table, const uint32_t idx, const uint32_t size);
+
+/*
+ * @brief Constant time SSE lookup function on
+ * variable size table with 32-bit values
+ *
+ * @param[in] table Pointer to the table to look up (16-byte aligned)
+ * @param[in] idx Index to look up
+ * @param[in] size Number of 32 bit elements in the table (multiple of 4)
+ *
+ * @return value to lookup
+ */
+uint32_t
+lookup_32bit_sse(const void *table, const uint32_t idx, const uint32_t size);
+
+/*
+ * @brief Constant time AVX lookup function on
+ * variable size table with 32-bit values
+ *
+ * @param[in] table Pointer to the table to look up (16-byte aligned)
+ * @param[in] idx Index to look up
+ * @param[in] size Number of 32 bit elements in the table (multiple of 4)
+ *
+ * @return value to lookup
+ */
+uint32_t
+lookup_32bit_avx(const void *table, const uint32_t idx, const uint32_t size);
+
+/*
+ * @brief Constant time SSE lookup function on
+ * variable size table with 64-bit values
+ *
+ * @param[in] table Pointer to the table to look up (16-byte aligned)
+ * @param[in] idx Index to look up
+ * @param[in] size Number of 64 bit elements in the table (multiple of 2)
+ *
+ * @return value to lookup
+ */
+uint64_t
+lookup_64bit_sse(const void *table, const uint32_t idx, const uint32_t size);
+
+/*
+ * @brief Constant time AVX lookup function on
+ * variable size table with 64-bit values
+ *
+ * @param[in] table Pointer to the table to look up (16-byte aligned)
+ * @param[in] idx Index to look up
+ * @param[in] size Number of 64 bit elements in the table (multiple of 2)
+ *
+ * @return value to lookup
+ */
+uint64_t
+lookup_64bit_avx(const void *table, const uint32_t idx, const uint32_t size);
+
+#endif /* CONSTANT_LOOKUP_H */
diff --git a/src/spdk/intel-ipsec-mb/include/cpu_feature.h b/src/spdk/intel-ipsec-mb/include/cpu_feature.h
new file mode 100644
index 000000000..1347094a7
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/cpu_feature.h
@@ -0,0 +1,52 @@
+/*******************************************************************************
+ Copyright (c) 2018, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "intel-ipsec-mb.h"
+
+#ifndef CPU_FEATURE_H
+#define CPU_FEATURE_H
+
+/**
+ * @brief Detects hardware features and returns their status
+ *
+ * @return Bitmask representing presence of CPU features/extensions,
+ * see intel-ipsec-mb.h IMB_FEATURE_xyz definitions for details.
+ */
+IMB_DLL_LOCAL uint64_t cpu_feature_detect(void);
+
+/**
+ * @brief Modifies CPU \a features mask based on requested \a flags
+ *
+ * @param flags bitmask describing CPU feature adjustments
+ * @param features bitmask describing present CPU features
+ *
+ * @return \a features with applied modifications on them via \a flags
+ */
+IMB_DLL_LOCAL uint64_t
+cpu_feature_adjust(const uint64_t flags, uint64_t features);
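+
+/*
+ * Illustrative sequence only; the concrete flag and feature bit values
+ * come from intel-ipsec-mb.h (see the IMB_FEATURE_xyz definitions):
+ *
+ *   uint64_t features = cpu_feature_detect();
+ *
+ *   // optionally mask features on/off according to caller-supplied flags
+ *   features = cpu_feature_adjust(flags, features);
+ */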
+
+#endif /* CPU_FEATURE_H */
diff --git a/src/spdk/intel-ipsec-mb/include/datastruct.asm b/src/spdk/intel-ipsec-mb/include/datastruct.asm
new file mode 100644
index 000000000..0ab1113ab
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/datastruct.asm
@@ -0,0 +1,235 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; Macros for defining data structures
+
+; Usage example
+
+;START_FIELDS ; JOB_AES
+;;; name size align
+;FIELD _plaintext, 8, 8 ; pointer to plaintext
+;FIELD _ciphertext, 8, 8 ; pointer to ciphertext
+;FIELD _IV, 16, 8 ; IV
+;FIELD _keys, 8, 8 ; pointer to keys
+;FIELD _len, 4, 4 ; length in bytes
+;FIELD _status, 4, 4 ; status enumeration
+;FIELD _user_data, 8, 8 ; pointer to user data
+;UNION _union, size1, align1, \
+; size2, align2, \
+; size3, align3, \
+; ...
+;END_FIELDS
+;%assign _JOB_AES_size _FIELD_OFFSET
+;%assign _JOB_AES_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Alternate "struc-like" syntax:
+; STRUCT job_aes2
+; RES_Q .plaintext, 1
+; RES_Q .ciphertext, 1
+; RES_DQ .IV, 1
+; RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
+; RES_U .union, size1, align1, \
+; size2, align2, \
+; ...
+; ENDSTRUCT
+; ; Following only needed if nesting
+; %assign job_aes2_size _FIELD_OFFSET
+; %assign job_aes2_align _STRUCT_ALIGN
+;
+; RES_* macros take a name, a count and an optional alignment.
+; The count in in terms of the base size of the macro, and the
+; default alignment is the base size.
+; The macros are:
+; Macro Base size
+; RES_B 1
+; RES_W 2
+; RES_D 4
+; RES_Q 8
+; RES_DQ 16
+; RES_Y 32
+; RES_Z 64
+;
+; RES_U defines a union. Its arguments are a name and two or more
+; pairs of "size, alignment"
+;
+; The two assigns are only needed if this structure is being nested
+; within another. Even if the assigns are not done, one can still use
+; STRUCT_NAME_size as the size of the structure.
+;
+; Note that for nesting, you still need to assign to STRUCT_NAME_size.
+;
+; The differences between this and using "struc" directly are that each
+; type is implicitly aligned to its natural length (although this can be
+; overridden with an explicit third parameter), and that the structure
+; is padded at the end to its overall alignment.
+;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _DATASTRUCT_ASM_
+%define _DATASTRUCT_ASM_
+
+;; START_FIELDS
+%macro START_FIELDS 0
+%assign _FIELD_OFFSET 0
+%assign _STRUCT_ALIGN 0
+%endm
+
+;; FIELD name size align
+%macro FIELD 3
+%define %%name %1
+%define %%size %2
+%define %%align %3
+
+%assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1))
+%%name equ _FIELD_OFFSET
+%assign _FIELD_OFFSET _FIELD_OFFSET + (%%size)
+%if (%%align > _STRUCT_ALIGN)
+%assign _STRUCT_ALIGN %%align
+%endif
+%endm
+
+;; END_FIELDS
+%macro END_FIELDS 0
+%assign _FIELD_OFFSET (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+%endm
+
+%macro UNION 5-*
+%if (0 == (%0 & 1))
+ %error EVEN number of parameters to UNION Macro
+ %err
+%endif
+%rotate 1
+ %assign _UNION_SIZE %1
+ %assign _UNION_ALIGN %2
+%rep (%0 - 3)/2
+ %rotate 2
+ %if (%1 > _UNION_SIZE)
+ %assign _UNION_SIZE %1
+ %endif
+ %if (%2 > _UNION_ALIGN)
+ %assign _UNION_ALIGN %2
+ %endif
+%endrep
+%rotate 2
+FIELD %1, _UNION_SIZE, _UNION_ALIGN
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro STRUCT 1
+START_FIELDS
+struc %1
+%endm
+
+%macro ENDSTRUCT 0
+%assign %%tmp _FIELD_OFFSET
+END_FIELDS
+%assign %%tmp (_FIELD_OFFSET - %%tmp)
+%if (%%tmp > 0)
+ resb %%tmp
+%endif
+endstruc
+%endm
+
+;; RES_int name size align
+%macro RES_int 3
+%define %%name %1
+%define %%size %2
+%define %%align %3
+
+%assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1))
+align %%align
+%%name resb %%size
+%assign _FIELD_OFFSET _FIELD_OFFSET + (%%size)
+%if (%%align > _STRUCT_ALIGN)
+%assign _STRUCT_ALIGN %%align
+%endif
+%endm
+
+
+
+; macro RES_B name, size [, align]
+%macro RES_B 2-3 1
+RES_int %1, %2, %3
+%endm
+
+; macro RES_W name, size [, align]
+%macro RES_W 2-3 2
+RES_int %1, 2*(%2), %3
+%endm
+
+; macro RES_D name, size [, align]
+%macro RES_D 2-3 4
+RES_int %1, 4*(%2), %3
+%endm
+
+; macro RES_Q name, size [, align]
+%macro RES_Q 2-3 8
+RES_int %1, 8*(%2), %3
+%endm
+
+; macro RES_DQ name, size [, align]
+%macro RES_DQ 2-3 16
+RES_int %1, 16*(%2), %3
+%endm
+
+; macro RES_Y name, size [, align]
+%macro RES_Y 2-3 32
+RES_int %1, 32*(%2), %3
+%endm
+
+; macro RES_Z name, size [, align]
+%macro RES_Z 2-3 64
+RES_int %1, 64*(%2), %3
+%endm
+
+
+%macro RES_U 5-*
+%if (0 == (%0 & 1))
+ %error EVEN number of parameters to RES_U Macro
+ %err
+%endif
+%rotate 1
+ %assign _UNION_SIZE %1
+ %assign _UNION_ALIGN %2
+%rep (%0 - 3)/2
+ %rotate 2
+ %if (%1 > _UNION_SIZE)
+ %assign _UNION_SIZE %1
+ %endif
+ %if (%2 > _UNION_ALIGN)
+ %assign _UNION_ALIGN %2
+ %endif
+%endrep
+%rotate 2
+RES_int %1, _UNION_SIZE, _UNION_ALIGN
+%endm
+
+%endif ; end ifdef _DATASTRUCT_ASM_
diff --git a/src/spdk/intel-ipsec-mb/include/dbgprint.asm b/src/spdk/intel-ipsec-mb/include/dbgprint.asm
new file mode 100644
index 000000000..d14eb0ebc
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/dbgprint.asm
@@ -0,0 +1,413 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; Macros for "printing" for debug purposes from within asm code
+;
+; The basic macros are:
+; DBGPRINT16, DBGPRINT32, DBGPRINT64, DBGPRINT_XMM, DBGPRINT_YMM, DBGPRINT_ZMM
+; These are called with 1 or more arguments, all of which are of the
+; size/type as specified in the name. E.g.
+; DBGPRINT64 reg1, reg2, reg3, ...
+;
+; There is also a macro DBGPRINTL that takes one argument, a string. E.g.
+; DBGPRINTL "hit this point in the code"
+;
+; There are also variations on these with the "DBGPRINT" suffixed with "L", e.g.
+; DBGPRINTL64. These take two or more arguments, where the first is a string,
+; and the rest are of the specified type, e.g.
+; DBGPRINTL64 "Rindex", Rindex
+; Essentially, this is the same as a DBGPRINTL followed by DBGPRINT64.
+;
+; If DO_DBGPRINT is defined, then the macros write the debug information into
+; a buffer. If DO_DBGPRINT is *not* defined, then the macros expand to nothing.
+;
+; CAVEAT: The macros need a GPR. Currently, they use R15. If the first register
+; argument is R15, then R14 is used instead. This means that if you try
+; DBGPRINTL64 "text", rax, r15
+; you will not get the proper value of r15.
+; One way to avoid this issue is to not use multiple registers on the same line
+; if the register types are GPR (i.e. this is not an issue for printing XMM
+; registers). E.g. the above could be done with:
+; DBGPRINTL64 "text", rax
+; DBGPRINT64 r15
+;
+; Note also that the macros only check for r15. Thus if you tried something
+; like (after token expansion):
+; DBGPRINT32 r15d
+; you won't get the right results. If you want to display r15d, you should
+; print it as the 64-bit r15.
+;
+; To actually print the data, from your C code include the file
+; "dbgprint.h". The default buffer size is 16kB. If you want to change
+; that, #define DBG_BUFFER_SIZE before including "dbgprint.h".
+;
+; Then, after your asm routine(s) have returned, call
+; print_debug() or print_debug(file pointer)
+; If you do not specify a file pointer, it defaults to stdout.
+;
+; Printing the debug data also resets the write pointer to the beginning,
+; effectively "deleting" the previous messages.
+;
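+; Illustrative usage only (the routine and values are hypothetical):
+;
+;      ; assembly side, built with DO_DBGPRINT defined:
+;      DBGPRINTL   "entering routine"
+;      DBGPRINTL64 "length", rax
+;      DBGPRINT_XMM xmm0
+;
+;      /* C side, after the asm routine returns (see dbgprint.h): */
+;      print_debug();          /* defaults to stdout, as described above */
+;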
+%ifndef DBGPRINT_ASM_INCLUDED
+%define DBGPRINT_ASM_INCLUDED
+
+;%define DO_DBGPRINT
+%ifdef DO_DBGPRINT
+extern pDebugBuffer
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; DBGPRINT_INT size, param, ...
+%macro DBGPRINT_INT 2-*
+%ifidni %2,r15
+%xdefine %%reg r14
+%else
+%xdefine %%reg r15
+%endif
+%xdefine %%size %1
+%rotate 1
+ push %%reg
+ mov %%reg, [pDebugBuffer]
+%rep %0 - 1
+ mov byte [%%reg], %%size
+ %if (%%size == 2)
+ mov word [%%reg+1], %1
+ %elif (%%size == 4)
+ mov dword [%%reg+1], %1
+ %elif (%%size == 8)
+ mov qword [%%reg+1], %1
+ %elif (%%size == 16)
+ movdqu oword [%%reg+1], %1
+ %elif (%%size == 32)
+ vmovdqu [%%reg+1], %1
+ %elif (%%size == 64)
+ vmovdqu32 [%%reg+1], %1
+ %else
+ %error invalid size %%size
+ %endif
+ add %%reg, %%size+1
+%rotate 1
+%endrep
+ mov [pDebugBuffer], %%reg
+ pop %%reg
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; DBGPRINTL_INT size, label, param, ...
+%macro DBGPRINTL_INT 3-*
+%ifidni %3,r15
+%xdefine %%reg r14
+%else
+%xdefine %%reg r15
+%endif
+%xdefine %%size %1
+%rotate 1
+ push %%reg
+ mov %%reg, [pDebugBuffer]
+
+ mov byte [%%reg], 0x57
+section .data
+%%lab: db %1, 0
+section .text
+ mov qword [%%reg+1], %%lab
+ add %%reg, 8+1
+%rotate 1
+
+%rep %0 - 2
+ mov byte [%%reg], %%size
+%if (%%size == 2)
+ mov word [%%reg+1], %1
+%elif (%%size == 4)
+ mov dword [%%reg+1], %1
+%elif (%%size == 8)
+ mov qword [%%reg+1], %1
+%elif (%%size == 16)
+ movdqu oword [%%reg+1], %1
+%elif (%%size == 32)
+ vmovdqu [%%reg+1], %1
+%elif (%%size == 64)
+ vmovdqu32 [%%reg+1], %1
+%else
+%error invalid size %%size
+%endif
+ add %%reg, %%size+1
+%rotate 1
+%endrep
+ mov [pDebugBuffer], %%reg
+ pop %%reg
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; DBGPRINTL* data, ...
+%macro DBGPRINT16 1+
+ DBGPRINT_INT 2, %1
+%endmacro
+%macro DBGPRINT32 1+
+ DBGPRINT_INT 4, %1
+%endmacro
+%macro DBGPRINT64 1+
+ DBGPRINT_INT 8, %1
+%endmacro
+%macro DBGPRINT_XMM 1+
+ DBGPRINT_INT 16, %1
+%endmacro
+%macro DBGPRINT_YMM 1+
+ DBGPRINT_INT 32, %1
+%endmacro
+%macro DBGPRINT_ZMM 1+
+ DBGPRINT_INT 64, %1
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; DBGPRINTL* label, data, ...
+%macro DBGPRINTL16 2+
+ DBGPRINTL_INT 2, %1, %2
+%endmacro
+%macro DBGPRINTL32 2+
+ DBGPRINTL_INT 4, %1, %2
+%endmacro
+%macro DBGPRINTL64 2+
+ DBGPRINTL_INT 8, %1, %2
+%endmacro
+%macro DBGPRINTL_XMM 2+
+ DBGPRINTL_INT 16, %1, %2
+%endmacro
+%macro DBGPRINTL_YMM 2+
+ DBGPRINTL_INT 32, %1, %2
+%endmacro
+%macro DBGPRINTL_ZMM 2+
+ DBGPRINTL_INT 64, %1, %2
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGPRINTL 1
+ push r15
+ mov r15, [pDebugBuffer]
+
+ mov byte [r15], 0x57
+section .data
+%%lab: db %1, 0
+section .text
+ mov qword [r15+1], %%lab
+ add r15, 8+1
+
+ mov [pDebugBuffer], r15
+ pop r15
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+%macro DBGPRINT16 1+
+%endmacro
+%macro DBGPRINT32 1+
+%endmacro
+%macro DBGPRINT64 1+
+%endmacro
+%macro DBGPRINT_XMM 1+
+%endmacro
+%macro DBGPRINT_YMM 1+
+%endmacro
+%macro DBGPRINT_ZMM 1+
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGPRINTL16 2+
+%endmacro
+%macro DBGPRINTL32 2+
+%endmacro
+%macro DBGPRINTL64 2+
+%endmacro
+%macro DBGPRINTL_XMM 2+
+%endmacro
+%macro DBGPRINTL_YMM 2+
+%endmacro
+%macro DBGPRINTL_ZMM 2+
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGPRINTL 1
+%endmacro
+%endif
+
+
+
+%if 0 ; OLD
+%macro DBGPRINTL_ZMM 2-*
+ push rax
+ mov rax, [pDebugBuffer]
+
+ mov byte [rax], 0x57
+section .data
+%%lab: db %1, 0
+section .text
+ mov qword [rax+1], %%lab
+ add rax, 8+1
+%rotate 1
+
+%rep %0 - 1
+ mov byte [rax], 64
+ vmovdqu32 [rax+1], %1
+%rotate 1
+ add rax, 64+1
+%endrep
+ mov [pDebugBuffer], rax
+ pop rax
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGPRINT_ZMM 1-*
+ push rax
+ mov rax, [pDebugBuffer]
+%rep %0
+ mov byte [rax], 64
+ vmovdqu32 [rax+1], %1
+%rotate 1
+ add rax, 64+1
+%endrep
+ mov [pDebugBuffer], rax
+ pop rax
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGPRINT_YMM 1-*
+ push rax
+ mov rax, [pDebugBuffer]
+%rep %0
+ mov byte [rax], 32
+ vmovdqu [rax+1], %1
+%rotate 1
+ add rax, 32+1
+%endrep
+ mov [pDebugBuffer], rax
+ pop rax
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGPRINT_XMM 1-*
+ push rax
+ mov rax, [pDebugBuffer]
+%rep %0
+ mov byte [rax], 16
+ vmovdqu oword [rax+1], %1
+%rotate 1
+ add rax, 16+1
+%endrep
+ mov [pDebugBuffer], rax
+ pop rax
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGPRINTL64 2-*
+ push rax
+ mov rax, [pDebugBuffer]
+
+ mov byte [rax], 0x57
+section .data
+%%lab: db %1, 0
+section .text
+ mov qword [rax+1], %%lab
+ add rax, 8+1
+%rotate 1
+
+%rep %0 - 1
+ mov byte [rax], 8
+ mov qword [rax+1], %1
+%rotate 1
+ add rax, 8+1
+%endrep
+ mov [pDebugBuffer], rax
+ pop rax
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGPRINT64 1-*
+ push rax
+ mov rax, [pDebugBuffer]
+%rep %0
+ mov byte [rax], 8
+ mov qword [rax+1], %1
+%rotate 1
+ add rax, 8+1
+%endrep
+ mov [pDebugBuffer], rax
+ pop rax
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGPRINT32 1-*
+ push rax
+ mov rax, [pDebugBuffer]
+%rep %0
+ mov byte [rax], 4
+ mov dword [rax+1], %1
+%rotate 1
+ add rax, 4+1
+%endrep
+ mov [pDebugBuffer], rax
+ pop rax
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGPRINT16 1-*
+ push rax
+ mov rax, [pDebugBuffer]
+%rep %0
+ mov byte [rax], 2
+ mov word [rax+1], %1
+%rotate 1
+ add rax, 2+1
+%endrep
+ mov [pDebugBuffer], rax
+ pop rax
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGPRINT_LAB 1
+ push rax
+ mov rax, [pDebugBuffer]
+
+ mov byte [rax], 0x57
+section .data
+%%lab: db %1, 0
+section .text
+ mov qword [rax+1], %%lab
+ add rax, 8+1
+
+ mov [pDebugBuffer], rax
+ pop rax
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro DBGHIST 2
+ inc dword [%1 + 4 * %2]
+%endmacro
+%macro DBGPRINT_ZMM 1-*
+%endmacro
+%macro DBGPRINT_YMM 1-*
+%endmacro
+%macro DBGPRINT_XMM 1-*
+%endmacro
+%macro DBGPRINT64 1-*
+%endmacro
+%macro DBGPRINT32 1-*
+%endmacro
+%macro DBGPRINT16 1-*
+%endmacro
+%macro DBGHIST 2
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif ; %if 0 ; OLD
+
+%endif ; DBGPRINT_ASM_INCLUDED
diff --git a/src/spdk/intel-ipsec-mb/include/des_utils.h b/src/spdk/intel-ipsec-mb/include/des_utils.h
new file mode 100644
index 000000000..4358132d0
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/des_utils.h
@@ -0,0 +1,134 @@
+/*******************************************************************************
+ Copyright (c) 2017-2018, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/* DES utility functions and macros */
+
+#ifndef DES_UTILS_H
+#define DES_UTILS_H
+
+#include <stdint.h>
+#include "intel-ipsec-mb.h"
+
+/**
+ * @brief Gets selected bit value out of a 64-bit word
+ *
+ * @param val 64-bit word
+ * @param n bit number (0 to 63) to get value of
+ *
+ * @return n-th bit value (0 or 1 value only)
+ */
+__forceinline
+uint64_t bit_get64b(const uint64_t val, const unsigned n)
+{
+ IMB_ASSERT(n < 64);
+ return (val >> n) & UINT64_C(1);
+}
+
+/**
+ * @brief Sets selected bit in a 64-bit word
+ *
+ * @param val 64-bit word
+ * @param n bit number (0 to 63) to set value of
+ * @param b bit value (0 or 1)
+ *
+ * @return val with n-th bit set to value b
+ */
+__forceinline
+uint64_t bit_set64b(const uint64_t val, const unsigned n, const uint64_t b)
+{
+ const uint64_t m = UINT64_C(1) << n;
+
+ IMB_ASSERT(n < 64);
+ return (val & (~m)) | (b << n);
+}
+
+/**
+ * @brief Permutes bits in a 64-bit word as described by pattern
+ *
+ * The function goes through the pattern array from index 0 to 'size' - 1 (max 63).
+ * It sets output bit number 'index' to value of
+ * bit number 'pattern[index] - 1' from 'in'.
+ *
+ * @param in 64-bit word to be permuted
+ * @param pattern pointer to array defining the permutation
+ * @param size size of the permutation pattern
+ *
+ * @return permuted in word as described by the pattern
+ */
+__forceinline
+uint64_t permute_64b(const uint64_t in, const uint8_t *pattern, const int size)
+{
+ uint64_t out = 0;
+ int n = 0;
+
+ IMB_ASSERT(size <= 64);
+
+ for (n = 0; n < size; n++) {
+ /* '-1' is required as bit numbers in FIPS start with 1 not 0 */
+ const int m = ((int) pattern[n]) - 1;
+ const uint64_t bit_val = bit_get64b(in, m);
+
+ out = bit_set64b(out, n, bit_val);
+ }
+
+ return out;
+}
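+
+/*
+ * Illustrative only: with a 4-entry pattern the permutation reverses the
+ * low nibble (pattern values are 1-based bit numbers, as noted above):
+ *
+ *   static const uint8_t rev4[] = { 4, 3, 2, 1 };
+ *   const uint64_t r = permute_64b(0x3, rev4, 4);   // r == 0xC
+ */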
+
+static const uint8_t reflect_tab[16] = {
+ /* [ 0] 0000 => 0000 */ 0, /* [ 1] 0001 => 1000 */ 8,
+ /* [ 2] 0010 => 0100 */ 4, /* [ 3] 0011 => 1100 */ 12,
+ /* [ 4] 0100 => 0010 */ 2, /* [ 5] 0101 => 1010 */ 10,
+ /* [ 6] 0110 => 0110 */ 6, /* [ 7] 0111 => 1110 */ 14,
+ /* [ 8] 1000 => 0001 */ 1, /* [ 9] 1001 => 1001 */ 9,
+ /* [10] 1010 => 0101 */ 5, /* [11] 1011 => 1101 */ 13,
+ /* [12] 1100 => 0011 */ 3, /* [13] 1101 => 1011 */ 11,
+ /* [14] 1110 => 0111 */ 7, /* [15] 1111 => 1111 */ 15
+};
+
+__forceinline
+uint8_t reflect_8b(const uint8_t pb)
+{
+ return reflect_tab[pb >> 4] | (reflect_tab[pb & 15] << 4);
+}
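+
+/* e.g. reflect_8b(0x01) == 0x80 and reflect_8b(0x0F) == 0xF0 (bit order reversed) */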
+
+__forceinline
+uint64_t load64_reflect(const void *key)
+{
+ const uint8_t *kb = (const uint8_t *) key;
+
+ return ((uint64_t) reflect_8b(kb[0])) |
+ ((uint64_t) reflect_8b(kb[1])) << 8 |
+ ((uint64_t) reflect_8b(kb[2])) << 16 |
+ ((uint64_t) reflect_8b(kb[3])) << 24 |
+ ((uint64_t) reflect_8b(kb[4])) << 32 |
+ ((uint64_t) reflect_8b(kb[5])) << 40 |
+ ((uint64_t) reflect_8b(kb[6])) << 48 |
+ ((uint64_t) reflect_8b(kb[7])) << 56;
+}
+
+
+#endif /* DES_UTILS_H */
diff --git a/src/spdk/intel-ipsec-mb/include/gcm.h b/src/spdk/intel-ipsec-mb/include/gcm.h
new file mode 100644
index 000000000..bcc13cb3a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/gcm.h
@@ -0,0 +1,428 @@
+/*******************************************************************************
+ Copyright (c) 2018-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "intel-ipsec-mb.h"
+
+#ifndef NO_GCM
+
+#ifndef _GCM_H_
+#define _GCM_H_
+
+/*
+ * AVX512+VAES+VPCLMULQDQ GCM API
+ * - intentionally this is not exposed in intel-ipsec-mb.h
+ * - available through IMB_GCM_xxx() macros from intel-ipsec-mb.h
+ */
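+
+/*
+ * Illustrative call sequence only (buffers, lengths and key expansion
+ * input are application-specific; the same pattern applies to the
+ * avx512 and sse_no_aesni variants declared further down):
+ *
+ *   struct gcm_key_data key;
+ *   struct gcm_context_data ctx;
+ *
+ *   aes_gcm_pre_128_vaes_avx512(aes_key, &key);
+ *   aes_gcm_init_128_vaes_avx512(&key, &ctx, iv, aad, aad_len);
+ *   aes_gcm_enc_128_update_vaes_avx512(&key, &ctx, out, in, len);
+ *   aes_gcm_enc_128_finalize_vaes_avx512(&key, &ctx, tag, tag_len);
+ *
+ * or, equivalently, the single-shot aes_gcm_enc_128_vaes_avx512() below.
+ */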
+IMB_DLL_EXPORT void
+aes_gcm_enc_128_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_192_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_256_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_128_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_192_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_256_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+
+IMB_DLL_EXPORT void
+aes_gcm_init_128_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len);
+IMB_DLL_EXPORT void
+aes_gcm_init_192_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len);
+IMB_DLL_EXPORT void
+aes_gcm_init_256_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_128_update_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_192_update_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_256_update_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_128_update_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_192_update_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_256_update_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_128_finalize_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_192_finalize_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_256_finalize_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_128_finalize_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_192_finalize_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_256_finalize_vaes_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_precomp_128_vaes_avx512(struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_precomp_192_vaes_avx512(struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_precomp_256_vaes_avx512(struct gcm_key_data *key_data);
+
+IMB_DLL_EXPORT void
+aes_gcm_pre_128_vaes_avx512(const void *key, struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_pre_192_vaes_avx512(const void *key, struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_pre_256_vaes_avx512(const void *key, struct gcm_key_data *key_data);
+
+/*
+ * AVX512 GCM API
+ * - intentionally this is not exposed in intel-ipsec-mb.h
+ * - available through IMB_GCM_xxx() macros from intel-ipsec-mb.h
+ */
+IMB_DLL_EXPORT void
+aes_gcm_enc_128_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_192_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_256_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_128_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_192_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_256_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+
+IMB_DLL_EXPORT void
+aes_gcm_init_128_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len);
+IMB_DLL_EXPORT void
+aes_gcm_init_192_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len);
+IMB_DLL_EXPORT void
+aes_gcm_init_256_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_128_update_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_192_update_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_256_update_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_128_update_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_192_update_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_256_update_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_128_finalize_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_192_finalize_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_256_finalize_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_128_finalize_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_192_finalize_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_256_finalize_avx512(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_precomp_128_avx512(struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_precomp_192_avx512(struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_precomp_256_avx512(struct gcm_key_data *key_data);
+
+IMB_DLL_EXPORT void
+aes_gcm_pre_128_avx512(const void *key, struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_pre_192_avx512(const void *key, struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_pre_256_avx512(const void *key, struct gcm_key_data *key_data);
+
+/*
+ * AESNI emulation GCM API (based on SSE architecture)
+ * - intentionally this is not exposed in intel-ipsec-mb.h
+ * - available through IMB_GCM_xxx() macros from intel-ipsec-mb.h
+ */
+IMB_DLL_EXPORT void
+aes_gcm_enc_128_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len, uint8_t *auth_tag,
+ uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_192_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len, uint8_t *auth_tag,
+ uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_256_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv,
+ uint8_t const *aad, uint64_t aad_len,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_128_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len, uint8_t *auth_tag,
+ uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_192_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len, uint8_t *auth_tag,
+ uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_256_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, uint8_t const *in, uint64_t len,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len, uint8_t *auth_tag,
+ uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_init_128_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len);
+IMB_DLL_EXPORT void
+aes_gcm_init_192_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len);
+IMB_DLL_EXPORT void
+aes_gcm_init_256_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ const uint8_t *iv, uint8_t const *aad,
+ uint64_t aad_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_128_update_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_192_update_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_256_update_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_128_update_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_192_update_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_256_update_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *out, const uint8_t *in,
+ uint64_t len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_128_finalize_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_192_finalize_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_enc_256_finalize_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_128_finalize_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_192_finalize_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_dec_256_finalize_sse_no_aesni(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context_data,
+ uint8_t *auth_tag, uint64_t auth_tag_len);
+IMB_DLL_EXPORT void
+aes_gcm_precomp_128_sse_no_aesni(struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_precomp_192_sse_no_aesni(struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_precomp_256_sse_no_aesni(struct gcm_key_data *key_data);
+
+IMB_DLL_EXPORT void
+aes_gcm_pre_128_sse_no_aesni(const void *key, struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_pre_192_sse_no_aesni(const void *key, struct gcm_key_data *key_data);
+IMB_DLL_EXPORT void
+aes_gcm_pre_256_sse_no_aesni(const void *key, struct gcm_key_data *key_data);
+
+#endif /* _GCM_H_ */
+#endif /* NO_GCM */
diff --git a/src/spdk/intel-ipsec-mb/include/gcm_defines.asm b/src/spdk/intel-ipsec-mb/include/gcm_defines.asm
new file mode 100644
index 000000000..31a961729
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/gcm_defines.asm
@@ -0,0 +1,272 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef GCM_DEFINES_ASM_INCLUDED
+%define GCM_DEFINES_ASM_INCLUDED
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+
+section .data
+default rel
+
+align 16
+POLY: dq 0x0000000000000001, 0xC200000000000000
+
+align 64
+POLY2:
+ dq 0x00000001C2000000, 0xC200000000000000
+ dq 0x00000001C2000000, 0xC200000000000000
+ dq 0x00000001C2000000, 0xC200000000000000
+ dq 0x00000001C2000000, 0xC200000000000000
+
+align 16
+TWOONE: dq 0x0000000000000001, 0x0000000100000000
+
+;;; @note Order of these constants should not change.
+;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+align 64
+SHUF_MASK:
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+align 16
+SHIFT_MASK:
+ dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
+
+ALL_F:
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+
+ZERO:
+ dq 0x0000000000000000, 0x0000000000000000
+
+align 16
+ONE:
+ dq 0x0000000000000001, 0x0000000000000000
+
+align 16
+TWO:
+ dq 0x0000000000000002, 0x0000000000000000
+
+align 16
+ONEf:
+ dq 0x0000000000000000, 0x0100000000000000
+
+align 16
+TWOf:
+ dq 0x0000000000000000, 0x0200000000000000
+
+align 64
+ddq_add_1234:
+ dq 0x0000000000000001, 0x0000000000000000
+ dq 0x0000000000000002, 0x0000000000000000
+ dq 0x0000000000000003, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+
+align 64
+ddq_add_5678:
+ dq 0x0000000000000005, 0x0000000000000000
+ dq 0x0000000000000006, 0x0000000000000000
+ dq 0x0000000000000007, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+
+align 64
+ddq_add_4444:
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+
+align 64
+ddq_add_8888:
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+
+align 64
+ddq_addbe_1234:
+ dq 0x0000000000000000, 0x0100000000000000
+ dq 0x0000000000000000, 0x0200000000000000
+ dq 0x0000000000000000, 0x0300000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+
+align 64
+ddq_addbe_5678:
+ dq 0x0000000000000000, 0x0500000000000000
+ dq 0x0000000000000000, 0x0600000000000000
+ dq 0x0000000000000000, 0x0700000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+
+align 64
+ddq_addbe_4444:
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+
+align 64
+ddq_addbe_8888:
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+
+align 64
+byte_len_to_mask_table:
+ dw 0x0000, 0x0001, 0x0003, 0x0007,
+ dw 0x000f, 0x001f, 0x003f, 0x007f,
+ dw 0x00ff, 0x01ff, 0x03ff, 0x07ff,
+ dw 0x0fff, 0x1fff, 0x3fff, 0x7fff,
+ dw 0xffff
+
+align 64
+byte64_len_to_mask_table:
+ dq 0x0000000000000000, 0x0000000000000001
+ dq 0x0000000000000003, 0x0000000000000007
+ dq 0x000000000000000f, 0x000000000000001f
+ dq 0x000000000000003f, 0x000000000000007f
+ dq 0x00000000000000ff, 0x00000000000001ff
+ dq 0x00000000000003ff, 0x00000000000007ff
+ dq 0x0000000000000fff, 0x0000000000001fff
+ dq 0x0000000000003fff, 0x0000000000007fff
+ dq 0x000000000000ffff, 0x000000000001ffff
+ dq 0x000000000003ffff, 0x000000000007ffff
+ dq 0x00000000000fffff, 0x00000000001fffff
+ dq 0x00000000003fffff, 0x00000000007fffff
+ dq 0x0000000000ffffff, 0x0000000001ffffff
+ dq 0x0000000003ffffff, 0x0000000007ffffff
+ dq 0x000000000fffffff, 0x000000001fffffff
+ dq 0x000000003fffffff, 0x000000007fffffff
+ dq 0x00000000ffffffff, 0x00000001ffffffff
+ dq 0x00000003ffffffff, 0x00000007ffffffff
+ dq 0x0000000fffffffff, 0x0000001fffffffff
+ dq 0x0000003fffffffff, 0x0000007fffffffff
+ dq 0x000000ffffffffff, 0x000001ffffffffff
+ dq 0x000003ffffffffff, 0x000007ffffffffff
+ dq 0x00000fffffffffff, 0x00001fffffffffff
+ dq 0x00003fffffffffff, 0x00007fffffffffff
+ dq 0x0000ffffffffffff, 0x0001ffffffffffff
+ dq 0x0003ffffffffffff, 0x0007ffffffffffff
+ dq 0x000fffffffffffff, 0x001fffffffffffff
+ dq 0x003fffffffffffff, 0x007fffffffffffff
+ dq 0x00ffffffffffffff, 0x01ffffffffffffff
+ dq 0x03ffffffffffffff, 0x07ffffffffffffff
+ dq 0x0fffffffffffffff, 0x1fffffffffffffff
+ dq 0x3fffffffffffffff, 0x7fffffffffffffff
+ dq 0xffffffffffffffff
+
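Both lookup tables above map a byte count n to a mask with the lowest n bits set: the word table covers lengths 0 through 16, the qword table lengths 0 through 64. A minimal C sketch of the rule the entries follow (illustrative only, not part of the build):

    #include <stdint.h>

    /* byte_len_to_mask_table[n], valid for n = 0..16 */
    static uint16_t byte_len_to_mask(unsigned n)
    {
        return (n >= 16) ? 0xffffu : (uint16_t)((1u << n) - 1u);
    }

    /* byte64_len_to_mask_table[n], valid for n = 0..64 */
    static uint64_t byte64_len_to_mask(unsigned n)
    {
        return (n >= 64) ? ~0ull : ((1ull << n) - 1ull);
    }
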
+align 64
+mask_out_top_block:
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0x0000000000000000, 0x0000000000000000
+
+section .text
+
+;; Define the fields of gcm_context_data struct:
+;; struct gcm_context_data {
+;; // init, update and finalize context data
+;; uint8_t aad_hash[GCM_BLOCK_LEN];
+;; uint64_t aad_length;
+;; uint64_t in_length;
+;; uint8_t partial_block_enc_key[GCM_BLOCK_LEN];
+;; uint8_t orig_IV[GCM_BLOCK_LEN];
+;; uint8_t current_counter[GCM_BLOCK_LEN];
+;; uint64_t partial_block_length;
+;; };
+
+%define AadHash (16*0) ; store current Hash of data which has been input
+%define AadLen (16*1) ; store length of input data which will not be encrypted or decrypted
+%define InLen ((16*1)+8) ; store length of input data which will be encrypted or decrypted
+%define PBlockEncKey (16*2) ; encryption key for the partial block at the end of the previous update
+%define OrigIV (16*3) ; input IV
+%define CurCount (16*4) ; Current counter for generation of encryption key
+%define PBlockLen (16*5) ; length of partial block at the end of the previous update
+
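As a cross-check, these byte offsets correspond to offsetof() on the struct sketched in the comment above, assuming GCM_BLOCK_LEN is 16; a C illustration (field names taken from the comment, the assert is only a sketch):

    #include <stddef.h>
    #include <stdint.h>

    #define GCM_BLOCK_LEN 16  /* assumption: one AES block */

    struct gcm_context_data {
        uint8_t  aad_hash[GCM_BLOCK_LEN];              /* AadHash      = 16*0     */
        uint64_t aad_length;                           /* AadLen       = 16*1     */
        uint64_t in_length;                            /* InLen        = 16*1 + 8 */
        uint8_t  partial_block_enc_key[GCM_BLOCK_LEN]; /* PBlockEncKey = 16*2     */
        uint8_t  orig_IV[GCM_BLOCK_LEN];               /* OrigIV       = 16*3     */
        uint8_t  current_counter[GCM_BLOCK_LEN];       /* CurCount     = 16*4     */
        uint64_t partial_block_length;                 /* PBlockLen    = 16*5     */
    };

    _Static_assert(offsetof(struct gcm_context_data, current_counter) == 16 * 4,
                   "CurCount must match the asm offset");
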
+%define reg(q) xmm %+ q
+%define regy(q) ymm %+ q
+%define regz(q) zmm %+ q
+
+%ifdef WIN_ABI
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+ %xdefine arg4 r9
+ %xdefine arg5 qword [r14 + STACK_OFFSET + 8*5]
+ %xdefine arg6 qword [r14 + STACK_OFFSET + 8*6]
+ %xdefine arg7 qword [r14 + STACK_OFFSET + 8*7]
+ %xdefine arg8 qword [r14 + STACK_OFFSET + 8*8]
+ %xdefine arg9 qword [r14 + STACK_OFFSET + 8*9]
+ %xdefine arg10 qword [r14 + STACK_OFFSET + 8*10]
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+ %xdefine arg4 rcx
+ %xdefine arg5 r8
+ %xdefine arg6 r9
+ %xdefine arg7 qword [r14 + STACK_OFFSET + 8*1]
+ %xdefine arg8 qword [r14 + STACK_OFFSET + 8*2]
+ %xdefine arg9 qword [r14 + STACK_OFFSET + 8*3]
+ %xdefine arg10 qword [r14 + STACK_OFFSET + 8*4]
+%endif
+
+%ifdef NT_LDST
+ %define NT_LD
+ %define NT_ST
+%endif
+
+;;; Use non-temporal loads
+%ifdef NT_LD
+ %define XLDR movntdqa
+ %define VXLDR vmovntdqa
+ %define VX512LDR vmovntdqa
+%else
+ %define XLDR movdqu
+ %define VXLDR vmovdqu
+ %define VX512LDR vmovdqu8
+%endif
+
+;;; Use non-temporal stores
+%ifdef NT_ST
+ %define XSTR movntdq
+ %define VXSTR vmovntdq
+ %define VX512STR vmovntdq
+%else
+ %define XSTR movdqu
+ %define VXSTR vmovdqu
+ %define VX512STR vmovdqu8
+%endif
+
+%endif ; GCM_DEFINES_ASM_INCLUDED
diff --git a/src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm b/src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm
new file mode 100644
index 000000000..d812e53bd
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm
@@ -0,0 +1,52 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef GCM_KEYS_AVX2_AVX512_INCLUDED
+%define GCM_KEYS_AVX2_AVX512_INCLUDED
+
+;; Define the fields of gcm_key_data struct:
+;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly
+;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly
+;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly
+;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly
+;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly
+;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly
+;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly
+;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly
+
+%define HashKey_8 (16*15) ; HashKey^8 <<1 mod poly
+%define HashKey_7 (16*16) ; HashKey^7 <<1 mod poly
+%define HashKey_6 (16*17) ; HashKey^6 <<1 mod poly
+%define HashKey_5 (16*18) ; HashKey^5 <<1 mod poly
+%define HashKey_4 (16*19) ; HashKey^4 <<1 mod poly
+%define HashKey_3 (16*20) ; HashKey^3 <<1 mod poly
+%define HashKey_2 (16*21) ; HashKey^2 <<1 mod poly
+%define HashKey_1 (16*22) ; HashKey <<1 mod poly
+%define HashKey (16*22) ; HashKey <<1 mod poly
+
+%endif ; GCM_KEYS_AVX2_AVX512_INCLUDED
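The starting offset 16*15 for HashKey_8 reflects the first fifteen 16-byte slots of gcm_key_data being reserved for the expanded AES round keys listed in the struct comment, with the hash keys following in descending power order. A small C sketch of addressing HashKey^n with these offsets (hypothetical helper, illustrative only):

    #include <stdint.h>

    /* Return a pointer to HashKey^n (1 <= n <= 8) inside a gcm_key_data blob
     * laid out as above: HashKey_8 at 16*15, ..., HashKey_1 at 16*22. */
    static const uint8_t *gcm_hashkey_pow(const void *key_data, unsigned n)
    {
        return (const uint8_t *)key_data + 16 * (15 + (8 - n));
    }
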
diff --git a/src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm b/src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm
new file mode 100644
index 000000000..f7531e5a7
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm
@@ -0,0 +1,73 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef GCM_KEYS_SSE_AVX_INCLUDED
+%define GCM_KEYS_SSE_AVX_INCLUDED
+
+;; Define the fields of gcm_key_data struct:
+;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly
+;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly
+;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly
+;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly
+;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly
+;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly
+;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly
+;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly
+;; uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey <<1 mod poly (Karatsuba)
+;; uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^2 <<1 mod poly (Karatsuba)
+;; uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^3 <<1 mod poly (Karatsuba)
+;; uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^4 <<1 mod poly (Karatsuba)
+;; uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^5 <<1 mod poly (Karatsuba)
+;; uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^6 <<1 mod poly (Karatsuba)
+;; uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^7 <<1 mod poly (Karatsuba)
+;; uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^8 <<1 mod poly (Karatsuba)
+
+;;
+;; Key structure holds up to 8 ghash keys
+;;
+%define HashKey_8 (16*15) ; HashKey^8 <<1 mod poly
+%define HashKey_7 (16*16) ; HashKey^7 <<1 mod poly
+%define HashKey_6 (16*17) ; HashKey^6 <<1 mod poly
+%define HashKey_5 (16*18) ; HashKey^5 <<1 mod poly
+%define HashKey_4 (16*19) ; HashKey^4 <<1 mod poly
+%define HashKey_3 (16*20) ; HashKey^3 <<1 mod poly
+%define HashKey_2 (16*21) ; HashKey^2 <<1 mod poly
+%define HashKey_1 (16*22) ; HashKey <<1 mod poly
+%define HashKey (16*22) ; HashKey <<1 mod poly
+;; ghash keys for Karatsuba multiply
+%define HashKey_k (16*23) ; XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly
+%define HashKey_1_k (16*23) ; XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly
+%define HashKey_2_k (16*24) ; XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly
+%define HashKey_3_k (16*25) ; XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly
+%define HashKey_4_k (16*26) ; XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly
+%define HashKey_5_k (16*27) ; XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly
+%define HashKey_6_k (16*28) ; XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly
+%define HashKey_7_k (16*29) ; XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly
+%define HashKey_8_k (16*30) ; XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly
+
+%endif ; GCM_KEYS_SSE_AVX_INCLUDED
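The *_k entries hold the XOR of the high and low 64-bit halves of the corresponding hash key, which is the extra operand a Karatsuba GHASH multiply needs for its middle product. A hedged sketch of deriving such an entry (hypothetical helper, not the library's precompute code; storing the 64-bit XOR duplicated in both halves of the slot is an assumption made purely for illustration):

    #include <stdint.h>
    #include <string.h>

    /* Derive a HashKey_n_k slot from the 128-bit HashKey^n (<<1 mod poly). */
    static void gcm_hashkey_karatsuba(const uint8_t hkey[16], uint8_t hkey_k[16])
    {
        uint64_t lo, hi;

        memcpy(&lo, hkey, 8);
        memcpy(&hi, hkey + 8, 8);
        lo ^= hi;                       /* high 64 bits XOR low 64 bits */
        memcpy(hkey_k, &lo, 8);         /* illustrative: duplicate across the slot */
        memcpy(hkey_k + 8, &lo, 8);
    }
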
diff --git a/src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm
new file mode 100644
index 000000000..4aea2f5c9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm
@@ -0,0 +1,231 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef GCM_KEYS_VAES_AVX512_INCLUDED
+%define GCM_KEYS_VAES_AVX512_INCLUDED
+
+;; Define the fields of gcm_key_data struct:
+;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+;; uint8_t shifted_hkey_9_128[GCM_ENC_KEY_LEN * (128 - 8)];
+;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly
+;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly
+;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly
+;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly
+;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly
+;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly
+;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly
+;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly
+
+%ifdef GCM_BIG_DATA
+;;
+;; Key structure holds up to 128 ghash keys
+;;
+%define HashKey_128 (16*15) ; HashKey^128 <<1 mod poly
+%define HashKey_127 (16*16) ; HashKey^127 <<1 mod poly
+%define HashKey_126 (16*17) ; HashKey^126 <<1 mod poly
+%define HashKey_125 (16*18) ; HashKey^125 <<1 mod poly
+%define HashKey_124 (16*19) ; HashKey^124 <<1 mod poly
+%define HashKey_123 (16*20) ; HashKey^123 <<1 mod poly
+%define HashKey_122 (16*21) ; HashKey^122 <<1 mod poly
+%define HashKey_121 (16*22) ; HashKey^121 <<1 mod poly
+%define HashKey_120 (16*23) ; HashKey^120 <<1 mod poly
+%define HashKey_119 (16*24) ; HashKey^119 <<1 mod poly
+%define HashKey_118 (16*25) ; HashKey^118 <<1 mod poly
+%define HashKey_117 (16*26) ; HashKey^117 <<1 mod poly
+%define HashKey_116 (16*27) ; HashKey^116 <<1 mod poly
+%define HashKey_115 (16*28) ; HashKey^115 <<1 mod poly
+%define HashKey_114 (16*29) ; HashKey^114 <<1 mod poly
+%define HashKey_113 (16*30) ; HashKey^113 <<1 mod poly
+%define HashKey_112 (16*31) ; HashKey^112 <<1 mod poly
+%define HashKey_111 (16*32) ; HashKey^111 <<1 mod poly
+%define HashKey_110 (16*33) ; HashKey^110 <<1 mod poly
+%define HashKey_109 (16*34) ; HashKey^109 <<1 mod poly
+%define HashKey_108 (16*35) ; HashKey^108 <<1 mod poly
+%define HashKey_107 (16*36) ; HashKey^107 <<1 mod poly
+%define HashKey_106 (16*37) ; HashKey^106 <<1 mod poly
+%define HashKey_105 (16*38) ; HashKey^105 <<1 mod poly
+%define HashKey_104 (16*39) ; HashKey^104 <<1 mod poly
+%define HashKey_103 (16*40) ; HashKey^103 <<1 mod poly
+%define HashKey_102 (16*41) ; HashKey^102 <<1 mod poly
+%define HashKey_101 (16*42) ; HashKey^101 <<1 mod poly
+%define HashKey_100 (16*43) ; HashKey^100 <<1 mod poly
+%define HashKey_99 (16*44) ; HashKey^99 <<1 mod poly
+%define HashKey_98 (16*45) ; HashKey^98 <<1 mod poly
+%define HashKey_97 (16*46) ; HashKey^97 <<1 mod poly
+%define HashKey_96 (16*47) ; HashKey^96 <<1 mod poly
+%define HashKey_95 (16*48) ; HashKey^95 <<1 mod poly
+%define HashKey_94 (16*49) ; HashKey^94 <<1 mod poly
+%define HashKey_93 (16*50) ; HashKey^93 <<1 mod poly
+%define HashKey_92 (16*51) ; HashKey^92 <<1 mod poly
+%define HashKey_91 (16*52) ; HashKey^91 <<1 mod poly
+%define HashKey_90 (16*53) ; HashKey^90 <<1 mod poly
+%define HashKey_89 (16*54) ; HashKey^89 <<1 mod poly
+%define HashKey_88 (16*55) ; HashKey^88 <<1 mod poly
+%define HashKey_87 (16*56) ; HashKey^87 <<1 mod poly
+%define HashKey_86 (16*57) ; HashKey^86 <<1 mod poly
+%define HashKey_85 (16*58) ; HashKey^85 <<1 mod poly
+%define HashKey_84 (16*59) ; HashKey^84 <<1 mod poly
+%define HashKey_83 (16*60) ; HashKey^83 <<1 mod poly
+%define HashKey_82 (16*61) ; HashKey^82 <<1 mod poly
+%define HashKey_81 (16*62) ; HashKey^81 <<1 mod poly
+%define HashKey_80 (16*63) ; HashKey^80 <<1 mod poly
+%define HashKey_79 (16*64) ; HashKey^79 <<1 mod poly
+%define HashKey_78 (16*65) ; HashKey^78 <<1 mod poly
+%define HashKey_77 (16*66) ; HashKey^77 <<1 mod poly
+%define HashKey_76 (16*67) ; HashKey^76 <<1 mod poly
+%define HashKey_75 (16*68) ; HashKey^75 <<1 mod poly
+%define HashKey_74 (16*69) ; HashKey^74 <<1 mod poly
+%define HashKey_73 (16*70) ; HashKey^73 <<1 mod poly
+%define HashKey_72 (16*71) ; HashKey^72 <<1 mod poly
+%define HashKey_71 (16*72) ; HashKey^71 <<1 mod poly
+%define HashKey_70 (16*73) ; HashKey^70 <<1 mod poly
+%define HashKey_69 (16*74) ; HashKey^69 <<1 mod poly
+%define HashKey_68 (16*75) ; HashKey^68 <<1 mod poly
+%define HashKey_67 (16*76) ; HashKey^67 <<1 mod poly
+%define HashKey_66 (16*77) ; HashKey^66 <<1 mod poly
+%define HashKey_65 (16*78) ; HashKey^65 <<1 mod poly
+%define HashKey_64 (16*79) ; HashKey^64 <<1 mod poly
+%define HashKey_63 (16*80) ; HashKey^63 <<1 mod poly
+%define HashKey_62 (16*81) ; HashKey^62 <<1 mod poly
+%define HashKey_61 (16*82) ; HashKey^61 <<1 mod poly
+%define HashKey_60 (16*83) ; HashKey^60 <<1 mod poly
+%define HashKey_59 (16*84) ; HashKey^59 <<1 mod poly
+%define HashKey_58 (16*85) ; HashKey^58 <<1 mod poly
+%define HashKey_57 (16*86) ; HashKey^57 <<1 mod poly
+%define HashKey_56 (16*87) ; HashKey^56 <<1 mod poly
+%define HashKey_55 (16*88) ; HashKey^55 <<1 mod poly
+%define HashKey_54 (16*89) ; HashKey^54 <<1 mod poly
+%define HashKey_53 (16*90) ; HashKey^53 <<1 mod poly
+%define HashKey_52 (16*91) ; HashKey^52 <<1 mod poly
+%define HashKey_51 (16*92) ; HashKey^51 <<1 mod poly
+%define HashKey_50 (16*93) ; HashKey^50 <<1 mod poly
+%define HashKey_49 (16*94) ; HashKey^49 <<1 mod poly
+%define HashKey_48 (16*95) ; HashKey^48 <<1 mod poly
+%define HashKey_47 (16*96) ; HashKey^47 <<1 mod poly
+%define HashKey_46 (16*97) ; HashKey^46 <<1 mod poly
+%define HashKey_45 (16*98) ; HashKey^45 <<1 mod poly
+%define HashKey_44 (16*99) ; HashKey^44 <<1 mod poly
+%define HashKey_43 (16*100) ; HashKey^43 <<1 mod poly
+%define HashKey_42 (16*101) ; HashKey^42 <<1 mod poly
+%define HashKey_41 (16*102) ; HashKey^41 <<1 mod poly
+%define HashKey_40 (16*103) ; HashKey^40 <<1 mod poly
+%define HashKey_39 (16*104) ; HashKey^39 <<1 mod poly
+%define HashKey_38 (16*105) ; HashKey^38 <<1 mod poly
+%define HashKey_37 (16*106) ; HashKey^37 <<1 mod poly
+%define HashKey_36 (16*107) ; HashKey^36 <<1 mod poly
+%define HashKey_35 (16*108) ; HashKey^35 <<1 mod poly
+%define HashKey_34 (16*109) ; HashKey^34 <<1 mod poly
+%define HashKey_33 (16*110) ; HashKey^33 <<1 mod poly
+%define HashKey_32 (16*111) ; HashKey^32 <<1 mod poly
+%define HashKey_31 (16*112) ; HashKey^31 <<1 mod poly
+%define HashKey_30 (16*113) ; HashKey^30 <<1 mod poly
+%define HashKey_29 (16*114) ; HashKey^29 <<1 mod poly
+%define HashKey_28 (16*115) ; HashKey^28 <<1 mod poly
+%define HashKey_27 (16*116) ; HashKey^27 <<1 mod poly
+%define HashKey_26 (16*117) ; HashKey^26 <<1 mod poly
+%define HashKey_25 (16*118) ; HashKey^25 <<1 mod poly
+%define HashKey_24 (16*119) ; HashKey^24 <<1 mod poly
+%define HashKey_23 (16*120) ; HashKey^23 <<1 mod poly
+%define HashKey_22 (16*121) ; HashKey^22 <<1 mod poly
+%define HashKey_21 (16*122) ; HashKey^21 <<1 mod poly
+%define HashKey_20 (16*123) ; HashKey^20 <<1 mod poly
+%define HashKey_19 (16*124) ; HashKey^19 <<1 mod poly
+%define HashKey_18 (16*125) ; HashKey^18 <<1 mod poly
+%define HashKey_17 (16*126) ; HashKey^17 <<1 mod poly
+%define HashKey_16 (16*127) ; HashKey^16 <<1 mod poly
+%define HashKey_15 (16*128) ; HashKey^15 <<1 mod poly
+%define HashKey_14 (16*129) ; HashKey^14 <<1 mod poly
+%define HashKey_13 (16*130) ; HashKey^13 <<1 mod poly
+%define HashKey_12 (16*131) ; HashKey^12 <<1 mod poly
+%define HashKey_11 (16*132) ; HashKey^11 <<1 mod poly
+%define HashKey_10 (16*133) ; HashKey^10 <<1 mod poly
+%define HashKey_9 (16*134) ; HashKey^9 <<1 mod poly
+%define HashKey_8 (16*135) ; HashKey^8 <<1 mod poly
+%define HashKey_7 (16*136) ; HashKey^7 <<1 mod poly
+%define HashKey_6 (16*137) ; HashKey^6 <<1 mod poly
+%define HashKey_5 (16*138) ; HashKey^5 <<1 mod poly
+%define HashKey_4 (16*139) ; HashKey^4 <<1 mod poly
+%define HashKey_3 (16*140) ; HashKey^3 <<1 mod poly
+%define HashKey_2 (16*141) ; HashKey^2 <<1 mod poly
+%define HashKey_1 (16*142) ; HashKey <<1 mod poly
+%define HashKey (16*142) ; HashKey <<1 mod poly
+%else
+;;
+;; Key structure holds up to 48 ghash keys
+;;
+%define HashKey_48 (16*15) ; HashKey^48 <<1 mod poly
+%define HashKey_47 (16*16) ; HashKey^47 <<1 mod poly
+%define HashKey_46 (16*17) ; HashKey^46 <<1 mod poly
+%define HashKey_45 (16*18) ; HashKey^45 <<1 mod poly
+%define HashKey_44 (16*19) ; HashKey^44 <<1 mod poly
+%define HashKey_43 (16*20) ; HashKey^43 <<1 mod poly
+%define HashKey_42 (16*21) ; HashKey^42 <<1 mod poly
+%define HashKey_41 (16*22) ; HashKey^41 <<1 mod poly
+%define HashKey_40 (16*23) ; HashKey^40 <<1 mod poly
+%define HashKey_39 (16*24) ; HashKey^39 <<1 mod poly
+%define HashKey_38 (16*25) ; HashKey^38 <<1 mod poly
+%define HashKey_37 (16*26) ; HashKey^37 <<1 mod poly
+%define HashKey_36 (16*27) ; HashKey^36 <<1 mod poly
+%define HashKey_35 (16*28) ; HashKey^35 <<1 mod poly
+%define HashKey_34 (16*29) ; HashKey^34 <<1 mod poly
+%define HashKey_33 (16*30) ; HashKey^33 <<1 mod poly
+%define HashKey_32 (16*31) ; HashKey^32 <<1 mod poly
+%define HashKey_31 (16*32) ; HashKey^31 <<1 mod poly
+%define HashKey_30 (16*33) ; HashKey^30 <<1 mod poly
+%define HashKey_29 (16*34) ; HashKey^29 <<1 mod poly
+%define HashKey_28 (16*35) ; HashKey^28 <<1 mod poly
+%define HashKey_27 (16*36) ; HashKey^27 <<1 mod poly
+%define HashKey_26 (16*37) ; HashKey^26 <<1 mod poly
+%define HashKey_25 (16*38) ; HashKey^25 <<1 mod poly
+%define HashKey_24 (16*39) ; HashKey^24 <<1 mod poly
+%define HashKey_23 (16*40) ; HashKey^23 <<1 mod poly
+%define HashKey_22 (16*41) ; HashKey^22 <<1 mod poly
+%define HashKey_21 (16*42) ; HashKey^21 <<1 mod poly
+%define HashKey_20 (16*43) ; HashKey^20 <<1 mod poly
+%define HashKey_19 (16*44) ; HashKey^19 <<1 mod poly
+%define HashKey_18 (16*45) ; HashKey^18 <<1 mod poly
+%define HashKey_17 (16*46) ; HashKey^17 <<1 mod poly
+%define HashKey_16 (16*47) ; HashKey^16 <<1 mod poly
+%define HashKey_15 (16*48) ; HashKey^15 <<1 mod poly
+%define HashKey_14 (16*49) ; HashKey^14 <<1 mod poly
+%define HashKey_13 (16*50) ; HashKey^13 <<1 mod poly
+%define HashKey_12 (16*51) ; HashKey^12 <<1 mod poly
+%define HashKey_11 (16*52) ; HashKey^11 <<1 mod poly
+%define HashKey_10 (16*53) ; HashKey^10 <<1 mod poly
+%define HashKey_9 (16*54) ; HashKey^9 <<1 mod poly
+%define HashKey_8 (16*55) ; HashKey^8 <<1 mod poly
+%define HashKey_7 (16*56) ; HashKey^7 <<1 mod poly
+%define HashKey_6 (16*57) ; HashKey^6 <<1 mod poly
+%define HashKey_5 (16*58) ; HashKey^5 <<1 mod poly
+%define HashKey_4 (16*59) ; HashKey^4 <<1 mod poly
+%define HashKey_3 (16*60) ; HashKey^3 <<1 mod poly
+%define HashKey_2 (16*61) ; HashKey^2 <<1 mod poly
+%define HashKey_1 (16*62) ; HashKey <<1 mod poly
+%define HashKey (16*62) ; HashKey <<1 mod poly
+%endif ; !GCM_BIG_DATA
+
+%endif ; GCM_KEYS_VAES_AVX512_INCLUDED
diff --git a/src/spdk/intel-ipsec-mb/include/kasumi_internal.h b/src/spdk/intel-ipsec-mb/include/kasumi_internal.h
new file mode 100755
index 000000000..87b114d88
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/kasumi_internal.h
@@ -0,0 +1,1853 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+
+/*---------------------------------------------------------
+* Kasumi_internal.h
+*---------------------------------------------------------*/
+
+#ifndef _KASUMI_INTERNAL_H_
+#define _KASUMI_INTERNAL_H_
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "intel-ipsec-mb.h"
+#include "wireless_common.h"
+#include "include/clear_regs_mem.h"
+#include "include/constant_lookup.h"
+
+/*---------------------------------------------------------------------
+* Kasumi Inner S-Boxes
+*---------------------------------------------------------------------*/
+
+/* Table version based on a small table, no cache thrashing */
+static const uint16_t sso_kasumi_S7e[] = {
+ 0x6c00, 0x6601, 0x7802, 0x7603, 0x2404, 0x4e05, 0xb006, 0xce07,
+ 0x5c08, 0x1e09, 0x6a0a, 0xac0b, 0x1c0c, 0x3e0d, 0xea0e, 0x5c0f,
+ 0x4e10, 0xc011, 0x6a12, 0xc213, 0x0214, 0xac15, 0xae16, 0x3617,
+ 0x6e18, 0xa019, 0x681a, 0x001b, 0x0a1c, 0xe41d, 0xc41e, 0x9c1f,
+ 0x2a20, 0x5021, 0xb622, 0xd823, 0x2024, 0x3225, 0x3826, 0x2e27,
+ 0x9a28, 0xac29, 0x042a, 0xa62b, 0x882c, 0xd62d, 0xd22e, 0x082f,
+ 0x4830, 0x9631, 0xf432, 0x1c33, 0x4634, 0xb035, 0x7636, 0xa637,
+ 0xea38, 0x7039, 0x543a, 0x783b, 0xdc3c, 0x6e3d, 0xae3e, 0xba3f,
+ 0x6a40, 0x6a41, 0x1c42, 0x9043, 0x3a44, 0x5e45, 0x8c46, 0x7447,
+ 0x7c48, 0x5449, 0x384a, 0x1c4b, 0xa44c, 0xe84d, 0x604e, 0x304f,
+ 0x4050, 0xc451, 0x8652, 0xac53, 0x1654, 0xb655, 0x1856, 0x0657,
+ 0x0658, 0xa259, 0xf25a, 0x785b, 0xf85c, 0x785d, 0x845e, 0x3a5f,
+ 0x0c60, 0xfc61, 0xf062, 0x9c63, 0x5e64, 0xc265, 0x6666, 0x7667,
+ 0x9a68, 0x4669, 0x746a, 0xb46b, 0x506c, 0xe06d, 0x3a6e, 0x866f,
+ 0x6070, 0x3471, 0x3c72, 0xd673, 0x3474, 0x4c75, 0xa476, 0x7277,
+ 0xa478, 0xd479, 0xea7a, 0xa47b, 0x487c, 0x147d, 0x8a7e, 0xf87f,
+ 0x6c00, 0x6601, 0x7802, 0x7603, 0x2404, 0x4e05, 0xb006, 0xce07,
+ 0x5c08, 0x1e09, 0x6a0a, 0xac0b, 0x1c0c, 0x3e0d, 0xea0e, 0x5c0f,
+ 0x4e10, 0xc011, 0x6a12, 0xc213, 0x0214, 0xac15, 0xae16, 0x3617,
+ 0x6e18, 0xa019, 0x681a, 0x001b, 0x0a1c, 0xe41d, 0xc41e, 0x9c1f,
+ 0x2a20, 0x5021, 0xb622, 0xd823, 0x2024, 0x3225, 0x3826, 0x2e27,
+ 0x9a28, 0xac29, 0x042a, 0xa62b, 0x882c, 0xd62d, 0xd22e, 0x082f,
+ 0x4830, 0x9631, 0xf432, 0x1c33, 0x4634, 0xb035, 0x7636, 0xa637,
+ 0xea38, 0x7039, 0x543a, 0x783b, 0xdc3c, 0x6e3d, 0xae3e, 0xba3f,
+ 0x6a40, 0x6a41, 0x1c42, 0x9043, 0x3a44, 0x5e45, 0x8c46, 0x7447,
+ 0x7c48, 0x5449, 0x384a, 0x1c4b, 0xa44c, 0xe84d, 0x604e, 0x304f,
+ 0x4050, 0xc451, 0x8652, 0xac53, 0x1654, 0xb655, 0x1856, 0x0657,
+ 0x0658, 0xa259, 0xf25a, 0x785b, 0xf85c, 0x785d, 0x845e, 0x3a5f,
+ 0x0c60, 0xfc61, 0xf062, 0x9c63, 0x5e64, 0xc265, 0x6666, 0x7667,
+ 0x9a68, 0x4669, 0x746a, 0xb46b, 0x506c, 0xe06d, 0x3a6e, 0x866f,
+ 0x6070, 0x3471, 0x3c72, 0xd673, 0x3474, 0x4c75, 0xa476, 0x7277,
+ 0xa478, 0xd479, 0xea7a, 0xa47b, 0x487c, 0x147d, 0x8a7e, 0xf87f
+};
+
+static const uint16_t sso_kasumi_S9e[] = {
+ 0x4ea7, 0xdeef, 0x42a1, 0xf77b, 0x0f87, 0x9d4e, 0x1209, 0xa552,
+ 0x4c26, 0xc4e2, 0x6030, 0xcd66, 0x89c4, 0x0381, 0xb45a, 0x1b8d,
+ 0x6eb7, 0xfafd, 0x2693, 0x974b, 0x3f9f, 0xa954, 0x6633, 0xd56a,
+ 0x6532, 0xe9f4, 0x0d06, 0xa452, 0xb0d8, 0x3e9f, 0xc964, 0x62b1,
+ 0x5eaf, 0xe2f1, 0xd3e9, 0x4a25, 0x9cce, 0x2211, 0x0000, 0x9b4d,
+ 0x582c, 0xfcfe, 0xf57a, 0x743a, 0x1e8f, 0xb8dc, 0xa251, 0x2190,
+ 0xbe5f, 0x0603, 0x773b, 0xeaf5, 0x6c36, 0xd6eb, 0xb4da, 0x2b95,
+ 0xb1d8, 0x1108, 0x58ac, 0xddee, 0xe773, 0x4522, 0x1f8f, 0x984c,
+ 0x4aa5, 0x8ac5, 0x178b, 0xf279, 0x0301, 0xc1e0, 0x4fa7, 0xa8d4,
+ 0xe0f0, 0x381c, 0x9dce, 0x60b0, 0x2d96, 0xf7fb, 0x4120, 0xbedf,
+ 0xebf5, 0x2f97, 0xf2f9, 0x1309, 0xb259, 0x74ba, 0xbadd, 0x59ac,
+ 0x48a4, 0x944a, 0x71b8, 0x88c4, 0x95ca, 0x4ba5, 0xbd5e, 0x46a3,
+ 0xd0e8, 0x3c9e, 0x0c86, 0xc562, 0x1a0d, 0xf4fa, 0xd7eb, 0x1c8e,
+ 0x7ebf, 0x8a45, 0x82c1, 0x53a9, 0x3098, 0xc6e3, 0xdd6e, 0x0e87,
+ 0xb158, 0x592c, 0x2914, 0xe4f2, 0x6bb5, 0x8140, 0xe271, 0x2d16,
+ 0x160b, 0xe6f3, 0xae57, 0x7b3d, 0x4824, 0xba5d, 0xe1f0, 0x361b,
+ 0xcfe7, 0x7dbe, 0xc5e2, 0x5229, 0x8844, 0x389c, 0x93c9, 0x0683,
+ 0x8d46, 0x2793, 0xa753, 0x2814, 0x4e27, 0xe673, 0x75ba, 0xf87c,
+ 0xb7db, 0x0180, 0xf9fc, 0x6a35, 0xe070, 0x54aa, 0xbfdf, 0x2e97,
+ 0xfc7e, 0x52a9, 0x9249, 0x190c, 0x2f17, 0x8341, 0x50a8, 0xd96c,
+ 0xd76b, 0x4924, 0x5c2e, 0xe7f3, 0x1389, 0x8f47, 0x8944, 0x3018,
+ 0x91c8, 0x170b, 0x3a9d, 0x99cc, 0xd1e8, 0x55aa, 0x6b35, 0xcae5,
+ 0x6fb7, 0xf5fa, 0xa0d0, 0x1f0f, 0xbb5d, 0x2391, 0x65b2, 0xd8ec,
+ 0x2010, 0xa2d1, 0xcf67, 0x6834, 0x7038, 0xf078, 0x8ec7, 0x2b15,
+ 0xa3d1, 0x41a0, 0xf8fc, 0x3f1f, 0xecf6, 0x0c06, 0xa653, 0x6331,
+ 0x49a4, 0xb359, 0x3299, 0xedf6, 0x8241, 0x7a3d, 0xe8f4, 0x351a,
+ 0x5aad, 0xbcde, 0x45a2, 0x8643, 0x0582, 0xe170, 0x0b05, 0xca65,
+ 0xb9dc, 0x4723, 0x86c3, 0x5dae, 0x6231, 0x9e4f, 0x4ca6, 0x954a,
+ 0x3118, 0xff7f, 0xeb75, 0x0080, 0xfd7e, 0x3198, 0x369b, 0xdfef,
+ 0xdf6f, 0x0984, 0x2512, 0xd66b, 0x97cb, 0x43a1, 0x7c3e, 0x8dc6,
+ 0x0884, 0xc2e1, 0x96cb, 0x793c, 0xd4ea, 0x1c0e, 0x5b2d, 0xb65b,
+ 0xeff7, 0x3d1e, 0x51a8, 0xa6d3, 0xb75b, 0x6733, 0x188c, 0xed76,
+ 0x4623, 0xce67, 0xfa7d, 0x57ab, 0x2613, 0xacd6, 0x8bc5, 0x2492,
+ 0xe5f2, 0x753a, 0x79bc, 0xcce6, 0x0100, 0x9349, 0x8cc6, 0x3b1d,
+ 0x6432, 0xe874, 0x9c4e, 0x359a, 0x140a, 0x9acd, 0xfdfe, 0x56ab,
+ 0xcee7, 0x5a2d, 0x168b, 0xa7d3, 0x3a1d, 0xac56, 0xf3f9, 0x4020,
+ 0x9048, 0x341a, 0xad56, 0x2c96, 0x7339, 0xd5ea, 0x5faf, 0xdcee,
+ 0x379b, 0x8b45, 0x2a95, 0xb3d9, 0x5028, 0xee77, 0x5cae, 0xc763,
+ 0x72b9, 0xd2e9, 0x0b85, 0x8e47, 0x81c0, 0x2311, 0xe974, 0x6e37,
+ 0xdc6e, 0x64b2, 0x8542, 0x180c, 0xabd5, 0x1188, 0xe371, 0x7cbe,
+ 0x0201, 0xda6d, 0xef77, 0x1289, 0x6ab5, 0xb058, 0x964b, 0x6934,
+ 0x0904, 0xc9e4, 0xc462, 0x2110, 0xe572, 0x2713, 0x399c, 0xde6f,
+ 0xa150, 0x7d3e, 0x0804, 0xf1f8, 0xd9ec, 0x0703, 0x6130, 0x9a4d,
+ 0xa351, 0x67b3, 0x2a15, 0xcb65, 0x5f2f, 0x994c, 0xc7e3, 0x2412,
+ 0x5e2f, 0xaa55, 0x3219, 0xe3f1, 0xb5da, 0x4321, 0xc864, 0x1b0d,
+ 0x5128, 0xbdde, 0x1d0e, 0xd46a, 0x3e1f, 0xd068, 0x63b1, 0xa854,
+ 0x3d9e, 0xcde6, 0x158a, 0xc060, 0xc663, 0x349a, 0xffff, 0x2894,
+ 0x3b9d, 0xd369, 0x3399, 0xfeff, 0x44a2, 0xaed7, 0x5d2e, 0x92c9,
+ 0x150a, 0xbf5f, 0xaf57, 0x2090, 0x73b9, 0xdb6d, 0xd86c, 0x552a,
+ 0xf6fb, 0x4422, 0x6cb6, 0xfbfd, 0x148a, 0xa4d2, 0x9f4f, 0x0a85,
+ 0x6f37, 0xc160, 0x9148, 0x1a8d, 0x198c, 0xb55a, 0xf67b, 0x7f3f,
+ 0x85c2, 0x3319, 0x5bad, 0xc8e4, 0x77bb, 0xc3e1, 0xb85c, 0x2994,
+ 0xcbe5, 0x4da6, 0xf0f8, 0x5329, 0x2e17, 0xaad5, 0x0482, 0xa5d2,
+ 0x2c16, 0xb2d9, 0x371b, 0x8c46, 0x4d26, 0xd168, 0x47a3, 0xfe7f,
+ 0x7138, 0xf379, 0x0e07, 0xa9d4, 0x84c2, 0x0402, 0xea75, 0x4f27,
+ 0x9fcf, 0x0502, 0xc0e0, 0x7fbf, 0xeef7, 0x76bb, 0xa050, 0x1d8e,
+ 0x391c, 0xc361, 0xd269, 0x0d86, 0x572b, 0xafd7, 0xadd6, 0x70b8,
+ 0x7239, 0x90c8, 0xb95c, 0x7e3f, 0x98cc, 0x78bc, 0x4221, 0x87c3,
+ 0xc261, 0x3c1e, 0x6d36, 0xb6db, 0xbc5e, 0x40a0, 0x0281, 0xdbed,
+ 0x8040, 0x66b3, 0x0f07, 0xcc66, 0x7abd, 0x9ecf, 0xe472, 0x2592,
+ 0x6db6, 0xbbdd, 0x0783, 0xf47a, 0x80c0, 0x542a, 0xfb7d, 0x0a05,
+ 0x2291, 0xec76, 0x68b4, 0x83c1, 0x4b25, 0x8743, 0x1088, 0xf97c,
+ 0x562b, 0x8442, 0x783c, 0x8fc7, 0xab55, 0x7bbd, 0x94ca, 0x61b0,
+ 0x1008, 0xdaed, 0x1e0f, 0xf178, 0x69b4, 0xa1d0, 0x763b, 0x9bcd
+};
+
+/* Range of input data for KASUMI is from 1 to 20000 bits */
+#define KASUMI_MIN_LEN 1
+#define KASUMI_MAX_LEN 20000
+
+/* KASUMI cipher definitions */
+#define NUM_KASUMI_ROUNDS (8) /* 8 rounds in the kasumi spec */
+#define QWORDSIZEINBITS (64)
+#define QWORDSIZEINBYTES (8)
+#define LAST_PADDING_BIT (1)
+
+#define BYTESIZE (8)
+#define BITSIZE(x) ((int)(sizeof(x)*BYTESIZE))
+
+/*--------- 16 bit rotate left ------------------------------------------*/
+#define ROL16(a,b) (uint16_t)((a<<b)|(a>>(16-b)))
+
+/*----- a 64-bit structure to help with kasumi endian issues -----*/
+typedef union _ku64 {
+ uint64_t b64[1];
+ uint32_t b32[2];
+ uint16_t b16[4];
+ uint8_t b8[8];
+} kasumi_union_t;
+
+typedef union SafeBuffer {
+ uint64_t b64;
+ uint32_t b32[2];
+ uint8_t b8[KASUMI_BLOCK_SIZE];
+} SafeBuf;
+
+#define FIp1(data, key1, key2, key3) \
+ do { \
+ uint16_t datal, datah; \
+ \
+ (data) ^= (key1); \
+ datal = LOOKUP16_SSE(sso_kasumi_S7e, (uint8_t)(data), 256); \
+ datah = LOOKUP16_SSE(sso_kasumi_S9e, (data) >> 7, 512); \
+ (data) = datal ^ datah; \
+ (data) ^= (key2); \
+ datal = LOOKUP16_SSE(sso_kasumi_S7e, (data) >> 9, 256); \
+ datah = LOOKUP16_SSE(sso_kasumi_S9e, (data) & 0x1FF, 512); \
+ (data) = datal ^ datah; \
+ (data) ^= (key3); \
+ } while (0)
+
+#define FIp2(data1, data2, key1, key2, key3, key4) \
+ do { \
+ FIp1(data1, key1, key2, key3); \
+ FIp1(data2, key1, key2, key4); \
+ } while (0)
+
+#define FLpi(key1, key2, res_h, res_l) \
+ do { \
+ uint16_t l, r; \
+ r = (res_l) & (key1); \
+ r = (res_h) ^ ROL16(r, 1); \
+ l = r | (key2); \
+ (res_h) = (res_l) ^ ROL16(l, 1); \
+ (res_l) = r; \
+ } while (0)
+
+#define FLp1(index, h, l) \
+ do { \
+ uint16_t ka = *(index + 0); \
+ uint16_t kb = *(index + 1); \
+ FLpi(ka, kb, h, l); \
+ } while (0)
+
+#define FLp2(index, h1, l1, h2, l2) \
+ do { \
+ uint16_t ka = *(index + 0); \
+ uint16_t kb = *(index + 1); \
+ FLpi(ka, kb, h1, l1); \
+ FLpi(ka, kb, h2, l2); \
+ } while (0)
+
+#define FLp3(index, h1, l1, h2, l2, h3, l3) \
+ do { \
+ uint16_t ka = *(index + 0); \
+ uint16_t kb = *(index + 1); \
+ FLpi(ka, kb, h1, l1); \
+ FLpi(ka, kb, h2, l2); \
+ FLpi(ka, kb, h3, l3); \
+ } while (0)
+
+#define FLp4(index, h1, l1, h2, l2, h3, l3, h4, l4) \
+ do { \
+ FLp2(index, h1, l1, h2, l2); \
+ FLp2(index, h3, l3, h4, l4); \
+ } while (0)
+
+#define FOp1(index, h, l) \
+ do { \
+ FIp1(h, *(index + 2), *(index + 3), l); \
+ FIp1(l, *(index + 4), *(index + 5), h); \
+ FIp1(h, *(index + 6), *(index + 7), l); \
+ } while (0)
+
+#define FOp2(index, h1, l1, h2, l2) \
+ do { \
+ uint16_t ka = *(index + 2); \
+ uint16_t kb = *(index + 3); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ ka = *(index + 4); \
+ kb = *(index + 5); \
+ FIp2(l1, l2, ka, kb, h1, h2); \
+ ka = *(index + 6); \
+ kb = *(index + 7); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ } while (0)
+
+#define FOp3(index, h1, l1, h2, l2, h3, l3) \
+ do { \
+ uint16_t ka = *(index + 2); \
+ uint16_t kb = *(index + 3); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ FIp1(h3, ka, kb, l3); \
+ ka = *(index + 4); \
+ kb = *(index + 5); \
+ FIp2(l1, l2, ka, kb, h1, h2); \
+ FIp1(l3, ka, kb, h3); \
+ ka = *(index + 6); \
+ kb = *(index + 7); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ FIp1(h3, ka, kb, l3); \
+ } while (0)
+
+#define FOp4(index, h1, l1, h2, l2, h3, l3, h4, l4) \
+ do { \
+ uint16_t ka = *(index + 2); \
+ uint16_t kb = *(index + 3); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ FIp2(h3, h4, ka, kb, l3, l4); \
+ ka = *(index + 4); \
+ kb = *(index + 5); \
+ FIp2(l1, l2, ka, kb, h1, h2); \
+ FIp2(l3, l4, ka, kb, h3, h4); \
+ ka = *(index + 6); \
+ kb = *(index + 7); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ FIp2(h3, h4, ka, kb, l3, l4); \
+ } while (0)
+
+/**
+ *******************************************************************************
+ * @description
+ * This function performs the Kasumi operation on the given block using the key
+ * that is already scheduled in the context
+ *
+ * @param[in] pContext Context where the scheduled keys are stored
+ * @param[in/out] pData Block to be enc/dec
+ *
+ ******************************************************************************/
+static void kasumi_1_block(const uint16_t *context, uint16_t *data)
+{
+ const uint16_t *end = context + KASUMI_KEY_SCHEDULE_SIZE;
+ uint16_t temp_l, temp_h;
+
+ /* 4 iterations odd/even */
+ do {
+ temp_l = data[3];
+ temp_h = data[2];
+ FLp1(context, temp_h, temp_l);
+ FOp1(context, temp_h, temp_l);
+ context += 8;
+ data[1] ^= temp_l;
+ data[0] ^= temp_h;
+
+ temp_h = data[1];
+ temp_l = data[0];
+ FOp1(context, temp_h, temp_l);
+ FLp1(context, temp_h, temp_l);
+ context += 8;
+ data[3] ^= temp_h;
+ data[2] ^= temp_l;
+ } while (context < end);
+}
+
+/**
+ ******************************************************************************
+ * @description
+ * This function performs the Kasumi operation on the given blocks using the key
+ * that is already scheduled in the context
+ *
+ * @param[in] pContext Context where the scheduled keys are stored
+ * @param[in/out] pData1 First block to be enc/dec
+ * @param[in/out] pData2 Second block to be enc/dec
+ *
+ ******************************************************************************/
+static void
+kasumi_2_blocks(const uint16_t *context, uint16_t *data1, uint16_t *data2)
+{
+ const uint16_t *end = context + KASUMI_KEY_SCHEDULE_SIZE;
+ uint16_t temp1_l, temp1_h;
+ uint16_t temp2_l, temp2_h;
+
+ /* 4 iterations odd/even , with fine grain interleave */
+ do {
+ /* even */
+ temp1_l = data1[3];
+ temp1_h = data1[2];
+ temp2_l = data2[3];
+ temp2_h = data2[2];
+ FLp2(context, temp1_h, temp1_l, temp2_h, temp2_l);
+ FOp2(context, temp1_h, temp1_l, temp2_h, temp2_l);
+ context += 8;
+ data1[1] ^= temp1_l;
+ data1[0] ^= temp1_h;
+ data2[1] ^= temp2_l;
+ data2[0] ^= temp2_h;
+
+ /* odd */
+ temp1_h = data1[1];
+ temp1_l = data1[0];
+ temp2_h = data2[1];
+ temp2_l = data2[0];
+ FOp2(context, temp1_h, temp1_l, temp2_h, temp2_l);
+ FLp2(context, temp1_h, temp1_l, temp2_h, temp2_l);
+ context += 8;
+ data1[3] ^= temp1_h;
+ data1[2] ^= temp1_l;
+ data2[3] ^= temp2_h;
+ data2[2] ^= temp2_l;
+ } while (context < end);
+}
+
+
+/**
+ *******************************************************************************
+ * @description
+ * This function performs the Kasumi operation on the given blocks using the key
+ * that is already scheduled in the context
+ *
+ * @param[in] pContext Context where the scheduled keys are stored
+ * @param[in/out] pData1 First block to be enc/dec
+ * @param[in/out] pData2 Second block to be enc/dec
+ * @param[in/out] pData3 Third block to be enc/dec
+ *
+ ******************************************************************************/
+static void
+kasumi_3_blocks(const uint16_t *context, uint16_t *data1,
+ uint16_t *data2, uint16_t *data3)
+{
+ /* Case when the compiler is able to interleave efficiently */
+ const uint16_t *end = context + KASUMI_KEY_SCHEDULE_SIZE;
+ uint16_t temp1_l, temp1_h;
+ uint16_t temp2_l, temp2_h;
+ uint16_t temp3_l, temp3_h;
+
+ /* 4 iterations odd/even , with fine grain interleave */
+ do {
+ temp1_l = data1[3];
+ temp1_h = data1[2];
+ temp2_l = data2[3];
+ temp2_h = data2[2];
+ temp3_l = data3[3];
+ temp3_h = data3[2];
+ FLp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h,
+ temp3_l);
+ FOp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h,
+ temp3_l);
+ context += 8;
+ data1[1] ^= temp1_l;
+ data1[0] ^= temp1_h;
+ data2[1] ^= temp2_l;
+ data2[0] ^= temp2_h;
+ data3[1] ^= temp3_l;
+ data3[0] ^= temp3_h;
+
+ temp1_h = data1[1];
+ temp1_l = data1[0];
+ temp2_h = data2[1];
+ temp2_l = data2[0];
+ temp3_h = data3[1];
+ temp3_l = data3[0];
+ FOp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h,
+ temp3_l);
+ FLp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h,
+ temp3_l);
+ context += 8;
+ data1[3] ^= temp1_h;
+ data1[2] ^= temp1_l;
+ data2[3] ^= temp2_h;
+ data2[2] ^= temp2_l;
+ data3[3] ^= temp3_h;
+ data3[2] ^= temp3_l;
+ } while (context < end);
+}
+
+/**
+ *******************************************************************************
+ * @description
+ * This function performs the Kasumi operation on the given blocks using the key
+ * that is already scheduled in the context
+ *
+ * @param[in] pContext Context where the scheduled keys are stored
+ * @param[in] ppData Pointer to an array of addresses of blocks
+ *
+ ******************************************************************************/
+static void
+kasumi_4_blocks(const uint16_t *context, uint16_t **ppData)
+{
+ /* Case when the compiler is unable to interleave efficiently */
+ kasumi_2_blocks (context, ppData[0], ppData[1]);
+ kasumi_2_blocks (context, ppData[2], ppData[3]);
+}
+
+/**
+ ******************************************************************************
+ * @description
+ * This function performs the Kasumi operation on the given blocks using the key
+ * that is already scheduled in the context
+ *
+ * @param[in] pContext Context where the scheduled keys are stored
+ * @param[in] ppData Pointer to an array of addresses of blocks
+ *
+ ******************************************************************************/
+static void
+kasumi_8_blocks(const uint16_t *context, uint16_t **ppData)
+{
+ kasumi_4_blocks (context, &ppData[0]);
+ kasumi_4_blocks (context, &ppData[4]);
+}
+
+/******************************************************************************
+* @description
+* Multiple wrappers for the Kasumi rounds on up to 16 blocks of 64 bits at a
+* time.
+*
+* Depending on the variable packet lengths, different wrappers get called.
+* It has been measured that processing 1 packet is faster than 2, that 2
+* packets are faster than 3, that 3 packets are faster than 4, and so on.
+* It has also been measured that 6 = 4+2 packets are faster than 8, and that
+* 7 packets are processed faster as 4+3 than as 8.
+*
+* If these assumptions do not hold, it is easy to implement
+* the right function and reference it in kasumiWrapperArray.
+*
+*******************************************************************************/
+static void
+kasumi_f8_1_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_1_block(context, data[0]);
+}
+
+static void
+kasumi_f8_2_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_2_blocks(context, data[0], data[1]);
+}
+
+static void
+kasumi_f8_3_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_3_blocks(context, data[0], data[1], data[2]);
+}
+
+static void
+kasumi_f8_5_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_4_blocks(context, &data[0]);
+ kasumi_1_block(context, data[4]);
+}
+
+static void
+kasumi_f8_6_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ /* It is also assumed 6 = 4+2 packets is faster than 8 */
+ kasumi_4_blocks(context, &data[0]);
+ kasumi_2_blocks(context, data[4], data[5]);
+}
+
+static void
+kasumi_f8_7_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_4_blocks(context, &data[0]);
+ kasumi_3_blocks(context, data[4], data[5], data[6]);
+}
+
+static void
+kasumi_f8_9_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_1_block(context, data[8]);
+}
+
+static void
+kasumi_f8_10_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_2_blocks(context, data[8], data[9]);
+}
+
+static void
+kasumi_f8_11_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_3_blocks(context, data[8], data[9], data[10]);
+}
+
+static void
+kasumi_f8_12_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_4_blocks(context, &data[8]);
+}
+
+static void
+kasumi_f8_13_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_4_blocks(context, &data[8]);
+ kasumi_1_block(context, data[12]);
+}
+
+static void
+kasumi_f8_14_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_4_blocks(context, &data[8]);
+ kasumi_2_blocks(context, data[12], data[13]);
+}
+
+static void
+kasumi_f8_15_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_4_blocks(context, &data[8]);
+ kasumi_3_blocks(context, data[12], data[13], data[14]);
+}
+
+static void
+kasumi_f8_16_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_8_blocks(context, &data[8]);
+}
+
+typedef void (*kasumi_wrapper_t)(const uint16_t *, uint16_t **);
+
+static kasumi_wrapper_t kasumiWrapperArray[] = {
+ NULL,
+ kasumi_f8_1_buffer_wrapper,
+ kasumi_f8_2_buffer_wrapper,
+ kasumi_f8_3_buffer_wrapper,
+ kasumi_4_blocks,
+ kasumi_f8_5_buffer_wrapper,
+ kasumi_f8_6_buffer_wrapper,
+ kasumi_f8_7_buffer_wrapper,
+ kasumi_8_blocks,
+ kasumi_f8_9_buffer_wrapper,
+ kasumi_f8_10_buffer_wrapper,
+ kasumi_f8_11_buffer_wrapper,
+ kasumi_f8_12_buffer_wrapper,
+ kasumi_f8_13_buffer_wrapper,
+ kasumi_f8_14_buffer_wrapper,
+ kasumi_f8_15_buffer_wrapper,
+ kasumi_f8_16_buffer_wrapper};
+
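A minimal sketch of how this dispatch table would be used (illustrative only, not library API; bufCount is assumed to be the number of 64-bit blocks, between 1 and 16):

    /* Run the KASUMI rounds on bufCount independent 64-bit blocks in one call. */
    static void kasumi_blocks_dispatch(const uint16_t *context,
                                       uint16_t **ppData, uint32_t bufCount)
    {
        if (bufCount == 0 || bufCount > 16)
            return; /* no entry in kasumiWrapperArray for these counts */
        kasumiWrapperArray[bufCount](context, ppData);
    }
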
+/*---------------------------------------------------------------------
+* kasumi_key_schedule_sk()
+* Build the key schedule. Most "key" operations use 16-bit
+*
+* Context is a flat array of 64 uint16. The context is built in the same order
+* it will be used.
+*---------------------------------------------------------------------*/
+static inline void
+kasumi_key_schedule_sk(uint16_t *context, const void *pKey)
+{
+
+ /* Kasumi constants*/
+ static const uint16_t C[] = {0x0123, 0x4567, 0x89AB, 0xCDEF,
+ 0xFEDC, 0xBA98, 0x7654, 0x3210};
+
+ uint16_t k[8], kprime[8], n;
+ const uint8_t *pk = (const uint8_t *) pKey;
+
+ /* Build K[] and K'[] keys */
+ for (n = 0; n < 8; n++, pk += 2) {
+ k[n] = (pk[0] << 8) + pk[1];
+ kprime[n] = k[n] ^ C[n];
+ }
+
+ /*
+ * Finally construct the various sub keys (KLi1, KLi2, KOi1, ...) in the
+ * right order for easy usage at run-time
+ */
+ for (n = 0; n < 8; n++) {
+ context[0] = ROL16(k[n], 1);
+ context[1] = kprime[(n + 2) & 0x7];
+ context[2] = ROL16(k[(n + 1) & 0x7], 5);
+ context[3] = kprime[(n + 4) & 0x7];
+ context[4] = ROL16(k[(n + 5) & 0x7], 8);
+ context[5] = kprime[(n + 3) & 0x7];
+ context[6] = ROL16(k[(n + 6) & 0x7], 13);
+ context[7] = kprime[(n + 7) & 0x7];
+ context += 8;
+ }
+#ifdef SAFE_DATA
+ clear_mem(k, sizeof(k));
+ clear_mem(kprime, sizeof(kprime));
+#endif
+}
+
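For reference, the per-round ordering built above can also be written out with the 3GPP TS 35.202 subkey names; the naming is an assumption on my part, while the expressions are copied from the loop in kasumi_key_schedule_sk() (k[], kprime[] and ROL16 as defined above, indices mod 8, n being the 0-based round):

    static void kasumi_round_subkeys_sketch(uint16_t out[8], const uint16_t k[8],
                                            const uint16_t kprime[8], unsigned n)
    {
        out[0] = ROL16(k[n], 1);            /* KL(i,1) */
        out[1] = kprime[(n + 2) & 7];       /* KL(i,2) */
        out[2] = ROL16(k[(n + 1) & 7], 5);  /* KO(i,1) */
        out[3] = kprime[(n + 4) & 7];       /* KI(i,1) */
        out[4] = ROL16(k[(n + 5) & 7], 8);  /* KO(i,2) */
        out[5] = kprime[(n + 3) & 7];       /* KI(i,2) */
        out[6] = ROL16(k[(n + 6) & 7], 13); /* KO(i,3) */
        out[7] = kprime[(n + 7) & 7];       /* KI(i,3) */
    }
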
+/*---------------------------------------------------------------------
+* kasumi_compute_sched()
+* Generic kasumi key sched init function.
+*
+*---------------------------------------------------------------------*/
+static inline int
+kasumi_compute_sched(const uint8_t modifier,
+ const void *const pKey, void *pCtx)
+{
+#ifdef SAFE_PARAM
+ /* Check for NULL pointers */
+ if (pKey == NULL || pCtx == NULL)
+ return -1;
+#endif
+ uint32_t i = 0;
+ const uint8_t *const key = (const uint8_t * const)pKey;
+ uint8_t ModKey[KASUMI_KEY_SIZE] = {0}; /* Modified key */
+ kasumi_key_sched_t *pLocalCtx = (kasumi_key_sched_t *)pCtx;
+
+ /* Construct the modified key*/
+ for (i = 0; i < KASUMI_KEY_SIZE; i++)
+ ModKey[i] = (uint8_t)key[i] ^ modifier;
+
+ kasumi_key_schedule_sk(pLocalCtx->sk16, pKey);
+ kasumi_key_schedule_sk(pLocalCtx->msk16, ModKey);
+
+#ifdef SAFE_DATA
+ clear_mem(ModKey, sizeof(ModKey));
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+ return 0;
+}
+
+/*---------------------------------------------------------------------
+* kasumi_key_sched_size()
+* Get the size of a kasumi key sched context.
+*
+*---------------------------------------------------------------------*/
+static inline size_t
+kasumi_key_sched_size(void)
+{
+ /*
+ * There are two keys that need to be scheduled: the original one and
+ * the modified one (xored with the relevant modifier)
+ */
+ return sizeof(kasumi_key_sched_t);
+}
+
+/*---------------------------------------------------------------------
+* kasumi_init_f8_key_sched()
+* Compute the kasumi f8 key schedule.
+*
+*---------------------------------------------------------------------*/
+
+static inline int
+kasumi_init_f8_key_sched(const void *const pKey,
+ kasumi_key_sched_t *pCtx)
+{
+ return kasumi_compute_sched(0x55, pKey, pCtx);
+}
+
+/*---------------------------------------------------------------------
+* kasumi_init_f9_key_sched()
+* Compute the kasumi f9 key schedule.
+*
+*---------------------------------------------------------------------*/
+
+static inline int
+kasumi_init_f9_key_sched(const void *const pKey,
+ kasumi_key_sched_t *pCtx)
+{
+ return kasumi_compute_sched(0xAA, pKey, pCtx);
+}
+
+size_t
+kasumi_key_sched_size_sse(void);
+
+int
+kasumi_init_f8_key_sched_sse(const void *pKey, kasumi_key_sched_t *pCtx);
+
+int
+kasumi_init_f9_key_sched_sse(const void *pKey, kasumi_key_sched_t *pCtx);
+
+size_t
+kasumi_key_sched_size_avx(void);
+
+int
+kasumi_init_f8_key_sched_avx(const void *pKey, kasumi_key_sched_t *pCtx);
+
+int
+kasumi_init_f9_key_sched_avx(const void *pKey, kasumi_key_sched_t *pCtx);
+
+
+static inline void
+kasumi_f8_1_buffer(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+ const void *pIn, void *pOut,
+ const uint32_t length)
+{
+ uint32_t blkcnt;
+ kasumi_union_t a, b; /* the modifier */
+ SafeBuf safeInBuf;
+ const uint8_t *pBufferIn = (const uint8_t *) pIn;
+ uint8_t *pBufferOut = (uint8_t *) pOut;
+ uint32_t lengthInBytes = length;
+
+ /* IV Endianity */
+ a.b64[0] = BSWAP64(IV);
+
+ /* First encryption to create modifier */
+ kasumi_1_block(pCtx->msk16, a.b16);
+
+ /* Final initialisation steps */
+ blkcnt = 0;
+ b.b64[0] = a.b64[0];
+
+ /* Now run the block cipher */
+ while (lengthInBytes) {
+ /* KASUMI it to produce the next block of keystream */
+ kasumi_1_block(pCtx->sk16, b.b16);
+
+ if (lengthInBytes > KASUMI_BLOCK_SIZE) {
+ pBufferIn = xor_keystrm_rev(pBufferOut, pBufferIn,
+ b.b64[0]);
+ pBufferOut += KASUMI_BLOCK_SIZE;
+ /* loop variant */
+ /* done another 64 bits */
+ lengthInBytes -= KASUMI_BLOCK_SIZE;
+
+ /* apply the modifier and update the block count */
+ b.b64[0] ^= a.b64[0];
+ b.b16[0] ^= (uint16_t)++blkcnt;
+ } else if (lengthInBytes < KASUMI_BLOCK_SIZE) {
+ /* end of the loop, handle the last bytes */
+ memcpy_keystrm(safeInBuf.b8, pBufferIn,
+ lengthInBytes);
+ xor_keystrm_rev(b.b8, safeInBuf.b8, b.b64[0]);
+ memcpy_keystrm(pBufferOut, b.b8, lengthInBytes);
+ lengthInBytes = 0;
+ /* lengthInBytes == KASUMI_BLOCK_SIZE */
+ } else {
+ xor_keystrm_rev(pBufferOut, pBufferIn, b.b64[0]);
+ lengthInBytes = 0;
+ }
+ }
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(&a, sizeof(a));
+ clear_mem(&b, sizeof(b));
+ clear_mem(&safeInBuf, sizeof(safeInBuf));
+#endif
+}
+
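In summary, the loop above implements the following f8 keystream recurrence; a hedged C sketch (not part of the library) that mirrors it using the helpers defined earlier in this header:

    /* Produce nblocks 64-bit keystream words for a given IV. */
    static void kasumi_f8_keystream_sketch(const kasumi_key_sched_t *pCtx,
                                           const uint64_t IV,
                                           uint64_t *ks, uint32_t nblocks)
    {
        kasumi_union_t a, b;
        uint32_t n;

        a.b64[0] = BSWAP64(IV);
        kasumi_1_block(pCtx->msk16, a.b16);    /* A: one-off "modifier" block */
        b = a;
        for (n = 1; n <= nblocks; n++) {
            kasumi_1_block(pCtx->sk16, b.b16); /* next 64 keystream bits */
            ks[n - 1] = b.b64[0];
            b.b64[0] ^= a.b64[0];              /* fold the modifier back in */
            b.b16[0] ^= (uint16_t)n;           /* and the running block count */
        }
    }
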
+static inline void
+preserve_bits(kasumi_union_t *c,
+ const uint8_t *pcBufferOut, const uint8_t *pcBufferIn,
+ SafeBuf *safeOutBuf, SafeBuf *safeInBuf,
+ const uint8_t bit_len, const uint8_t byte_len)
+{
+ const uint64_t mask = UINT64_MAX << (KASUMI_BLOCK_SIZE * 8 - bit_len);
+
+ /* Clear the last bits of the keystream and the input
+ * (input only in out-of-place case) */
+ c->b64[0] &= mask;
+ if (pcBufferIn != pcBufferOut) {
+ const uint64_t swapMask = BSWAP64(mask);
+
+ safeInBuf->b64 &= swapMask;
+
+ /*
+ * Merge the last bits from the output, to be preserved,
+ * in the keystream, to be XOR'd with the input
+ * (whose last bits are 0, maintaining the output bits)
+ */
+ memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len);
+ c->b64[0] |= BSWAP64(safeOutBuf->b64 & ~swapMask);
+ }
+}
+
+static inline void
+kasumi_f8_1_buffer_bit(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+ const void *pIn, void *pOut,
+ const uint32_t lengthInBits,
+ const uint32_t offsetInBits)
+{
+ const uint8_t *pBufferIn = (const uint8_t *) pIn;
+ uint8_t *pBufferOut = (uint8_t *) pOut;
+ uint32_t cipherLengthInBits = lengthInBits;
+ uint32_t blkcnt;
+ uint64_t shiftrem = 0;
+ kasumi_union_t a, b, c; /* the modifier */
+ const uint8_t *pcBufferIn = pBufferIn + (offsetInBits / 8);
+ uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8);
+ /* Offset into the first byte (0 - 7 bits) */
+ uint32_t remainOffset = offsetInBits % 8;
+ uint32_t byteLength = (cipherLengthInBits + 7) / 8;
+ SafeBuf safeOutBuf;
+ SafeBuf safeInBuf;
+
+ /* IV Endianity */
+ a.b64[0] = BSWAP64(IV);
+
+ /* First encryption to create modifier */
+ kasumi_1_block(pCtx->msk16, a.b16);
+
+ /* Final initialisation steps */
+ blkcnt = 0;
+ b.b64[0] = a.b64[0];
+ /* Now run the block cipher */
+
+ /* Start with potential partial block (due to offset and length) */
+ kasumi_1_block(pCtx->sk16, b.b16);
+ c.b64[0] = b.b64[0] >> remainOffset;
+ /* Only one block to encrypt */
+ if (cipherLengthInBits < (64 - remainOffset)) {
+ byteLength = (cipherLengthInBits + 7) / 8;
+ memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength);
+ /*
+ * If operation is Out-of-place and there is offset
+ * to be applied, "remainOffset" bits from the output buffer
+ * need to be preserved (only applicable to first byte,
+ * since remainOffset is up to 7 bits)
+ */
+ if ((pIn != pOut) && remainOffset) {
+ const uint8_t mask8 =
+ (const uint8_t)(1 << (8 - remainOffset)) - 1;
+
+ safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+ (pcBufferOut[0] & ~mask8);
+ }
+
+ /* If last byte is a partial byte, the last bits of the output
+ * need to be preserved */
+ const uint8_t bitlen_with_off = remainOffset +
+ cipherLengthInBits;
+
+ if ((bitlen_with_off & 0x7) != 0) {
+ preserve_bits(&c, pcBufferOut, pcBufferIn, &safeOutBuf,
+ &safeInBuf, bitlen_with_off, byteLength);
+ }
+ xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, c.b64[0]);
+ memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength);
+ return;
+ }
+
+ /*
+ * If operation is Out-of-place and there is offset
+ * to be applied, "remainOffset" bits from the output buffer
+ * need to be preserved (only applicable to first byte,
+ * since remainOffset is up to 7 bits)
+ */
+ if ((pIn != pOut) && remainOffset) {
+ const uint8_t mask8 =
+ (const uint8_t)(1 << (8 - remainOffset)) - 1;
+
+ memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8);
+ safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+ (pcBufferOut[0] & ~mask8);
+ xor_keystrm_rev(pcBufferOut, safeInBuf.b8, c.b64[0]);
+ pcBufferIn += KASUMI_BLOCK_SIZE;
+ } else {
+ /* At least 64 bits to produce (including offset) */
+ pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, c.b64[0]);
+ }
+
+ if (remainOffset != 0)
+ shiftrem = b.b64[0] << (64 - remainOffset);
+ cipherLengthInBits -= KASUMI_BLOCK_SIZE * 8 - remainOffset;
+ pcBufferOut += KASUMI_BLOCK_SIZE;
+ /* apply the modifier and update the block count */
+ b.b64[0] ^= a.b64[0];
+ b.b16[0] ^= (uint16_t)++blkcnt;
+
+ while (cipherLengthInBits) {
+ /* KASUMI it to produce the next block of keystream */
+ kasumi_1_block(pCtx->sk16, b.b16);
+ c.b64[0] = (b.b64[0] >> remainOffset) | shiftrem;
+ if (remainOffset != 0)
+ shiftrem = b.b64[0] << (64 - remainOffset);
+ if (cipherLengthInBits >= KASUMI_BLOCK_SIZE * 8) {
+ pcBufferIn = xor_keystrm_rev(pcBufferOut,
+ pcBufferIn, c.b64[0]);
+ cipherLengthInBits -= KASUMI_BLOCK_SIZE * 8;
+ pcBufferOut += KASUMI_BLOCK_SIZE;
+ /* loop variant */
+
+ /* apply the modifier and update the block count */
+ b.b64[0] ^= a.b64[0];
+ b.b16[0] ^= (uint16_t)++blkcnt;
+ } else {
+ /* end of the loop, handle the last bytes */
+ byteLength = (cipherLengthInBits + 7) / 8;
+ memcpy_keystrm(safeInBuf.b8, pcBufferIn,
+ byteLength);
+
+ /* If last byte is a partial byte, the last bits
+ * of the output need to be preserved */
+ if ((cipherLengthInBits & 0x7) != 0)
+ preserve_bits(&c, pcBufferOut, pcBufferIn,
+ &safeOutBuf, &safeInBuf,
+ cipherLengthInBits, byteLength);
+ xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, c.b64[0]);
+ memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength);
+ cipherLengthInBits = 0;
+ }
+ }
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(&a, sizeof(a));
+ clear_mem(&b, sizeof(b));
+ clear_mem(&c, sizeof(c));
+ clear_mem(&safeInBuf, sizeof(safeInBuf));
+ clear_mem(&safeOutBuf, sizeof(safeOutBuf));
+#endif
+}
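+
+/*
+ * Worked example of the offset handling above (illustrative note, not part
+ * of the original source): with offsetInBits == 12, pcBufferIn/pcBufferOut
+ * start 1 byte (12 / 8) into the buffers and remainOffset == 4 (12 % 8).
+ * Each keystream qword is then consumed as c = b >> 4, and the 4 bits
+ * shifted out are carried into the next block through "shiftrem", so no
+ * keystream bits are lost across block boundaries.
+ */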
+
+static inline void
+kasumi_f8_2_buffer(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV1, const uint64_t IV2,
+ const void *pIn1, void *pOut1,
+ const uint32_t length1,
+ const void *pIn2, void *pOut2,
+ const uint32_t length2)
+{
+ const uint8_t *pBufferIn1 = (const uint8_t *) pIn1;
+ uint8_t *pBufferOut1 = (uint8_t *) pOut1;
+ uint32_t lengthInBytes1 = length1;
+ const uint8_t *pBufferIn2 = (const uint8_t *) pIn2;
+ uint8_t *pBufferOut2 = (uint8_t *) pOut2;
+ uint32_t lengthInBytes2 = length2;
+ uint32_t blkcnt, length;
+ kasumi_union_t a1, b1; /* the modifier */
+ kasumi_union_t a2, b2; /* the modifier */
+ SafeBuf safeInBuf;
+
+ kasumi_union_t temp;
+
+ /* IV Endianity */
+ a1.b64[0] = BSWAP64(IV1);
+ a2.b64[0] = BSWAP64(IV2);
+
+ kasumi_2_blocks(pCtx->msk16, a1.b16, a2.b16);
+
+ /* Final initialisation steps */
+ blkcnt = 0;
+ b1.b64[0] = a1.b64[0];
+ b2.b64[0] = a2.b64[0];
+
+ /* check which packet is longer and save "common" shortest length */
+ if (lengthInBytes1 > lengthInBytes2)
+ length = lengthInBytes2;
+ else
+ length = lengthInBytes1;
+
+ /* Round down to a whole number of qwords (mask off the low 3 bits) */
+ length &= ~7;
+ lengthInBytes1 -= length;
+ lengthInBytes2 -= length;
+
+ /* Now run the block cipher for common packet length, a whole number of
+ * blocks */
+ while (length) {
+ /* KASUMI it to produce the next block of keystream for both
+ * packets */
+ kasumi_2_blocks(pCtx->sk16, b1.b16, b2.b16);
+
+ /* xor and write keystream */
+ pBufferIn1 =
+ xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
+ pBufferOut1 += KASUMI_BLOCK_SIZE;
+ pBufferIn2 =
+ xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]);
+ pBufferOut2 += KASUMI_BLOCK_SIZE;
+ /* loop variant */
+ length -= KASUMI_BLOCK_SIZE; /* done another 64 bits */
+
+ /* apply the modifier and update the block count */
+ b1.b64[0] ^= a1.b64[0];
+ b1.b16[0] ^= (uint16_t)++blkcnt;
+ b2.b64[0] ^= a2.b64[0];
+ b2.b16[0] ^= (uint16_t)blkcnt;
+ }
+
+ /*
+ * Process common part at end of first packet and second packet.
+ * One of the packets has a length less than 8 bytes.
+ */
+ if (lengthInBytes1 > 0 && lengthInBytes2 > 0) {
+ /* final round for 1 of the packets */
+ kasumi_2_blocks(pCtx->sk16, b1.b16, b2.b16);
+ if (lengthInBytes1 > KASUMI_BLOCK_SIZE) {
+ pBufferIn1 = xor_keystrm_rev(pBufferOut1,
+ pBufferIn1, b1.b64[0]);
+ pBufferOut1 += KASUMI_BLOCK_SIZE;
+ b1.b64[0] ^= a1.b64[0];
+ b1.b16[0] ^= (uint16_t)++blkcnt;
+ lengthInBytes1 -= KASUMI_BLOCK_SIZE;
+ } else if (lengthInBytes1 < KASUMI_BLOCK_SIZE) {
+ memcpy_keystrm(safeInBuf.b8, pBufferIn1,
+ lengthInBytes1);
+ xor_keystrm_rev(temp.b8, safeInBuf.b8, b1.b64[0]);
+ memcpy_keystrm(pBufferOut1, temp.b8,
+ lengthInBytes1);
+ lengthInBytes1 = 0;
+ /* lengthInBytes1 == KASUMI_BLOCK_SIZE */
+ } else {
+ xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
+ lengthInBytes1 = 0;
+ }
+ if (lengthInBytes2 > KASUMI_BLOCK_SIZE) {
+ pBufferIn2 = xor_keystrm_rev(pBufferOut2,
+ pBufferIn2, b2.b64[0]);
+ pBufferOut2 += KASUMI_BLOCK_SIZE;
+ b2.b64[0] ^= a2.b64[0];
+ b2.b16[0] ^= (uint16_t)++blkcnt;
+ lengthInBytes2 -= KASUMI_BLOCK_SIZE;
+ } else if (lengthInBytes2 < KASUMI_BLOCK_SIZE) {
+ memcpy_keystrm(safeInBuf.b8, pBufferIn2,
+ lengthInBytes2);
+ xor_keystrm_rev(temp.b8, safeInBuf.b8, b2.b64[0]);
+ memcpy_keystrm(pBufferOut2, temp.b8,
+ lengthInBytes2);
+ lengthInBytes2 = 0;
+ /* lengthInBytes2 == KASUMI_BLOCK_SIZE */
+ } else {
+ xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]);
+ lengthInBytes2 = 0;
+ }
+ }
+
+ if (lengthInBytes1 < lengthInBytes2) {
+ /* packet 2 is not completed since lengthInBytes2 > 0
+ * packet 1 has less than 8 bytes.
+ */
+ if (lengthInBytes1) {
+ kasumi_1_block(pCtx->sk16, b1.b16);
+ xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
+ }
+ /* move pointers to right variables for packet 1 */
+ lengthInBytes1 = lengthInBytes2;
+ b1.b64[0] = b2.b64[0];
+ a1.b64[0] = a2.b64[0];
+ pBufferIn1 = pBufferIn2;
+ pBufferOut1 = pBufferOut2;
+ } else { /* lengthInBytes1 >= lengthInBytes2 */
+ if (!lengthInBytes1)
+ /* both packets are completed */
+ return;
+ /* process the remaining of packet 2 */
+ if (lengthInBytes2) {
+ kasumi_1_block(pCtx->sk16, b2.b16);
+ xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]);
+ }
+ /* packet 1 is not completed */
+ }
+
+ /* process the length difference between pkt1 and pkt2 */
+ while (lengthInBytes1) {
+ /* KASUMI it to produce the next block of keystream */
+ kasumi_1_block(pCtx->sk16, b1.b16);
+
+ if (lengthInBytes1 > KASUMI_BLOCK_SIZE) {
+ pBufferIn1 = xor_keystrm_rev(pBufferOut1,
+ pBufferIn1, b1.b64[0]);
+ pBufferOut1 += KASUMI_BLOCK_SIZE;
+ /* loop variant */
+ lengthInBytes1 -= KASUMI_BLOCK_SIZE;
+
+ /* apply the modifier and update the block count */
+ b1.b64[0] ^= a1.b64[0];
+ b1.b16[0] ^= (uint16_t)++blkcnt;
+ } else if (lengthInBytes1 < KASUMI_BLOCK_SIZE) {
+ /* end of the loop, handle the last bytes */
+ memcpy_keystrm(safeInBuf.b8, pBufferIn1,
+ lengthInBytes1);
+ xor_keystrm_rev(temp.b8, safeInBuf.b8, b1.b64[0]);
+ memcpy_keystrm(pBufferOut1, temp.b8,
+ lengthInBytes1);
+ lengthInBytes1 = 0;
+ /* lengthInBytes1 == KASUMI_BLOCK_SIZE */
+ } else {
+ xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
+ lengthInBytes1 = 0;
+ }
+ }
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(&a1, sizeof(a1));
+ clear_mem(&b1, sizeof(b1));
+ clear_mem(&a2, sizeof(a2));
+ clear_mem(&b2, sizeof(b2));
+ clear_mem(&temp, sizeof(temp));
+ clear_mem(&safeInBuf, sizeof(safeInBuf));
+#endif
+}
+
+static inline void
+kasumi_f8_3_buffer(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV1, const uint64_t IV2, const uint64_t IV3,
+ const void *pIn1, void *pOut1,
+ const void *pIn2, void *pOut2,
+ const void *pIn3, void *pOut3,
+ const uint32_t length)
+{
+ const uint8_t *pBufferIn1 = (const uint8_t *) pIn1;
+ uint8_t *pBufferOut1 = (uint8_t *) pOut1;
+ const uint8_t *pBufferIn2 = (const uint8_t *) pIn2;
+ uint8_t *pBufferOut2 = (uint8_t *) pOut2;
+ const uint8_t *pBufferIn3 = (const uint8_t *) pIn3;
+ uint8_t *pBufferOut3 = (uint8_t *) pOut3;
+ uint32_t lengthInBytes = length;
+ uint32_t blkcnt;
+ kasumi_union_t a1, b1; /* the modifier */
+ kasumi_union_t a2, b2; /* the modifier */
+ kasumi_union_t a3, b3; /* the modifier */
+ SafeBuf safeInBuf1, safeInBuf2, safeInBuf3;
+
+ /* IV Endianity */
+ a1.b64[0] = BSWAP64(IV1);
+ a2.b64[0] = BSWAP64(IV2);
+ a3.b64[0] = BSWAP64(IV3);
+
+ kasumi_3_blocks(pCtx->msk16, a1.b16, a2.b16, a3.b16);
+
+ /* Final initialisation steps */
+ blkcnt = 0;
+ b1.b64[0] = a1.b64[0];
+ b2.b64[0] = a2.b64[0];
+ b3.b64[0] = a3.b64[0];
+
+ /* Now run the block cipher for common packet lengthInBytes, a whole
+ * number of blocks */
+ while (lengthInBytes) {
+ /* KASUMI it to produce the next block of keystream for all the
+ * packets */
+ kasumi_3_blocks(pCtx->sk16, b1.b16, b2.b16, b3.b16);
+
+ if (lengthInBytes > KASUMI_BLOCK_SIZE) {
+ /* xor and write keystream */
+ pBufferIn1 = xor_keystrm_rev(pBufferOut1,
+ pBufferIn1, b1.b64[0]);
+ pBufferOut1 += KASUMI_BLOCK_SIZE;
+ pBufferIn2 = xor_keystrm_rev(pBufferOut2,
+ pBufferIn2, b2.b64[0]);
+ pBufferOut2 += KASUMI_BLOCK_SIZE;
+ pBufferIn3 = xor_keystrm_rev(pBufferOut3,
+ pBufferIn3, b3.b64[0]);
+ pBufferOut3 += KASUMI_BLOCK_SIZE;
+ /* loop variant */
+ lengthInBytes -= KASUMI_BLOCK_SIZE;
+
+ /* apply the modifier and update the block count */
+ b1.b64[0] ^= a1.b64[0];
+ b1.b16[0] ^= (uint16_t)++blkcnt;
+ b2.b64[0] ^= a2.b64[0];
+ b2.b16[0] ^= (uint16_t)blkcnt;
+ b3.b64[0] ^= a3.b64[0];
+ b3.b16[0] ^= (uint16_t)blkcnt;
+ } else if (lengthInBytes < KASUMI_BLOCK_SIZE) {
+ /* end of the loop, handle the last bytes */
+ memcpy_keystrm(safeInBuf1.b8, pBufferIn1,
+ lengthInBytes);
+ xor_keystrm_rev(b1.b8, safeInBuf1.b8, b1.b64[0]);
+ memcpy_keystrm(pBufferOut1, b1.b8, lengthInBytes);
+
+ memcpy_keystrm(safeInBuf2.b8, pBufferIn2,
+ lengthInBytes);
+ xor_keystrm_rev(b2.b8, safeInBuf2.b8, b2.b64[0]);
+ memcpy_keystrm(pBufferOut2, b2.b8, lengthInBytes);
+
+ memcpy_keystrm(safeInBuf3.b8, pBufferIn3,
+ lengthInBytes);
+ xor_keystrm_rev(b3.b8, safeInBuf3.b8, b3.b64[0]);
+ memcpy_keystrm(pBufferOut3, b3.b8, lengthInBytes);
+ lengthInBytes = 0;
+ /* lengthInBytes == KASUMI_BLOCK_SIZE */
+ } else {
+ xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
+ xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]);
+ xor_keystrm_rev(pBufferOut3, pBufferIn3, b3.b64[0]);
+ lengthInBytes = 0;
+ }
+ }
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(&a1, sizeof(a1));
+ clear_mem(&b1, sizeof(b1));
+ clear_mem(&a2, sizeof(a2));
+ clear_mem(&b2, sizeof(b2));
+ clear_mem(&a3, sizeof(a3));
+ clear_mem(&b3, sizeof(b3));
+ clear_mem(&safeInBuf1, sizeof(safeInBuf1));
+ clear_mem(&safeInBuf2, sizeof(safeInBuf2));
+ clear_mem(&safeInBuf3, sizeof(safeInBuf3));
+#endif
+}
+
+/*---------------------------------------------------------
+* @description
+* Kasumi F8 4 packet:
+* Four packets enc/dec with the same key schedule.
+* The 4 IVs are independent and are passed as individual arguments.
+* The packets are separate, the data length is common.
+*---------------------------------------------------------*/
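+
+/*
+ * Illustrative call sketch (not part of the original header); "sched" is
+ * assumed to be a key schedule expanded beforehand and all four packets
+ * share the same byte length:
+ *
+ *   kasumi_f8_4_buffer(&sched, iv1, iv2, iv3, iv4,
+ *                      in1, out1, in2, out2, in3, out3, in4, out4, len);
+ */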
+
+static inline void
+kasumi_f8_4_buffer(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
+ const uint64_t IV2, const uint64_t IV3, const uint64_t IV4,
+ const void *pIn1, void *pOut1,
+ const void *pIn2, void *pOut2,
+ const void *pIn3, void *pOut3,
+ const void *pIn4, void *pOut4,
+ const uint32_t length)
+{
+ const uint8_t *pBufferIn1 = (const uint8_t *) pIn1;
+ uint8_t *pBufferOut1 = (uint8_t *) pOut1;
+ const uint8_t *pBufferIn2 = (const uint8_t *) pIn2;
+ uint8_t *pBufferOut2 = (uint8_t *) pOut2;
+ const uint8_t *pBufferIn3 = (const uint8_t *) pIn3;
+ uint8_t *pBufferOut3 = (uint8_t *) pOut3;
+ const uint8_t *pBufferIn4 = (const uint8_t *) pIn4;
+ uint8_t *pBufferOut4 = (uint8_t *) pOut4;
+ uint32_t lengthInBytes = length;
+ uint32_t blkcnt;
+ kasumi_union_t a1, b1; /* the modifier */
+ kasumi_union_t a2, b2; /* the modifier */
+ kasumi_union_t a3, b3; /* the modifier */
+ kasumi_union_t a4, b4; /* the modifier */
+ uint16_t *pTemp[4] = {b1.b16, b2.b16, b3.b16, b4.b16};
+ SafeBuf safeInBuf1, safeInBuf2, safeInBuf3, safeInBuf4;
+
+ /* IV Endianity */
+ b1.b64[0] = BSWAP64(IV1);
+ b2.b64[0] = BSWAP64(IV2);
+ b3.b64[0] = BSWAP64(IV3);
+ b4.b64[0] = BSWAP64(IV4);
+
+ kasumi_4_blocks(pCtx->msk16, pTemp);
+
+ /* Final initialisation steps */
+ blkcnt = 0;
+ a1.b64[0] = b1.b64[0];
+ a2.b64[0] = b2.b64[0];
+ a3.b64[0] = b3.b64[0];
+ a4.b64[0] = b4.b64[0];
+
+ /* Now run the block cipher for common packet lengthInBytes, a whole
+ * number of blocks */
+ while (lengthInBytes) {
+ /* KASUMI it to produce the next block of keystream for all the
+ * packets */
+ kasumi_4_blocks(pCtx->sk16, pTemp);
+
+ if (lengthInBytes > KASUMI_BLOCK_SIZE) {
+ /* xor and write keystream */
+ pBufferIn1 = xor_keystrm_rev(pBufferOut1,
+ pBufferIn1, b1.b64[0]);
+ pBufferOut1 += KASUMI_BLOCK_SIZE;
+ pBufferIn2 = xor_keystrm_rev(pBufferOut2,
+ pBufferIn2, b2.b64[0]);
+ pBufferOut2 += KASUMI_BLOCK_SIZE;
+ pBufferIn3 = xor_keystrm_rev(pBufferOut3,
+ pBufferIn3, b3.b64[0]);
+ pBufferOut3 += KASUMI_BLOCK_SIZE;
+ pBufferIn4 = xor_keystrm_rev(pBufferOut4,
+ pBufferIn4, b4.b64[0]);
+ pBufferOut4 += KASUMI_BLOCK_SIZE;
+ /* loop variant */
+ lengthInBytes -= KASUMI_BLOCK_SIZE;
+
+ /* apply the modifier and update the block count */
+ b1.b64[0] ^= a1.b64[0];
+ b1.b16[0] ^= (uint16_t)++blkcnt;
+ b2.b64[0] ^= a2.b64[0];
+ b2.b16[0] ^= (uint16_t)blkcnt;
+ b3.b64[0] ^= a3.b64[0];
+ b3.b16[0] ^= (uint16_t)blkcnt;
+ b4.b64[0] ^= a4.b64[0];
+ b4.b16[0] ^= (uint16_t)blkcnt;
+ } else if (lengthInBytes < KASUMI_BLOCK_SIZE) {
+ /* end of the loop, handle the last bytes */
+ memcpy_keystrm(safeInBuf1.b8, pBufferIn1,
+ lengthInBytes);
+ xor_keystrm_rev(b1.b8, safeInBuf1.b8, b1.b64[0]);
+ memcpy_keystrm(pBufferOut1, b1.b8, lengthInBytes);
+
+ memcpy_keystrm(safeInBuf2.b8, pBufferIn2,
+ lengthInBytes);
+ xor_keystrm_rev(b2.b8, safeInBuf2.b8, b2.b64[0]);
+ memcpy_keystrm(pBufferOut2, b2.b8, lengthInBytes);
+
+ memcpy_keystrm(safeInBuf3.b8, pBufferIn3,
+ lengthInBytes);
+ xor_keystrm_rev(b3.b8, safeInBuf3.b8, b3.b64[0]);
+ memcpy_keystrm(pBufferOut3, b3.b8, lengthInBytes);
+
+ memcpy_keystrm(safeInBuf4.b8, pBufferIn4,
+ lengthInBytes);
+ xor_keystrm_rev(b4.b8, safeInBuf4.b8, b4.b64[0]);
+ memcpy_keystrm(pBufferOut4, b4.b8, lengthInBytes);
+ lengthInBytes = 0;
+ /* lengthInBytes == KASUMI_BLOCK_SIZE */
+ } else {
+ xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
+ xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]);
+ xor_keystrm_rev(pBufferOut3, pBufferIn3, b3.b64[0]);
+ xor_keystrm_rev(pBufferOut4, pBufferIn4, b4.b64[0]);
+ lengthInBytes = 0;
+ }
+ }
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(&a1, sizeof(a1));
+ clear_mem(&b1, sizeof(b1));
+ clear_mem(&a2, sizeof(a2));
+ clear_mem(&b2, sizeof(b2));
+ clear_mem(&a3, sizeof(a3));
+ clear_mem(&b3, sizeof(b3));
+ clear_mem(&a4, sizeof(a4));
+ clear_mem(&b4, sizeof(b4));
+ clear_mem(&safeInBuf1, sizeof(safeInBuf1));
+ clear_mem(&safeInBuf2, sizeof(safeInBuf2));
+ clear_mem(&safeInBuf3, sizeof(safeInBuf3));
+ clear_mem(&safeInBuf4, sizeof(safeInBuf4));
+#endif
+}
+
+/*---------------------------------------------------------
+* @description
+* Kasumi F8 2 packet:
+* Two packets enc/dec with the same key schedule.
+* The 2 IVs are independent and are passed as individual arguments.
+* The packets are separate, each with its own data length.
+*---------------------------------------------------------*/
+/******************************************************************************
+* @description
+* Kasumi F8 n packet:
+* Performs F8 enc/dec on [n] packets. The operation is performed in-place.
+* The input IV's are passed in Big Endian format.
+* The KeySchedule is in Little Endian format.
+*******************************************************************************/
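+
+/*
+ * Illustrative call sketch (not part of the original header): three packets
+ * with independent IVs and lengths, using a key schedule expanded beforehand.
+ * in[] and out[] hold one pointer per packet (they may reference the same
+ * buffers for in-place operation); bufCount is capped at 16 by the guard at
+ * the top of the function.
+ *
+ *   const uint64_t ivs[3]  = {iv0, iv1, iv2};
+ *   const uint32_t lens[3] = {len0, len1, len2};
+ *
+ *   kasumi_f8_n_buffer(&sched, ivs, in, out, lens, 3);
+ */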
+
+static inline void
+kasumi_f8_n_buffer(const kasumi_key_sched_t *pKeySchedule, const uint64_t IV[],
+ const void * const pIn[], void *pOut[],
+ const uint32_t lengths[], const uint32_t bufCount)
+{
+ if (bufCount > 16) {
+ pOut[0] = NULL;
+ printf("dataCount too high (%d)\n", bufCount);
+ return;
+ }
+
+ uint32_t dataCount = bufCount;
+ kasumi_union_t A[NUM_PACKETS_16], temp[NUM_PACKETS_16], tempSort;
+ uint16_t *data[NUM_PACKETS_16];
+ uint32_t dataLen[NUM_PACKETS_16];
+ uint8_t *pDataOut[NUM_PACKETS_16] = {NULL};
+ const uint8_t *pDataIn[NUM_PACKETS_16] = {NULL};
+ const uint8_t *srctempbuff;
+ uint8_t *dsttempbuff;
+ uint32_t blkcnt = 0;
+ uint32_t len = 0;
+ uint32_t packet_idx, inner_idx, same_size_blocks;
+ int sortNeeded = 0, tempLen = 0;
+ SafeBuf safeInBuf;
+
+ memcpy((void *)dataLen, lengths, dataCount * sizeof(uint32_t));
+ memcpy((void *)pDataIn, pIn, dataCount * sizeof(void *));
+ memcpy((void *)pDataOut, pOut, dataCount * sizeof(void *));
+
+ /* save the IV to A for each packet */
+ packet_idx = dataCount;
+ while (packet_idx--) {
+ /*copy IV in reverse endian order as input IV is BE */
+ temp[packet_idx].b64[0] = BSWAP64(IV[packet_idx]);
+
+ /* set LE IV pointers */
+ data[packet_idx] = temp[packet_idx].b16;
+
+ /* check if all packets are sorted by decreasing length */
+ if (packet_idx > 0 &&
+ dataLen[packet_idx - 1] < dataLen[packet_idx])
+ /* this packet array is not correctly sorted */
+ sortNeeded = 1;
+ }
+
+ /* do 1st kasumi block on A with modified key, this overwrites A */
+ kasumiWrapperArray[dataCount](pKeySchedule->msk16, data);
+
+ if (sortNeeded) {
+ /* sort packets in decreasing buffer size from [0] to [n]th packet,
+ * where buffer[0] will contain the longest buffer and buffer[n] will
+ * contain the shortest buffer.
+ * 4 arrays are swapped:
+ * - pointers to input buffers
+ * - pointers to output buffers
+ * - pointers to input IV's
+ * - input buffer lengths
+ */
+ packet_idx = dataCount;
+ while (packet_idx--) {
+ inner_idx = packet_idx;
+ while (inner_idx--) {
+ if (dataLen[packet_idx] > dataLen[inner_idx]) {
+
+ /* swap buffers to arrange in descending
+ * order from [0]. */
+ srctempbuff = pDataIn[packet_idx];
+ dsttempbuff = pDataOut[packet_idx];
+ tempSort = temp[packet_idx];
+ tempLen = dataLen[packet_idx];
+
+ pDataIn[packet_idx] =
+ pDataIn[inner_idx];
+ pDataOut[packet_idx] =
+ pDataOut[inner_idx];
+ temp[packet_idx] = temp[inner_idx];
+ dataLen[packet_idx] =
+ dataLen[inner_idx];
+
+ pDataIn[inner_idx] = srctempbuff;
+ pDataOut[inner_idx] = dsttempbuff;
+ temp[inner_idx] = tempSort;
+ dataLen[inner_idx] = tempLen;
+ }
+ } /* for inner packet idx (inner bubble-sort) */
+ } /* for outer packet idx (outer bubble-sort) */
+ } /* if sortNeeded */
+
+ packet_idx = dataCount;
+ while (packet_idx--)
+ /* copy the schedule */
+ A[packet_idx].b64[0] = temp[packet_idx].b64[0];
+
+ while (dataCount > 0) {
+ /* max num of blocks left depends on roundUp(smallest packet),
+ * The shortest stream to process is always stored at location
+ * [dataCount - 1]
+ */
+ same_size_blocks =
+ ((dataLen[dataCount - 1] + KASUMI_BLOCK_SIZE - 1) /
+ KASUMI_BLOCK_SIZE) -
+ blkcnt;
+
+ /* process streams of complete blocks */
+ while (same_size_blocks-- > 1) {
+ /* do kasumi block encryption */
+ kasumiWrapperArray[dataCount](pKeySchedule->sk16,
+ data);
+
+ packet_idx = dataCount;
+ while (packet_idx--)
+ xor_keystrm_rev(pDataOut[packet_idx] + len,
+ pDataIn[packet_idx] + len,
+ temp[packet_idx].b64[0]);
+
+ /* length already done since the start of the packets */
+ len += KASUMI_BLOCK_SIZE;
+
+ /* block idx is incremented and rewritten in the
+ * keystream */
+ blkcnt += 1;
+ packet_idx = dataCount;
+ while (packet_idx--) {
+ temp[packet_idx].b64[0] ^= A[packet_idx].b64[0];
+ temp[packet_idx].b16[0] ^= (uint16_t)blkcnt;
+ } /* for packet_idx */
+
+ } /* while same_size_blocks (iteration on multiple blocks) */
+
+ /* keystream for last block of all packets */
+ kasumiWrapperArray[dataCount](pKeySchedule->sk16, data);
+
+ /* process incomplete blocks without overwriting past the buffer
+ * end */
+ while ((dataCount > 0) &&
+ (dataLen[dataCount - 1] < (len + KASUMI_BLOCK_SIZE))) {
+
+ dataCount--;
+ /* incomplete block is copied into a temp buffer */
+ memcpy_keystrm(safeInBuf.b8, pDataIn[dataCount] + len,
+ dataLen[dataCount] - len);
+ xor_keystrm_rev(temp[dataCount].b8,
+ safeInBuf.b8,
+ temp[dataCount].b64[0]);
+
+ memcpy_keystrm(pDataOut[dataCount] + len,
+ temp[dataCount].b8,
+ dataLen[dataCount] - len);
+ } /* while dataCount */
+
+ /* process last blocks: it can be the last complete block of the
+ * packets or, if KASUMI_SAFE_BUFFER is defined, the last block
+ * (complete or not) of the packets */
+ while ((dataCount > 0) &&
+ (dataLen[dataCount - 1] <= (len + KASUMI_BLOCK_SIZE))) {
+
+ dataCount--;
+ xor_keystrm_rev(pDataOut[dataCount] + len,
+ pDataIn[dataCount] + len,
+ temp[dataCount].b64[0]);
+ } /* while dataCount */
+ /* block idx is incremented and rewritten in the keystream */
+ blkcnt += 1;
+
+ /* for the following packets, this block is not the last one:
+ dataCount is not decremented */
+ packet_idx = dataCount;
+ while (packet_idx--) {
+
+ xor_keystrm_rev(pDataOut[packet_idx] + len,
+ pDataIn[packet_idx] + len,
+ temp[packet_idx].b64[0]);
+ temp[packet_idx].b64[0] ^= A[packet_idx].b64[0];
+ temp[packet_idx].b16[0] ^= (uint16_t)blkcnt;
+ } /* while packet_idx */
+
+ /* length already done since the start of the packets */
+ len += KASUMI_BLOCK_SIZE;
+
+ /* the remaining packets, if any, have now at least one valid
+ block, which might be complete or not */
+
+ } /* while (dataCount) */
+#ifdef SAFE_DATA
+ uint32_t i;
+
+ /* Clear sensitive data in stack */
+ for (i = 0; i < dataCount; i++) {
+ clear_mem(&A[i], sizeof(A[i]));
+ clear_mem(&temp[i], sizeof(temp[i]));
+ }
+ clear_mem(&tempSort, sizeof(tempSort));
+ clear_mem(&safeInBuf, sizeof(safeInBuf));
+#endif
+}
+
+static inline void
+kasumi_f9_1_buffer(const kasumi_key_sched_t *pCtx, const void *dataIn,
+ const uint32_t length, void *pDigest)
+{
+ kasumi_union_t a, b, mask;
+ const uint64_t *pIn = (const uint64_t *)dataIn;
+ uint32_t lengthInBytes = length;
+ SafeBuf safeBuf;
+
+ /* Init */
+ a.b64[0] = 0;
+ b.b64[0] = 0;
+ mask.b64[0] = -1;
+
+ /* Now run kasumi for all 8 byte blocks */
+ while (lengthInBytes >= 8) {
+
+ a.b64[0] ^= BSWAP64(*(pIn++));
+
+ /* KASUMI it */
+ kasumi_1_block(pCtx->sk16, a.b16);
+
+ /* loop variant */
+ lengthInBytes -= 8; /* done another 64 bits */
+
+ /* update */
+ b.b64[0] ^= a.b64[0];
+ }
+
+ if (lengthInBytes) {
+ /* Not a whole 8 byte block remaining */
+ mask.b64[0] = ~(mask.b64[0] >> (BYTESIZE * lengthInBytes));
+ memcpy(&safeBuf.b64, pIn, lengthInBytes);
+ mask.b64[0] &= BSWAP64(safeBuf.b64);
+ a.b64[0] ^= mask.b64[0];
+
+ /* KASUMI it */
+ kasumi_1_block(pCtx->sk16, a.b16);
+
+ /* update */
+ b.b64[0] ^= a.b64[0];
+ }
+
+ /* Kasumi b */
+ kasumi_1_block(pCtx->msk16, b.b16);
+
+ /* swap result */
+ *(uint32_t *)pDigest = bswap4(b.b32[1]);
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(&a, sizeof(a));
+ clear_mem(&b, sizeof(b));
+ clear_mem(&mask, sizeof(mask));
+ clear_mem(&safeBuf, sizeof(safeBuf));
+#endif
+}
+
+/*---------------------------------------------------------
+* @description
+* Kasumi F9 1 packet with user config:
+* Single packet digest with user defined IV, and precomputed key schedule.
+*
+* IV = swap32(count) << 32 | swap32(fresh)
+*
+*---------------------------------------------------------*/
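+
+/*
+ * Illustrative sketch (not part of the original header) of how a caller
+ * might build the IV described above from 32-bit COUNT and FRESH values,
+ * using the bswap4() helper already used in this file:
+ *
+ *   const uint64_t iv = ((uint64_t)bswap4(count) << 32) | bswap4(fresh);
+ *
+ *   kasumi_f9_1_buffer_user(&sched, iv, msg, bit_len, digest, direction);
+ */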
+
+static inline void
+kasumi_f9_1_buffer_user(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+ const void *pDataIn, const uint32_t length,
+ void *pDigest, const uint32_t direction)
+{
+ kasumi_union_t a, b, mask, message, temp;
+ uint32_t lengthInBits = length;
+ const uint64_t *pIn = (const uint64_t *)pDataIn;
+ kasumi_union_t safebuff;
+
+ a.b64[0] = 0;
+ b.b64[0] = 0;
+
+ /* Use the count and fresh for first round */
+ a.b64[0] = BSWAP64(IV);
+ /* KASUMI it */
+ kasumi_1_block(pCtx->sk16, a.b16);
+ /* update */
+ b.b64[0] = a.b64[0];
+
+ /* Now run kasumi for all 8 byte blocks */
+ while (lengthInBits >= QWORDSIZEINBITS) {
+ a.b64[0] ^= BSWAP64(*(pIn++));
+ /* KASUMI it */
+ kasumi_1_block(pCtx->sk16, a.b16);
+ /* loop variant */
+ lengthInBits -= 64; /* done another 64 bits */
+ /* update */
+ b.b64[0] ^= a.b64[0];
+ }
+
+ /* Is there any non 8 byte blocks remaining ? */
+ if (lengthInBits == 0) {
+ /* last block is : direct + 1 + 62 0's */
+ a.b64[0] ^= ((uint64_t)direction + direction + LAST_PADDING_BIT)
+ << (QWORDSIZEINBITS - 2);
+ kasumi_1_block(pCtx->sk16, a.b16);
+ /* update */
+ b.b64[0] ^= a.b64[0];
+ } else if (lengthInBits <= (QWORDSIZEINBITS - 2)) {
+ /* last block is : message + direction + LAST_PADDING_BITS(1) +
+ * less than 62 0's */
+ mask.b64[0] = -1;
+ temp.b64[0] = 0;
+ message.b64[0] = 0;
+ mask.b64[0] = ~(mask.b64[0] >> lengthInBits);
+ /*round up and copy last lengthInBits */
+ memcpy(&safebuff.b64[0], pIn, (lengthInBits + 7) / 8);
+ message.b64[0] = BSWAP64(safebuff.b64[0]);
+ temp.b64[0] = mask.b64[0] & message.b64[0];
+ temp.b64[0] |=
+ ((uint64_t)direction + direction + LAST_PADDING_BIT)
+ << ((QWORDSIZEINBITS - 2) - lengthInBits);
+ a.b64[0] ^= temp.b64[0];
+ /* KASUMI it */
+ kasumi_1_block(pCtx->sk16, a.b16);
+
+ /* update */
+ b.b64[0] ^= a.b64[0];
+ } else if (lengthInBits == (QWORDSIZEINBITS - 1)) {
+ /* next block is : message + direct */
+ /* last block is : 1 + 63 0's */
+ a.b64[0] ^= direction | (~1 & BSWAP64(*(pIn++)));
+ /* KASUMI it */
+ kasumi_1_block(pCtx->sk16, a.b16);
+ /* update */
+ b.b64[0] ^= a.b64[0];
+ a.b8[QWORDSIZEINBYTES - 1] ^= (LAST_PADDING_BIT)
+ << (QWORDSIZEINBYTES - 1);
+ /* KASUMI it */
+ kasumi_1_block(pCtx->sk16, a.b16);
+ /* update */
+ b.b64[0] ^= a.b64[0];
+ }
+ /* Kasumi b */
+ kasumi_1_block(pCtx->msk16, b.b16);
+
+ /* swap result */
+ *(uint32_t *)pDigest = bswap4(b.b32[1]);
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(&a, sizeof(a));
+ clear_mem(&b, sizeof(b));
+ clear_mem(&mask, sizeof(mask));
+ clear_mem(&message, sizeof(message));
+ clear_mem(&temp, sizeof(temp));
+ clear_mem(&safebuff, sizeof(safebuff));
+#endif
+}
+
+void kasumi_f8_1_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+ const void *pBufferIn, void *pBufferOut,
+ const uint32_t cipherLengthInBytes);
+
+void kasumi_f8_1_buffer_bit_sse(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV,
+ const void *pBufferIn, void *pBufferOut,
+ const uint32_t cipherLengthInBits,
+ const uint32_t offsetInBits);
+
+void kasumi_f8_2_buffer_sse(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV1, const uint64_t IV2,
+ const void *pBufferIn1, void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const uint32_t lengthInBytes2);
+
+void kasumi_f8_3_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
+ const uint64_t IV2, const uint64_t IV3,
+ const void *pBufferIn1, void *pBufferOut1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const void *pBufferIn3, void *pBufferOut3,
+ const uint32_t lengthInBytes);
+
+void kasumi_f8_4_buffer_sse(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV1, const uint64_t IV2,
+ const uint64_t IV3, const uint64_t IV4,
+ const void *pBufferIn1, void *pBufferOut1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const void *pBufferIn3, void *pBufferOut3,
+ const void *pBufferIn4, void *pBufferOut4,
+ const uint32_t lengthInBytes);
+
+void kasumi_f8_n_buffer_sse(const kasumi_key_sched_t *pKeySchedule,
+ const uint64_t IV[],
+ const void * const pDataIn[], void *pDataOut[],
+ const uint32_t dataLen[], const uint32_t dataCount);
+
+void kasumi_f9_1_buffer_sse(const kasumi_key_sched_t *pCtx,
+ const void *pBufferIn,
+ const uint32_t lengthInBytes, void *pDigest);
+
+void kasumi_f9_1_buffer_user_sse(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV, const void *pBufferIn,
+ const uint32_t lengthInBits,
+ void *pDigest, const uint32_t direction);
+
+
+void kasumi_f8_1_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+ const void *pBufferIn, void *pBufferOut,
+ const uint32_t cipherLengthInBytes);
+void kasumi_f8_1_buffer_bit_avx(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV,
+ const void *pBufferIn, void *pBufferOut,
+ const uint32_t cipherLengthInBits,
+ const uint32_t offsetInBits);
+void kasumi_f8_2_buffer_avx(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV1, const uint64_t IV2,
+ const void *pBufferIn1, void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const uint32_t lengthInBytes2);
+void kasumi_f8_3_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
+ const uint64_t IV2, const uint64_t IV3,
+ const void *pBufferIn1, void *pBufferOut1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const void *pBufferIn3, void *pBufferOut3,
+ const uint32_t lengthInBytes);
+void kasumi_f8_4_buffer_avx(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV1, const uint64_t IV2,
+ const uint64_t IV3, const uint64_t IV4,
+ const void *pBufferIn1, void *pBufferOut1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const void *pBufferIn3, void *pBufferOut3,
+ const void *pBufferIn4, void *pBufferOut4,
+ const uint32_t lengthInBytes);
+void kasumi_f8_n_buffer_avx(const kasumi_key_sched_t *pKeySchedule,
+ const uint64_t IV[],
+ const void * const pDataIn[], void *pDataOut[],
+ const uint32_t dataLen[], const uint32_t dataCount);
+
+void kasumi_f9_1_buffer_avx(const kasumi_key_sched_t *pCtx,
+ const void *pBufferIn,
+ const uint32_t lengthInBytes, void *pDigest);
+
+void kasumi_f9_1_buffer_user_avx(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV, const void *pBufferIn,
+ const uint32_t lengthInBits,
+ void *pDigest, const uint32_t direction);
+#endif /*_KASUMI_INTERNAL_H_*/
+
diff --git a/src/spdk/intel-ipsec-mb/include/memcpy.asm b/src/spdk/intel-ipsec-mb/include/memcpy.asm
new file mode 100644
index 000000000..82e4f2cb2
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/memcpy.asm
@@ -0,0 +1,613 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef __MEMCPY_ASM__
+%define __MEMCPY_ASM__
+
+%include "include/reg_sizes.asm"
+
+
+; This section defines a series of macros to copy small to medium amounts
+; of data from memory to memory, where the size is variable but limited.
+;
+; The macros are all called as:
+; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
+; with the parameters defined as:
+; DST : register: pointer to dst (not modified)
+; SRC : register: pointer to src (not modified)
+; SIZE : register: length in bytes (not modified)
+; TMP0 : 64-bit temp GPR (clobbered)
+; TMP1 : 64-bit temp GPR (clobbered)
+; XTMP0 : temp XMM (clobbered)
+; XTMP1 : temp XMM (clobbered)
+; XTMP2 : temp XMM (clobbered)
+; XTMP3 : temp XMM (clobbered)
+;
+; The name indicates the options. The name is of the form:
+; memcpy_<VEC>_<SZ><ZERO><RET>
+; where:
+; <VEC> is either "sse" or "avx" or "avx2"
+; <SZ> is either "64" or "128" and defines largest value of SIZE
+; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
+; <RET> is blank or "_ret". If blank, the code falls through. If "_ret",
+; it does a "ret" at the end
+;
+; For the avx2 versions, the temp XMM registers need to be YMM registers
+; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as:
+; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
+; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
+;
+; For example:
+; memcpy_sse_64 : SSE, 0 <= size < 64, falls through
+; memcpy_avx_64_1 : AVX1, 1 <= size < 64, falls through
+; memcpy_sse_128_ret : SSE, 0 <= size < 128, ends with ret
+; memcpy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret
+;
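+;
+; Illustrative invocation (assumed caller context, not part of the original
+; file): with DST/SRC/SIZE live in rdi/rsi/rdx and rax, r10, xmm0-xmm3 free
+; as scratch, a bounded SSE copy of 0 <= size < 128 bytes could be written:
+;
+;      memcpy_sse_128 rdi, rsi, rdx, rax, r10, xmm0, xmm1, xmm2, xmm3
+;
+; (the non-"_ret" forms fall through to the following instruction)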
+
+%macro memcpy_sse_64 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
+%endm
+
+%macro memcpy_sse_64_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
+%endm
+
+%macro memcpy_sse_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
+%endm
+
+%macro memcpy_sse_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
+%endm
+
+%macro memcpy_sse_64_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
+%endm
+
+%macro memcpy_sse_64_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
+%endm
+
+%macro memcpy_sse_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
+%endm
+
+%macro memcpy_sse_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
+%endm
+
+
+%macro memcpy_sse_16 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
+%endm
+
+%macro memcpy_sse_16_1 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro memcpy_avx_64 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
+%endm
+
+%macro memcpy_avx_64_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
+%endm
+
+%macro memcpy_avx_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
+%endm
+
+%macro memcpy_avx_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
+%endm
+
+%macro memcpy_avx_64_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
+%endm
+
+%macro memcpy_avx_64_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
+%endm
+
+%macro memcpy_avx_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
+%endm
+
+%macro memcpy_avx_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
+%endm
+
+
+%macro memcpy_avx_16 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
+%endm
+
+%macro memcpy_avx_16_1 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro memcpy_avx2_64 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
+%endm
+
+%macro memcpy_avx2_64_1 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
+%endm
+
+%macro memcpy_avx2_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2
+%endm
+
+%macro memcpy_avx2_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2
+%endm
+
+%macro memcpy_avx2_64_ret 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
+%endm
+
+%macro memcpy_avx2_64_1_ret 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
+%endm
+
+%macro memcpy_avx2_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2
+%endm
+
+%macro memcpy_avx2_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2
+%endm
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+%macro __memcpy_int 13
+%define %%DST %1 ; register: pointer to dst (not modified)
+%define %%SRC %2 ; register: pointer to src (not modified)
+%define %%SIZE %3 ; register: length in bytes (not modified)
+%define %%TMP0 %4 ; 64-bit temp GPR (clobbered)
+%define %%TMP1 %5 ; 64-bit temp GPR (clobbered)
+%define %%XTMP0 %6 ; temp XMM (clobbered)
+%define %%XTMP1 %7 ; temp XMM (clobbered)
+%define %%XTMP2 %8 ; temp XMM (clobbered)
+%define %%XTMP3 %9 ; temp XMM (clobbered)
+%define %%NOT0 %10 ; if not 0, then assume size cannot be zero
+%define %%MAXSIZE %11 ; 128, 64, etc
+%define %%USERET %12 ; if not 0, use "ret" at end
+%define %%USEAVX %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2
+
+%if (%%USERET != 0)
+ %define %%DONE ret
+%else
+ %define %%DONE jmp %%end
+%endif
+
+%if (%%USEAVX != 0)
+ %define %%MOVDQU vmovdqu
+%else
+ %define %%MOVDQU movdqu
+%endif
+
+%if (%%MAXSIZE >= 128)
+ test %%SIZE, 64
+ jz %%lt64
+ %if (%%USEAVX >= 2)
+ %%MOVDQU %%XTMP0, [%%SRC + 0*32]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*32]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32]
+
+ %%MOVDQU [%%DST + 0*32], %%XTMP0
+ %%MOVDQU [%%DST + 1*32], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*16]
+ %%MOVDQU %%XTMP2, [%%SRC + 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + 3*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + 1*16], %%XTMP1
+ %%MOVDQU [%%DST + 2*16], %%XTMP2
+ %%MOVDQU [%%DST + 3*16], %%XTMP3
+
+ %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 64)
+%%lt64:
+ test %%SIZE, 32
+ jz %%lt32
+ %if (%%USEAVX >= 2)
+ %%MOVDQU %%XTMP0, [%%SRC + 0*32]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32]
+ %%MOVDQU [%%DST + 0*32], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*16]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + 1*16], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 32)
+%%lt32:
+ test %%SIZE, 16
+ jz %%lt16
+ %if (%%USEAVX >= 2)
+ %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16]
+ %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0)
+ %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 16)
+%%lt16:
+ test %%SIZE, 8
+ jz %%lt8
+ mov %%TMP0, [%%SRC]
+ mov %%TMP1, [%%SRC + %%SIZE - 8]
+ mov [%%DST], %%TMP0
+ mov [%%DST + %%SIZE - 8], %%TMP1
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 8)
+%%lt8:
+ test %%SIZE, 4
+ jz %%lt4
+ mov DWORD(%%TMP0), [%%SRC]
+ mov DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
+ mov [%%DST], DWORD(%%TMP0)
+ mov [%%DST + %%SIZE - 4], DWORD(%%TMP1)
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 4)
+%%lt4:
+ test %%SIZE, 2
+ jz %%lt2
+ movzx DWORD(%%TMP0), word [%%SRC]
+ movzx DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
+ mov [%%DST], WORD(%%TMP0)
+ mov [%%DST + %%SIZE - 1], BYTE(%%TMP1)
+ %%DONE
+%endif
+
+%%lt2:
+%if (%%NOT0 == 0)
+ test %%SIZE, 1
+ jz %%end
+%endif
+ movzx DWORD(%%TMP0), byte [%%SRC]
+ mov [%%DST], BYTE(%%TMP0)
+%%end:
+%if (%%USERET != 0)
+ ret
+%endif
+%endm
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Utility macro to assist with SIMD shifting
+%macro _PSRLDQ 3
+%define %%VEC %1
+%define %%REG %2
+%define %%IMM %3
+
+%ifidn %%VEC, SSE
+ psrldq %%REG, %%IMM
+%else
+ vpsrldq %%REG, %%REG, %%IMM
+%endif
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; This section defines a series of macros to store small to medium amounts
+; of data from SIMD registers to memory, where the size is variable but limited.
+;
+; The macros are all called as:
+; memcpy DST, SRC, SIZE, TMP, IDX
+; with the parameters defined as:
+; DST : register: pointer to dst (not modified)
+; SRC : register: src data (clobbered)
+; SIZE : register: length in bytes (not modified)
+; TMP : 64-bit temp GPR (clobbered)
+; IDX : 64-bit GPR to store dst index/offset (clobbered)
+;
+; The name indicates the options. The name is of the form:
+; simd_store_<VEC>
+; where <VEC> is the SIMD instruction type e.g. "sse" or "avx"
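+;
+; Illustrative invocation (assumed caller context, not part of the original
+; file): store the low SIZE (rdx) bytes of xmm0 (clobbering it) to [rdi],
+; with rax as scratch and r11 receiving the running destination offset:
+;
+;      simd_store_sse rdi, xmm0, rdx, rax, r11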
+
+
+%macro simd_store_sse 5
+ __simd_store %1,%2,%3,%4,%5,SSE
+%endm
+
+%macro simd_store_avx 5
+ __simd_store %1,%2,%3,%4,%5,AVX
+%endm
+
+%macro simd_store_sse_15 5
+ __simd_store %1,%2,%3,%4,%5,SSE,15
+%endm
+
+%macro simd_store_avx_15 5
+ __simd_store %1,%2,%3,%4,%5,AVX,15
+%endm
+
+%macro __simd_store 6-7
+%define %%DST %1 ; register: pointer to dst (not modified)
+%define %%SRC %2 ; register: src data (clobbered)
+%define %%SIZE %3 ; register: length in bytes (not modified)
+%define %%TMP %4 ; 64-bit temp GPR (clobbered)
+%define %%IDX %5 ; 64-bit temp GPR to store dst idx (clobbered)
+%define %%SIMDTYPE %6 ; "SSE" or "AVX"
+%define %%MAX_LEN %7 ; [optional] maximum length to be stored, default 16
+
+%define %%PSRLDQ _PSRLDQ %%SIMDTYPE,
+
+%ifidn %%SIMDTYPE, SSE
+ %define %%MOVDQU movdqu
+ %define %%MOVQ movq
+%else
+ %define %%MOVDQU vmovdqu
+ %define %%MOVQ vmovq
+%endif
+
+;; determine max byte size for store operation
+%if %0 > 6
+%assign max_length_to_store %%MAX_LEN
+%else
+%assign max_length_to_store 16
+%endif
+
+%if max_length_to_store > 16
+%error "__simd_store macro invoked with MAX_LEN bigger than 16!"
+%endif
+
+ xor %%IDX, %%IDX ; zero idx
+
+%if max_length_to_store == 16
+ test %%SIZE, 16
+ jz %%lt16
+ %%MOVDQU [%%DST], %%SRC
+ jmp %%end
+%%lt16:
+%endif
+
+%if max_length_to_store >= 8
+ test %%SIZE, 8
+ jz %%lt8
+ %%MOVQ [%%DST + %%IDX], %%SRC
+ %%PSRLDQ %%SRC, 8
+ add %%IDX, 8
+%%lt8:
+%endif
+
+ %%MOVQ %%TMP, %%SRC ; use GPR from now on
+
+%if max_length_to_store >= 4
+ test %%SIZE, 4
+ jz %%lt4
+ mov [%%DST + %%IDX], DWORD(%%TMP)
+ shr %%TMP, 32
+ add %%IDX, 4
+%%lt4:
+%endif
+
+ test %%SIZE, 2
+ jz %%lt2
+ mov [%%DST + %%IDX], WORD(%%TMP)
+ shr %%TMP, 16
+ add %%IDX, 2
+%%lt2:
+ test %%SIZE, 1
+ jz %%end
+ mov [%%DST + %%IDX], BYTE(%%TMP)
+%%end:
+%endm
+
+; This section defines a series of macros to load small to medium amounts
+; (from 0 to 16 bytes) of data from memory to SIMD registers,
+; where the size is variable but limited.
+;
+; The macros are all called as:
+; simd_load DST, SRC, SIZE
+; with the parameters defined as:
+; DST : register: destination XMM register
+; SRC : register: pointer to src data (not modified)
+; SIZE : register: length in bytes (not modified)
+;
+; The name indicates the options. The name is of the form:
+; simd_load_<VEC>_<SZ><ZERO>
+; where:
+; <VEC> is either "sse" or "avx"
+; <SZ> is either "15" or "16" and defines largest value of SIZE
+; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
+;
+; For example:
+; simd_load_sse_16 : SSE, 0 <= size <= 16
+; simd_load_avx_15_1 : AVX, 1 <= size <= 15
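+;
+; Illustrative invocation (assumed caller context, not part of the original
+; file): load rdx bytes (0 <= rdx <= 16) from [rsi] into xmm0, with the
+; remaining bytes of xmm0 zeroed:
+;
+;      simd_load_sse_16 xmm0, rsi, rdx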
+
+%macro simd_load_sse_15_1 3
+ __simd_load %1,%2,%3,0,0,SSE
+%endm
+%macro simd_load_sse_15 3
+ __simd_load %1,%2,%3,1,0,SSE
+%endm
+%macro simd_load_sse_16_1 3
+ __simd_load %1,%2,%3,0,1,SSE
+%endm
+%macro simd_load_sse_16 3
+ __simd_load %1,%2,%3,1,1,SSE
+%endm
+
+%macro simd_load_avx_15_1 3
+ __simd_load %1,%2,%3,0,0,AVX
+%endm
+%macro simd_load_avx_15 3
+ __simd_load %1,%2,%3,1,0,AVX
+%endm
+%macro simd_load_avx_16_1 3
+ __simd_load %1,%2,%3,0,1,AVX
+%endm
+%macro simd_load_avx_16 3
+ __simd_load %1,%2,%3,1,1,AVX
+%endm
+
+%macro __simd_load 6
+%define %%DST %1 ; [out] destination XMM register
+%define %%SRC %2 ; [in] pointer to src data
+%define %%SIZE %3 ; [in] length in bytes (0-16 bytes)
+%define %%ACCEPT_0 %4 ; 0 = min length = 1, 1 = min length = 0
+%define %%ACCEPT_16 %5 ; 0 = max length = 15 , 1 = max length = 16
+%define %%SIMDTYPE %6 ; "SSE" or "AVX"
+
+%ifidn %%SIMDTYPE, SSE
+ %define %%MOVDQU movdqu
+ %define %%PINSRB pinsrb
+ %define %%PINSRQ pinsrq
+ %define %%PXOR pxor
+%else
+ %define %%MOVDQU vmovdqu
+ %define %%PINSRB vpinsrb
+ %define %%PINSRQ vpinsrq
+ %define %%PXOR vpxor
+%endif
+
+%if (%%ACCEPT_16 != 0)
+ test %%SIZE, 16
+ jz %%_skip_16
+ %%MOVDQU %%DST, [%%SRC]
+ jmp %%end_load
+
+%%_skip_16:
+%endif
+ %%PXOR %%DST, %%DST ; clear XMM register
+%if (%%ACCEPT_0 != 0)
+ or %%SIZE, %%SIZE
+ je %%end_load
+%endif
+ cmp %%SIZE, 1
+ je %%_size_1
+ cmp %%SIZE, 2
+ je %%_size_2
+ cmp %%SIZE, 3
+ je %%_size_3
+ cmp %%SIZE, 4
+ je %%_size_4
+ cmp %%SIZE, 5
+ je %%_size_5
+ cmp %%SIZE, 6
+ je %%_size_6
+ cmp %%SIZE, 7
+ je %%_size_7
+ cmp %%SIZE, 8
+ je %%_size_8
+ cmp %%SIZE, 9
+ je %%_size_9
+ cmp %%SIZE, 10
+ je %%_size_10
+ cmp %%SIZE, 11
+ je %%_size_11
+ cmp %%SIZE, 12
+ je %%_size_12
+ cmp %%SIZE, 13
+ je %%_size_13
+ cmp %%SIZE, 14
+ je %%_size_14
+
+%%_size_15:
+ %%PINSRB %%DST, [%%SRC + 14], 14
+%%_size_14:
+ %%PINSRB %%DST, [%%SRC + 13], 13
+%%_size_13:
+ %%PINSRB %%DST, [%%SRC + 12], 12
+%%_size_12:
+ %%PINSRB %%DST, [%%SRC + 11], 11
+%%_size_11:
+ %%PINSRB %%DST, [%%SRC + 10], 10
+%%_size_10:
+ %%PINSRB %%DST, [%%SRC + 9], 9
+%%_size_9:
+ %%PINSRB %%DST, [%%SRC + 8], 8
+%%_size_8:
+ %%PINSRQ %%DST, [%%SRC], 0
+ jmp %%end_load
+%%_size_7:
+ %%PINSRB %%DST, [%%SRC + 6], 6
+%%_size_6:
+ %%PINSRB %%DST, [%%SRC + 5], 5
+%%_size_5:
+ %%PINSRB %%DST, [%%SRC + 4], 4
+%%_size_4:
+ %%PINSRB %%DST, [%%SRC + 3], 3
+%%_size_3:
+ %%PINSRB %%DST, [%%SRC + 2], 2
+%%_size_2:
+ %%PINSRB %%DST, [%%SRC + 1], 1
+%%_size_1:
+ %%PINSRB %%DST, [%%SRC + 0], 0
+%%end_load:
+%endm
+%endif ; ifndef __MEMCPY_ASM__
diff --git a/src/spdk/intel-ipsec-mb/include/noaesni.h b/src/spdk/intel-ipsec-mb/include/noaesni.h
new file mode 100644
index 000000000..30d970edf
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/noaesni.h
@@ -0,0 +1,65 @@
+/*******************************************************************************
+ Copyright (c) 2018, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "intel-ipsec-mb.h"
+
+#ifndef NOAESNI_H
+#define NOAESNI_H
+
+IMB_DLL_EXPORT void init_mb_mgr_sse_no_aesni(MB_MGR *state);
+IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_sse_no_aesni(MB_MGR *state);
+IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_nocheck_sse_no_aesni(MB_MGR *state);
+IMB_DLL_EXPORT JOB_AES_HMAC *flush_job_sse_no_aesni(MB_MGR *state);
+IMB_DLL_EXPORT uint32_t queue_size_sse_no_aesni(MB_MGR *state);
+IMB_DLL_EXPORT JOB_AES_HMAC *get_completed_job_sse_no_aesni(MB_MGR *state);
+IMB_DLL_EXPORT JOB_AES_HMAC *get_next_job_sse_no_aesni(MB_MGR *state);
+
+IMB_DLL_EXPORT void
+aes_keyexp_128_sse_no_aesni(const void *key, void *enc_exp_keys,
+ void *dec_exp_keys);
+IMB_DLL_EXPORT void
+aes_keyexp_192_sse_no_aesni(const void *key, void *enc_exp_keys,
+ void *dec_exp_keys);
+IMB_DLL_EXPORT void
+aes_keyexp_256_sse_no_aesni(const void *key, void *enc_exp_keys,
+ void *dec_exp_keys);
+IMB_DLL_EXPORT void
+aes_xcbc_expand_key_sse_no_aesni(const void *key, void *k1_exp, void *k2,
+ void *k3);
+IMB_DLL_EXPORT void
+aes_keyexp_128_enc_sse_no_aesni(const void *key, void *enc_exp_keys);
+IMB_DLL_EXPORT void
+aes_keyexp_192_enc_sse_no_aesni(const void *key, void *enc_exp_keys);
+IMB_DLL_EXPORT void
+aes_keyexp_256_enc_sse_no_aesni(const void *key, void *enc_exp_keys);
+IMB_DLL_EXPORT void
+aes_cmac_subkey_gen_sse_no_aesni(const void *key_exp, void *key1, void *key2);
+IMB_DLL_EXPORT void
+aes_cfb_128_one_sse_no_aesni(void *out, const void *in, const void *iv,
+ const void *keys, uint64_t len);
+
+#endif /* NOAESNI_H */
diff --git a/src/spdk/intel-ipsec-mb/include/os.asm b/src/spdk/intel-ipsec-mb/include/os.asm
new file mode 100644
index 000000000..f54043ed2
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/os.asm
@@ -0,0 +1,58 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+%ifndef OS_ASM_FILE
+%define OS_ASM_FILE
+
+%ifndef WIN_ABI
+%ifidn __OUTPUT_FORMAT__, win64
+%define WIN_ABI
+%endif
+%endif
+
+%ifndef LINUX
+%ifidn __OUTPUT_FORMAT__, elf64
+%define LINUX
+%endif
+%endif
+
+%ifdef LINUX
+;;; macro to declare global symbols
+;;; - name : symbol name
+;;; - type : function or data
+;;; - scope : internal, private, default
+%define MKGLOBAL(name,type,scope) global name %+ : %+ type scope
+%endif ; LINUX
+
+%ifdef WIN_ABI
+;;; macro to declare global symbols
+;;; - name : symbol name
+;;; - type : function or data
+;;; - scope : internal, private, default (ignored in win64 coff format)
+%define MKGLOBAL(name,type,scope) global name
+%endif ; WIN_ABI
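+
+;;; Example (as used elsewhere in this tree, e.g. save_xmms.asm):
+;;;     MKGLOBAL(save_xmms,function,internal)
+;;; declares "save_xmms" as a global function symbol under both ABIs.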
+
+%endif ; OS_ASM_FILE
diff --git a/src/spdk/intel-ipsec-mb/include/reg_sizes.asm b/src/spdk/intel-ipsec-mb/include/reg_sizes.asm
new file mode 100644
index 000000000..c9f9f8cd2
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/reg_sizes.asm
@@ -0,0 +1,300 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; define d and w variants for registers
+
+%ifndef _REG_SIZES_ASM_
+%define _REG_SIZES_ASM_
+
+%define raxd eax
+%define raxw ax
+%define raxb al
+
+%define rbxd ebx
+%define rbxw bx
+%define rbxb bl
+
+%define rcxd ecx
+%define rcxw cx
+%define rcxb cl
+
+%define rdxd edx
+%define rdxw dx
+%define rdxb dl
+
+%define rsid esi
+%define rsiw si
+%define rsib sil
+
+%define rdid edi
+%define rdiw di
+%define rdib dil
+
+%define rbpd ebp
+%define rbpw bp
+%define rbpb bpl
+
+%define zmm0x xmm0
+%define zmm1x xmm1
+%define zmm2x xmm2
+%define zmm3x xmm3
+%define zmm4x xmm4
+%define zmm5x xmm5
+%define zmm6x xmm6
+%define zmm7x xmm7
+%define zmm8x xmm8
+%define zmm9x xmm9
+%define zmm10x xmm10
+%define zmm11x xmm11
+%define zmm12x xmm12
+%define zmm13x xmm13
+%define zmm14x xmm14
+%define zmm15x xmm15
+%define zmm16x xmm16
+%define zmm17x xmm17
+%define zmm18x xmm18
+%define zmm19x xmm19
+%define zmm20x xmm20
+%define zmm21x xmm21
+%define zmm22x xmm22
+%define zmm23x xmm23
+%define zmm24x xmm24
+%define zmm25x xmm25
+%define zmm26x xmm26
+%define zmm27x xmm27
+%define zmm28x xmm28
+%define zmm29x xmm29
+%define zmm30x xmm30
+%define zmm31x xmm31
+
+%define ymm0x xmm0
+%define ymm1x xmm1
+%define ymm2x xmm2
+%define ymm3x xmm3
+%define ymm4x xmm4
+%define ymm5x xmm5
+%define ymm6x xmm6
+%define ymm7x xmm7
+%define ymm8x xmm8
+%define ymm9x xmm9
+%define ymm10x xmm10
+%define ymm11x xmm11
+%define ymm12x xmm12
+%define ymm13x xmm13
+%define ymm14x xmm14
+%define ymm15x xmm15
+%define ymm16x xmm16
+%define ymm17x xmm17
+%define ymm18x xmm18
+%define ymm19x xmm19
+%define ymm20x xmm20
+%define ymm21x xmm21
+%define ymm22x xmm22
+%define ymm23x xmm23
+%define ymm24x xmm24
+%define ymm25x xmm25
+%define ymm26x xmm26
+%define ymm27x xmm27
+%define ymm28x xmm28
+%define ymm29x xmm29
+%define ymm30x xmm30
+%define ymm31x xmm31
+
+%define xmm0x xmm0
+%define xmm1x xmm1
+%define xmm2x xmm2
+%define xmm3x xmm3
+%define xmm4x xmm4
+%define xmm5x xmm5
+%define xmm6x xmm6
+%define xmm7x xmm7
+%define xmm8x xmm8
+%define xmm9x xmm9
+%define xmm10x xmm10
+%define xmm11x xmm11
+%define xmm12x xmm12
+%define xmm13x xmm13
+%define xmm14x xmm14
+%define xmm15x xmm15
+%define xmm16x xmm16
+%define xmm17x xmm17
+%define xmm18x xmm18
+%define xmm19x xmm19
+%define xmm20x xmm20
+%define xmm21x xmm21
+%define xmm22x xmm22
+%define xmm23x xmm23
+%define xmm24x xmm24
+%define xmm25x xmm25
+%define xmm26x xmm26
+%define xmm27x xmm27
+%define xmm28x xmm28
+%define xmm29x xmm29
+%define xmm30x xmm30
+%define xmm31x xmm31
+
+%define zmm0y ymm0
+%define zmm1y ymm1
+%define zmm2y ymm2
+%define zmm3y ymm3
+%define zmm4y ymm4
+%define zmm5y ymm5
+%define zmm6y ymm6
+%define zmm7y ymm7
+%define zmm8y ymm8
+%define zmm9y ymm9
+%define zmm10y ymm10
+%define zmm11y ymm11
+%define zmm12y ymm12
+%define zmm13y ymm13
+%define zmm14y ymm14
+%define zmm15y ymm15
+%define zmm16y ymm16
+%define zmm17y ymm17
+%define zmm18y ymm18
+%define zmm19y ymm19
+%define zmm20y ymm20
+%define zmm21y ymm21
+%define zmm22y ymm22
+%define zmm23y ymm23
+%define zmm24y ymm24
+%define zmm25y ymm25
+%define zmm26y ymm26
+%define zmm27y ymm27
+%define zmm28y ymm28
+%define zmm29y ymm29
+%define zmm30y ymm30
+%define zmm31y ymm31
+
+%define xmm0y ymm0
+%define xmm1y ymm1
+%define xmm2y ymm2
+%define xmm3y ymm3
+%define xmm4y ymm4
+%define xmm5y ymm5
+%define xmm6y ymm6
+%define xmm7y ymm7
+%define xmm8y ymm8
+%define xmm9y ymm9
+%define xmm10y ymm10
+%define xmm11y ymm11
+%define xmm12y ymm12
+%define xmm13y ymm13
+%define xmm14y ymm14
+%define xmm15y ymm15
+%define xmm16y ymm16
+%define xmm17y ymm17
+%define xmm18y ymm18
+%define xmm19y ymm19
+%define xmm20y ymm20
+%define xmm21y ymm21
+%define xmm22y ymm22
+%define xmm23y ymm23
+%define xmm24y ymm24
+%define xmm25y ymm25
+%define xmm26y ymm26
+%define xmm27y ymm27
+%define xmm28y ymm28
+%define xmm29y ymm29
+%define xmm30y ymm30
+%define xmm31y ymm31
+
+%define xmm0z zmm0
+%define xmm1z zmm1
+%define xmm2z zmm2
+%define xmm3z zmm3
+%define xmm4z zmm4
+%define xmm5z zmm5
+%define xmm6z zmm6
+%define xmm7z zmm7
+%define xmm8z zmm8
+%define xmm9z zmm9
+%define xmm10z zmm10
+%define xmm11z zmm11
+%define xmm12z zmm12
+%define xmm13z zmm13
+%define xmm14z zmm14
+%define xmm15z zmm15
+%define xmm16z zmm16
+%define xmm17z zmm17
+%define xmm18z zmm18
+%define xmm19z zmm19
+%define xmm20z zmm20
+%define xmm21z zmm21
+%define xmm22z zmm22
+%define xmm23z zmm23
+%define xmm24z zmm24
+%define xmm25z zmm25
+%define xmm26z zmm26
+%define xmm27z zmm27
+%define xmm28z zmm28
+%define xmm29z zmm29
+%define xmm30z zmm30
+%define xmm31z zmm31
+
+%define ymm0z zmm0
+%define ymm1z zmm1
+%define ymm2z zmm2
+%define ymm3z zmm3
+%define ymm4z zmm4
+%define ymm5z zmm5
+%define ymm6z zmm6
+%define ymm7z zmm7
+%define ymm8z zmm8
+%define ymm9z zmm9
+%define ymm10z zmm10
+%define ymm11z zmm11
+%define ymm12z zmm12
+%define ymm13z zmm13
+%define ymm14z zmm14
+%define ymm15z zmm15
+%define ymm16z zmm16
+%define ymm17z zmm17
+%define ymm18z zmm18
+%define ymm19z zmm19
+%define ymm20z zmm20
+%define ymm21z zmm21
+%define ymm22z zmm22
+%define ymm23z zmm23
+%define ymm24z zmm24
+%define ymm25z zmm25
+%define ymm26z zmm26
+%define ymm27z zmm27
+%define ymm28z zmm28
+%define ymm29z zmm29
+%define ymm30z zmm30
+%define ymm31z zmm31
+
+%define DWORD(reg) reg %+ d
+%define WORD(reg) reg %+ w
+%define BYTE(reg) reg %+ b
+
+%define XWORD(reg) reg %+ x
+%define YWORD(reg) reg %+ y
+%define ZWORD(reg) reg %+ z
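+
+;; Illustrative note: these aliases let macro code derive any width view of a
+;; register via token pasting. For example, with "%define V zmm3", XWORD(V)
+;; expands to "zmm3 %+ x" -> zmm3x, which the defines above map to xmm3,
+;; i.e. the 128-bit view of the same physical register.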
+
+%endif ;; _REG_SIZES_ASM_
diff --git a/src/spdk/intel-ipsec-mb/include/save_xmms.asm b/src/spdk/intel-ipsec-mb/include/save_xmms.asm
new file mode 100644
index 000000000..c9fd67eb5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/save_xmms.asm
@@ -0,0 +1,132 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+
+%ifdef LINUX
+%define ARG1 rdi
+%else
+%define ARG1 rcx
+%endif
+
+section .text
+; void save_xmms(UINT128 array[10])
+MKGLOBAL(save_xmms,function,internal)
+save_xmms:
+ movdqa [ARG1 + 0*16], xmm6
+ movdqa [ARG1 + 1*16], xmm7
+ movdqa [ARG1 + 2*16], xmm8
+ movdqa [ARG1 + 3*16], xmm9
+ movdqa [ARG1 + 4*16], xmm10
+ movdqa [ARG1 + 5*16], xmm11
+ movdqa [ARG1 + 6*16], xmm12
+ movdqa [ARG1 + 7*16], xmm13
+ movdqa [ARG1 + 8*16], xmm14
+ movdqa [ARG1 + 9*16], xmm15
+ ret
+
+
+; void restore_xmms(UINT128 array[10])
+MKGLOBAL(restore_xmms,function,internal)
+restore_xmms:
+ movdqa xmm6, [ARG1 + 0*16]
+ movdqa xmm7, [ARG1 + 1*16]
+ movdqa xmm8, [ARG1 + 2*16]
+ movdqa xmm9, [ARG1 + 3*16]
+ movdqa xmm10, [ARG1 + 4*16]
+ movdqa xmm11, [ARG1 + 5*16]
+ movdqa xmm12, [ARG1 + 6*16]
+ movdqa xmm13, [ARG1 + 7*16]
+ movdqa xmm14, [ARG1 + 8*16]
+ movdqa xmm15, [ARG1 + 9*16]
+%ifdef SAFE_DATA
+        ;; Clear potentially sensitive data stored on the stack
+ pxor xmm0, xmm0
+ movdqa [ARG1 + 0 * 16], xmm0
+ movdqa [ARG1 + 1 * 16], xmm0
+ movdqa [ARG1 + 2 * 16], xmm0
+ movdqa [ARG1 + 3 * 16], xmm0
+ movdqa [ARG1 + 4 * 16], xmm0
+ movdqa [ARG1 + 5 * 16], xmm0
+ movdqa [ARG1 + 6 * 16], xmm0
+ movdqa [ARG1 + 7 * 16], xmm0
+ movdqa [ARG1 + 8 * 16], xmm0
+ movdqa [ARG1 + 9 * 16], xmm0
+%endif
+
+ ret
+
+
+; void save_xmms_avx(UINT128 array[10])
+MKGLOBAL(save_xmms_avx,function,internal)
+save_xmms_avx:
+ vmovdqa [ARG1 + 0*16], xmm6
+ vmovdqa [ARG1 + 1*16], xmm7
+ vmovdqa [ARG1 + 2*16], xmm8
+ vmovdqa [ARG1 + 3*16], xmm9
+ vmovdqa [ARG1 + 4*16], xmm10
+ vmovdqa [ARG1 + 5*16], xmm11
+ vmovdqa [ARG1 + 6*16], xmm12
+ vmovdqa [ARG1 + 7*16], xmm13
+ vmovdqa [ARG1 + 8*16], xmm14
+ vmovdqa [ARG1 + 9*16], xmm15
+ ret
+
+
+; void restore_xmms_avx(UINT128 array[10])
+MKGLOBAL(restore_xmms_avx,function,internal)
+restore_xmms_avx:
+ vmovdqa xmm6, [ARG1 + 0*16]
+ vmovdqa xmm7, [ARG1 + 1*16]
+ vmovdqa xmm8, [ARG1 + 2*16]
+ vmovdqa xmm9, [ARG1 + 3*16]
+ vmovdqa xmm10, [ARG1 + 4*16]
+ vmovdqa xmm11, [ARG1 + 5*16]
+ vmovdqa xmm12, [ARG1 + 6*16]
+ vmovdqa xmm13, [ARG1 + 7*16]
+ vmovdqa xmm14, [ARG1 + 8*16]
+ vmovdqa xmm15, [ARG1 + 9*16]
+
+%ifdef SAFE_DATA
+        ;; Clear potentially sensitive data stored on the stack
+ vpxor xmm0, xmm0
+ vmovdqa [ARG1 + 0 * 16], xmm0
+ vmovdqa [ARG1 + 1 * 16], xmm0
+ vmovdqa [ARG1 + 2 * 16], xmm0
+ vmovdqa [ARG1 + 3 * 16], xmm0
+ vmovdqa [ARG1 + 4 * 16], xmm0
+ vmovdqa [ARG1 + 5 * 16], xmm0
+ vmovdqa [ARG1 + 6 * 16], xmm0
+ vmovdqa [ARG1 + 7 * 16], xmm0
+ vmovdqa [ARG1 + 8 * 16], xmm0
+ vmovdqa [ARG1 + 9 * 16], xmm0
+%endif
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/include/save_xmms.h b/src/spdk/intel-ipsec-mb/include/save_xmms.h
new file mode 100644
index 000000000..e711958da
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/save_xmms.h
@@ -0,0 +1,39 @@
+/*******************************************************************************
+ Copyright (c) 2012-2018, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#ifndef SAVE_XMMS_H
+#define SAVE_XMMS_H
+
+#include "intel-ipsec-mb.h"
+
+void save_xmms(uint128_t array[10]);
+void restore_xmms(uint128_t array[10]);
+
+void save_xmms_avx(uint128_t array[10]);
+void restore_xmms_avx(uint128_t array[10]);
+
+#endif /* SAVE_XMMS_H */
diff --git a/src/spdk/intel-ipsec-mb/include/snow3g.h b/src/spdk/intel-ipsec-mb/include/snow3g.h
new file mode 100644
index 000000000..520a4b41f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/snow3g.h
@@ -0,0 +1,511 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#ifndef _SNOW3G_H_
+#define _SNOW3G_H_
+
+/*******************************************************************************
+ * SSE
+ ******************************************************************************/
+void
+snow3g_f8_1_buffer_bit_sse(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t cipherLengthInBits,
+ const uint32_t offsetInBits);
+
+void
+snow3g_f8_1_buffer_sse(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t lengthInBytes);
+
+void
+snow3g_f8_2_buffer_sse(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2);
+
+void
+snow3g_f8_4_buffer_sse(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pIV3,
+ const void *pIV4,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3,
+ void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4,
+ void *pBufferOut4,
+ const uint32_t lengthInBytes4);
+
+void
+snow3g_f8_8_buffer_sse(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pIV3,
+ const void *pIV4,
+ const void *pIV5,
+ const void *pIV6,
+ const void *pIV7,
+ const void *pIV8,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3,
+ void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4,
+ void *pBufferOut4,
+ const uint32_t lengthInBytes4,
+ const void *pBufferIn5,
+ void *pBufferOut5,
+ const uint32_t lengthInBytes5,
+ const void *pBufferIn6,
+ void *pBufferOut6,
+ const uint32_t lengthInBytes6,
+ const void *pBufferIn7,
+ void *pBufferOut7,
+ const uint32_t lengthInBytes7,
+ const void *pBufferIn8,
+ void *pBufferOut8,
+ const uint32_t lengthInBytes8);
+
+void
+snow3g_f8_8_buffer_multikey_sse(const snow3g_key_schedule_t * const pCtx[],
+ const void * const pIV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t lengthInBytes[]);
+
+void
+snow3g_f8_n_buffer_sse(const snow3g_key_schedule_t *pCtx,
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t bufferLenInBytes[],
+ const uint32_t bufferCount);
+
+void
+snow3g_f8_n_buffer_multikey_sse(const snow3g_key_schedule_t * const pCtx[],
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t bufferLenInBytes[],
+ const uint32_t bufferCount);
+
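+/*
+ * Note on the n-buffer variants (illustrative, not part of the API contract
+ * documented here): the array arguments are parallel arrays of bufferCount
+ * entries, i.e. buffer i uses IV[i], pBufferIn[i], pBufferOut[i] and
+ * bufferLenInBytes[i]; the multikey variant additionally takes one key
+ * schedule per buffer in pCtx[i].
+ */
+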
+void
+snow3g_f9_1_buffer_sse(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ const uint64_t lengthInBits,
+ void *pDigest);
+
+size_t
+snow3g_key_sched_size_sse(void);
+
+int
+snow3g_init_key_sched_sse(const void *pKey, snow3g_key_schedule_t *pCtx);
+
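+/*
+ * Usage sketch (illustrative only, not part of this header; applications
+ * normally reach these arch-specific entry points through the library's
+ * top-level API). Single-buffer SSE encryption, where key, iv, in, out and
+ * len are caller-provided (16-byte key, 16-byte IV, len in bytes):
+ *
+ *     snow3g_key_schedule_t *ctx = malloc(snow3g_key_sched_size_sse());
+ *
+ *     snow3g_init_key_sched_sse(key, ctx);
+ *     snow3g_f8_1_buffer_sse(ctx, iv, in, out, len);
+ *     free(ctx);
+ */
+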
+/*******************************************************************************
+ * SSE NO-AESNI
+ ******************************************************************************/
+void
+snow3g_f8_1_buffer_bit_sse_no_aesni(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t cipherLengthInBits,
+ const uint32_t offsetInBits);
+
+void
+snow3g_f8_1_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t lengthInBytes);
+
+void
+snow3g_f8_2_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2);
+
+void
+snow3g_f8_4_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pIV3,
+ const void *pIV4,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3,
+ void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4,
+ void *pBufferOut4,
+ const uint32_t lengthInBytes4);
+
+void
+snow3g_f8_8_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pIV3,
+ const void *pIV4,
+ const void *pIV5,
+ const void *pIV6,
+ const void *pIV7,
+ const void *pIV8,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3,
+ void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4,
+ void *pBufferOut4,
+ const uint32_t lengthInBytes4,
+ const void *pBufferIn5,
+ void *pBufferOut5,
+ const uint32_t lengthInBytes5,
+ const void *pBufferIn6,
+ void *pBufferOut6,
+ const uint32_t lengthInBytes6,
+ const void *pBufferIn7,
+ void *pBufferOut7,
+ const uint32_t lengthInBytes7,
+ const void *pBufferIn8,
+ void *pBufferOut8,
+ const uint32_t lengthInBytes8);
+
+void
+snow3g_f8_8_buffer_multikey_sse_no_aesni(const snow3g_key_schedule_t * const
+ pCtx[],
+ const void * const pIV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t lengthInBytes[]);
+
+void
+snow3g_f8_n_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx,
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t bufferLenInBytes[],
+ const uint32_t bufferCount);
+
+void
+snow3g_f8_n_buffer_multikey_sse_no_aesni(const snow3g_key_schedule_t * const
+ pCtx[],
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t bufferLenInBytes[],
+ const uint32_t bufferCount);
+
+void
+snow3g_f9_1_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ const uint64_t lengthInBits,
+ void *pDigest);
+
+size_t
+snow3g_key_sched_size_sse_no_aesni(void);
+
+int
+snow3g_init_key_sched_sse_no_aesni(const void *pKey,
+ snow3g_key_schedule_t *pCtx);
+
+/*******************************************************************************
+ * AVX
+ ******************************************************************************/
+void
+snow3g_f8_1_buffer_bit_avx(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t cipherLengthInBits,
+ const uint32_t offsetInBits);
+
+void
+snow3g_f8_1_buffer_avx(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t lengthInBytes);
+
+void
+snow3g_f8_2_buffer_avx(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2);
+
+void
+snow3g_f8_4_buffer_avx(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pIV3,
+ const void *pIV4,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3,
+ void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4,
+ void *pBufferOut4,
+ const uint32_t lengthInBytes4);
+
+void
+snow3g_f8_8_buffer_avx(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pIV3,
+ const void *pIV4,
+ const void *pIV5,
+ const void *pIV6,
+ const void *pIV7,
+ const void *pIV8,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3,
+ void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4,
+ void *pBufferOut4,
+ const uint32_t lengthInBytes4,
+ const void *pBufferIn5,
+ void *pBufferOut5,
+ const uint32_t lengthInBytes5,
+ const void *pBufferIn6,
+ void *pBufferOut6,
+ const uint32_t lengthInBytes6,
+ const void *pBufferIn7,
+ void *pBufferOut7,
+ const uint32_t lengthInBytes7,
+ const void *pBufferIn8,
+ void *pBufferOut8,
+ const uint32_t lengthInBytes8);
+
+void
+snow3g_f8_8_buffer_multikey_avx(const snow3g_key_schedule_t * const pCtx[],
+ const void * const pIV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t lengthInBytes[]);
+
+void
+snow3g_f8_n_buffer_avx(const snow3g_key_schedule_t *pCtx,
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t bufferLenInBytes[],
+ const uint32_t bufferCount);
+
+void
+snow3g_f8_n_buffer_multikey_avx(const snow3g_key_schedule_t * const pCtx[],
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t bufferLenInBytes[],
+ const uint32_t bufferCount);
+
+void
+snow3g_f9_1_buffer_avx(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ const uint64_t lengthInBits,
+ void *pDigest);
+
+size_t
+snow3g_key_sched_size_avx(void);
+
+int
+snow3g_init_key_sched_avx(const void *pKey, snow3g_key_schedule_t *pCtx);
+
+/*******************************************************************************
+ * AVX2
+ ******************************************************************************/
+
+void
+snow3g_f8_1_buffer_bit_avx2(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t cipherLengthInBits,
+ const uint32_t offsetInBits);
+
+void
+snow3g_f8_1_buffer_avx2(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t lengthInBytes);
+
+void
+snow3g_f8_2_buffer_avx2(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2);
+
+void
+snow3g_f8_4_buffer_avx2(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pIV3,
+ const void *pIV4,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3,
+ void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4,
+ void *pBufferOut4,
+ const uint32_t lengthInBytes4);
+
+void
+snow3g_f8_8_buffer_avx2(const snow3g_key_schedule_t *pCtx,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pIV3,
+ const void *pIV4,
+ const void *pIV5,
+ const void *pIV6,
+ const void *pIV7,
+ const void *pIV8,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3,
+ void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4,
+ void *pBufferOut4,
+ const uint32_t lengthInBytes4,
+ const void *pBufferIn5,
+ void *pBufferOut5,
+ const uint32_t lengthInBytes5,
+ const void *pBufferIn6,
+ void *pBufferOut6,
+ const uint32_t lengthInBytes6,
+ const void *pBufferIn7,
+ void *pBufferOut7,
+ const uint32_t lengthInBytes7,
+ const void *pBufferIn8,
+ void *pBufferOut8,
+ const uint32_t lengthInBytes8);
+
+void
+snow3g_f8_8_buffer_multikey_avx2(const snow3g_key_schedule_t * const pCtx[],
+ const void * const pIV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t lengthInBytes[]);
+
+void
+snow3g_f8_n_buffer_avx2(const snow3g_key_schedule_t *pCtx,
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t bufferLenInBytes[],
+ const uint32_t bufferCount);
+
+void
+snow3g_f8_n_buffer_multikey_avx2(const snow3g_key_schedule_t * const pCtx[],
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t bufferLenInBytes[],
+ const uint32_t bufferCount);
+
+void
+snow3g_f9_1_buffer_avx2(const snow3g_key_schedule_t *pCtx,
+ const void *pIV,
+ const void *pBufferIn,
+ const uint64_t lengthInBits,
+ void *pDigest);
+
+size_t
+snow3g_key_sched_size_avx2(void);
+
+int
+snow3g_init_key_sched_avx2(const void *pKey, snow3g_key_schedule_t *pCtx);
+
+#endif /* _SNOW3G_H_ */
diff --git a/src/spdk/intel-ipsec-mb/include/snow3g_common.h b/src/spdk/intel-ipsec-mb/include/snow3g_common.h
new file mode 100644
index 000000000..d7c7e63c1
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/snow3g_common.h
@@ -0,0 +1,2840 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/*-----------------------------------------------------------------------
+ *
+ * An implementation of SNOW 3G, the core algorithm for the
+ * 3GPP Confidentiality and Integrity algorithms.
+ *
+ *-----------------------------------------------------------------------*/
+
+#ifndef SNOW3G_COMMON_H
+#define SNOW3G_COMMON_H
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "intel-ipsec-mb.h"
+#include "include/snow3g.h"
+#include "include/snow3g_internal.h"
+#include "clear_regs_mem.h"
+
+#define CLEAR_MEM clear_mem
+#define CLEAR_VAR clear_var
+
+/* -------------------------------------------------------------------
+ * LFSR array shift by 1 position, 4 or 8 packets at a time
+ * ------------------------------------------------------------------ */
+
+#ifdef AVX2
+/* LFSR array shift */
+static inline void ShiftLFSR_8(snow3gKeyState8_t *pCtx)
+{
+ pCtx->iLFSR_X = (pCtx->iLFSR_X + 1) & 15;
+}
+#endif /* AVX2 */
+
+/* LFSR array shift */
+static inline void ShiftLFSR_4(snow3gKeyState4_t *pCtx)
+{
+ pCtx->iLFSR_X = (pCtx->iLFSR_X + 1) % 16;
+}
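+
+/*
+ * Note: LFSR_X is kept as a 16-entry circular buffer of SIMD words (one
+ * 32-bit lane per packet); iLFSR_X marks the logical position of LFSR[0],
+ * so "shifting" the LFSR only advances the index instead of moving data.
+ */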
+
+/*---------------------------------------------------------
+ * @description
+ * Gf2 modular multiplication/reduction
+ *
+ *---------------------------------------------------------*/
+static inline uint64_t multiply_and_reduce64(uint64_t a, uint64_t b)
+{
+ uint64_t msk;
+ uint64_t res = 0;
+ uint64_t i = 64;
+
+ while (i--) {
+ msk = ((int64_t)res >> 63) & 0x1b;
+ res <<= 1;
+ res ^= msk;
+ msk = ((int64_t)b >> 63) & a;
+ b <<= 1;
+ res ^= msk;
+ }
+ return res;
+}
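+
+/*
+ * Note: the loop above is a bit-serial carry-less multiply of a and b with
+ * on-the-fly reduction modulo the polynomial x^64 + x^4 + x^3 + x + 1
+ * (low-order terms 0x1b), i.e. multiplication in GF(2^64) as used by the
+ * SNOW 3G f9 (UIA2) EVAL computation.
+ */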
+
+#ifdef AVX2
+/* -------------------------------------------------------------------
+ * ClockLFSR sub-function as defined in snow3g standard
+ * Computes the table-lookup terms of the LFSR feedback word:
+ * S = table_Alpha_div[LFSR[11] & 0xff]
+ *   ^ table_Alpha_mul[LFSR[0] >> 24]
+ * ------------------------------------------------------------------ */
+static void C0_C11_8(__m256i *S, const __m256i *L0, const __m256i *L11)
+{
+ __m256i mask, Sx, B11, B0, offset;
+
+ offset = _mm256_set1_epi32(3);
+ mask = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008, 0xF0F0F00C,
+ 0xF0F0F000, 0xF0F0F004, 0xF0F0F008,
+ 0xF0F0F00C);
+ B11 = _mm256_shuffle_epi8(*L11, mask);
+ *S = _mm256_i32gather_epi32(snow3g_table_A_div, B11, 4);
+
+ mask = _mm256_add_epi32(mask, offset);
+ B0 = _mm256_shuffle_epi8(*L0, mask);
+ Sx = _mm256_i32gather_epi32(snow3g_table_A_mul, B0, 4);
+ *S = _mm256_xor_si256(*S, Sx);
+}
+#endif /* AVX2 */
+
+/* -------------------------------------------------------------------
+ * ClockLFSR sub-function as defined in snow3g standard
+ * Computes the table-lookup terms of the LFSR feedback word:
+ * S = table_Alpha_div[LFSR[11] & 0xff]
+ *   ^ table_Alpha_mul[LFSR[0] >> 24]
+ * ------------------------------------------------------------------ */
+static inline void C0_C11_4(uint32_t *S, const __m128i *L0, const __m128i *L11)
+{
+ unsigned B11[4], B0[4];
+
+ B11[0] = _mm_extract_epi8(*L11, 0);
+ B11[1] = _mm_extract_epi8(*L11, 4);
+ B11[2] = _mm_extract_epi8(*L11, 8);
+ B11[3] = _mm_extract_epi8(*L11, 12);
+
+ S[0] = snow3g_table_A_div[B11[0]];
+ S[1] = snow3g_table_A_div[B11[1]];
+ S[2] = snow3g_table_A_div[B11[2]];
+ S[3] = snow3g_table_A_div[B11[3]];
+
+ B0[0] = _mm_extract_epi8(*L0, 3);
+ B0[1] = _mm_extract_epi8(*L0, 7);
+ B0[2] = _mm_extract_epi8(*L0, 11);
+ B0[3] = _mm_extract_epi8(*L0, 15);
+
+ S[0] ^= snow3g_table_A_mul[B0[0]];
+ S[1] ^= snow3g_table_A_mul[B0[1]];
+ S[2] ^= snow3g_table_A_mul[B0[2]];
+ S[3] ^= snow3g_table_A_mul[B0[3]];
+}
+
+#ifdef AVX2
+/* -------------------------------------------------------------------
+ * ClockLFSR function as defined in snow3g standard
+ * S = table_Alpha_div[LFSR[11] & 0xff]
+ * ^ table_Alpha_mul[LFSR[0] >> 24]
+ * ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8
+ * ------------------------------------------------------------------ */
+static inline void ClockLFSR_8(snow3gKeyState8_t *pCtx)
+{
+ __m256i X2;
+ __m256i S, T, U;
+
+ U = pCtx->LFSR_X[pCtx->iLFSR_X];
+ S = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) % 16];
+
+ C0_C11_8(&X2, &U, &S);
+
+ T = _mm256_slli_epi32(U, 8);
+ S = _mm256_srli_epi32(S, 8);
+ U = _mm256_xor_si256(T, pCtx->LFSR_X[(pCtx->iLFSR_X + 2) % 16]);
+
+ ShiftLFSR_8(pCtx);
+
+ S = _mm256_xor_si256(S, U);
+ S = _mm256_xor_si256(S, X2);
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = S;
+}
+#endif /* AVX2 */
+
+/* -------------------------------------------------------------------
+ * ClockLFSR function as defined in snow3g standard
+ * S = table_Alpha_div[LFSR[11] & 0xff]
+ * ^ table_Alpha_mul[LFSR[0] >> 24]
+ * ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8
+ * ------------------------------------------------------------------ */
+static inline void ClockLFSR_4(snow3gKeyState4_t *pCtx)
+{
+ uint32_t X2[4];
+ __m128i S, T, U;
+
+ U = pCtx->LFSR_X[pCtx->iLFSR_X];
+ S = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) % 16];
+ C0_C11_4(X2, &U, &S);
+
+ T = _mm_slli_epi32(U, 8);
+ S = _mm_srli_epi32(S, 8);
+ U = _mm_xor_si128(T, pCtx->LFSR_X[(pCtx->iLFSR_X + 2) % 16]);
+ ShiftLFSR_4(pCtx);
+
+ /* (SSE4) */
+ T = _mm_insert_epi32(T, X2[0], 0);
+ T = _mm_insert_epi32(T, X2[1], 1);
+ T = _mm_insert_epi32(T, X2[2], 2);
+ T = _mm_insert_epi32(T, X2[3], 3);
+ S = _mm_xor_si128(S, U);
+ S = _mm_xor_si128(S, T);
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = S;
+}
+
+#ifdef AVX2
+/* -------------------------------------------------------------------
+ * ClockFSM function as defined in snow3g standard
+ * 8 packets at a time
+ * ------------------------------------------------------------------ */
+static inline void ClockFSM_8(snow3gKeyState8_t *pCtx, __m256i *data)
+{
+ __m256i F, R, S2T0, S2T1, S2T2, S2T3, S1T0, S1T1, S1T2, S1T3;
+ __m256i w3, w2, w1, w0, offset, mask;
+
+ F = _mm256_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15)%16],
+ pCtx->FSM_X[0]);
+ R = _mm256_xor_si256(pCtx->LFSR_X[(pCtx->iLFSR_X + 5)%16],
+ pCtx->FSM_X[2]);
+ *data = _mm256_xor_si256(F, pCtx->FSM_X[1]);
+ R = _mm256_add_epi32(R, pCtx->FSM_X[1]);
+ offset = _mm256_set1_epi32(0x1);
+
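+ /*
+  * S-box application via gathered T-table lookups: each shuffle mask below
+  * isolates one byte lane of every 32-bit FSM word (a set high bit in the
+  * mask zeroes the other bytes), the byte indexes a 32-bit T-table entry
+  * and the four partial results are XOR'ed together further down.
+  * The S2 tables are applied to FSM R2 and the S1 tables to FSM R1.
+  */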
+ F = pCtx->FSM_X[1];
+ w3 = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008,
+ 0xF0F0F00C, 0xF0F0F000, 0xF0F0F004,
+ 0xF0F0F008, 0xF0F0F00C);
+ mask = _mm256_shuffle_epi8(F,w3);
+ S2T0 = _mm256_i32gather_epi32(S2_T0,mask,4);
+
+ w2 = _mm256_add_epi32(w3,offset);
+ mask = _mm256_shuffle_epi8(F,w2);
+ S2T1 = _mm256_i32gather_epi32(S2_T1,mask,4);
+
+ w1 = _mm256_add_epi32(w2,offset);
+ mask = _mm256_shuffle_epi8(pCtx->FSM_X[1],w1);
+ S2T2 = _mm256_i32gather_epi32(S2_T2,mask,4);
+
+ w0 = _mm256_add_epi32(w1,offset);
+ mask = _mm256_shuffle_epi8(F,w0);
+ S2T3 = _mm256_i32gather_epi32(S2_T3,mask,4);
+
+
+ F = pCtx->FSM_X[0];
+ w3 = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008,
+ 0xF0F0F00C, 0xF0F0F010, 0xF0F0F014,
+ 0xF0F0F018, 0xF0F0F01C);
+ mask = _mm256_shuffle_epi8(F,w3);
+ S1T0 = _mm256_i32gather_epi32(S1_T0,mask,4);
+
+ w2 = _mm256_add_epi32(w3,offset);
+ mask = _mm256_shuffle_epi8(F,w2);
+ S1T1 = _mm256_i32gather_epi32(S1_T1,mask,4);
+
+ w1 = _mm256_add_epi32(w2,offset);
+ mask = _mm256_shuffle_epi8(F,w1);
+ S1T2 = _mm256_i32gather_epi32(S1_T2,mask,4);
+
+ w0 = _mm256_add_epi32(w1,offset);
+ mask = _mm256_shuffle_epi8(F,w0);
+ S1T3 = _mm256_i32gather_epi32(S1_T3,mask,4);
+
+ S2T0 = _mm256_xor_si256(S2T0, S2T1);
+ S2T2 = _mm256_xor_si256(S2T2, S2T3);
+ S2T0 = _mm256_xor_si256(S2T0, S2T2);
+
+ S1T0 = _mm256_xor_si256(S1T0, S1T1);
+ S1T2 = _mm256_xor_si256(S1T2, S1T3);
+ S1T0 = _mm256_xor_si256(S1T0, S1T2);
+
+
+ pCtx->FSM_X[2] = S2T0;
+ pCtx->FSM_X[1] = S1T0;
+ pCtx->FSM_X[0] = R;
+}
+
+#endif /* AVX2 */
+
+/* -------------------------------------------------------------------
+ * ClockFSM function as defined in snow3g standard
+ * 4 packets at a time
+ * ------------------------------------------------------------------ */
+static inline void ClockFSM_4(snow3gKeyState4_t *pCtx, __m128i *data)
+{
+ __m128i F, R;
+#ifdef _WIN32
+#pragma warning(push)
+#pragma warning(disable:4556)
+#endif
+#if defined (NO_AESNI) || defined (SAFE_LOOKUP)
+ uint32_t L = 0;
+#endif
+ uint32_t K = 0;
+
+ F = _mm_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16],
+ pCtx->FSM_X[0]);
+ R = _mm_xor_si128(pCtx->LFSR_X[(pCtx->iLFSR_X + 5) % 16],
+ pCtx->FSM_X[2]);
+ *data = _mm_xor_si128(F, pCtx->FSM_X[1]);
+ R = _mm_add_epi32(R, pCtx->FSM_X[1]);
+#if defined (NO_AESNI) || defined (SAFE_LOOKUP)
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 0);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 1);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 2);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 3);
+#else
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 0);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 1);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 2);
+ S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 3);
+#endif /* NO_AESNI */
+ pCtx->FSM_X[0] = R;
+
+#ifdef _WIN32
+#pragma warning(pop)
+#endif
+}
+
+/**
+*******************************************************************************
+* @description
+* This function generates 4 bytes of keystream for one buffer at a time
+*
+* @param[in] pCtx Context where the scheduled keys are stored
+* @param[in/out] pKeyStream Pointer to generated keystream
+*
+*******************************************************************************/
+static inline void snow3g_keystream_1_4(snow3gKeyState1_t *pCtx,
+ uint32_t *pKeyStream)
+{
+ uint32_t F;
+
+ ClockFSM_1(pCtx, &F);
+ *pKeyStream = F ^ pCtx->LFSR_S[0];
+ ClockLFSR_1(pCtx);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function generates 8 bytes of keystream for one buffer at a time
+*
+* @param[in] pCtx Context where the scheduled keys are stored
+* @param[in/out] pKeyStream Pointer to generated keystream
+*
+*******************************************************************************/
+static inline void snow3g_keystream_1_8(snow3gKeyState1_t *pCtx,
+ uint64_t *pKeyStream)
+{
+ uint64_t F;
+ uint32_t FSM4;
+ uint32_t V0, V1;
+ uint32_t F0, F1;
+ uint32_t R0, R1;
+ uint32_t L0, L1, L11, L12;
+
+ /* Two FSM clocks and two LFSR clocks are merged into a single pass
+  * to avoid redundant work and to reduce instruction dependencies
+  */
+ L0 = pCtx->LFSR_S[0];
+ V0 = pCtx->LFSR_S[2];
+ L1 = pCtx->LFSR_S[1];
+ V1 = pCtx->LFSR_S[3];
+ R1 = pCtx->FSM_R1;
+ L11 = pCtx->LFSR_S[11];
+ L12 = pCtx->LFSR_S[12];
+ V0 ^= snow3g_table_A_mul[L0 >> 24];
+ V1 ^= snow3g_table_A_mul[L1 >> 24];
+ V0 ^= snow3g_table_A_div[L11 & 0xff];
+ V1 ^= snow3g_table_A_div[L12 & 0xff];
+ V0 ^= L0 << 8;
+ V1 ^= L1 << 8;
+ V0 ^= L11 >> 8;
+ V1 ^= L12 >> 8;
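+ /* V0/V1 are the two successive LFSR feedback words (one per merged
+  * clock); F0/F1 below become the two 32-bit keystream words packed
+  * into the returned 64-bit value.
+  */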
+ F0 = pCtx->LFSR_S[15] + R1;
+ F0 ^= L0;
+ F0 ^= pCtx->FSM_R2;
+ R0 = pCtx->FSM_R3 ^ pCtx->LFSR_S[5];
+ R0 += pCtx->FSM_R2;
+ S1_S2_S3_1(pCtx->FSM_R3, pCtx->FSM_R2, R1, FSM4, R0);
+ R1 = pCtx->FSM_R3 ^ pCtx->LFSR_S[6];
+ F1 = V0 + R0;
+ F1 ^= L1;
+ F1 ^= pCtx->FSM_R2;
+ R1 += pCtx->FSM_R2;
+ pCtx->FSM_R3 = Snow3g_S2(pCtx->FSM_R2);
+ pCtx->FSM_R2 = FSM4;
+ pCtx->FSM_R1 = R1;
+
+ /* Shift LFSR twice */
+ ShiftTwiceLFSR_1(pCtx);
+
+ /* keystream mode LFSR update */
+ pCtx->LFSR_S[14] = V0;
+ pCtx->LFSR_S[15] = V1;
+
+ F = F0;
+ F <<= 32;
+ F |= (uint64_t)F1;
+
+ *pKeyStream = F;
+}
+
+#ifdef AVX2
+/**
+*******************************************************************************
+* @description
+* This function generates 8 bytes of keystream for 8 buffers at a time
+*
+* @param[in] pCtx Context where the scheduled keys are stored
+* @param[in/out] pKeyStream Pointer to generated keystream
+*
+*******************************************************************************/
+static inline void snow3g_keystream_8_8(snow3gKeyState8_t *pCtx,
+ __m256i *pKeyStreamLo,
+ __m256i *pKeyStreamHi)
+{
+ __m256i H, L;
+
+ /* first set of 4 bytes */
+ ClockFSM_8(pCtx, &L);
+ L = _mm256_xor_si256(L, pCtx->LFSR_X[pCtx->iLFSR_X]);
+ ClockLFSR_8(pCtx);
+
+ /* second set of 4 bytes */
+ ClockFSM_8(pCtx, &H);
+ H = _mm256_xor_si256(H, pCtx->LFSR_X[pCtx->iLFSR_X]);
+ ClockLFSR_8(pCtx);
+
+ /* merge the 2 sets */
+ *pKeyStreamLo = _mm256_unpacklo_epi32(H, L);
+ *pKeyStreamHi = _mm256_unpackhi_epi32(H, L);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function generates 4 bytes of keystream for 8 buffers at a time
+*
+* @param[in] pCtx Context where the scheduled keys are stored
+* @param[in/out] pKeyStream Pointer to generated keystream
+*
+*******************************************************************************/
+static inline void snow3g_keystream_8_4(snow3gKeyState8_t *pCtx,
+ __m256i *pKeyStream)
+{
+ __m256i F;
+
+ ClockFSM_8(pCtx, &F);
+ *pKeyStream = _mm256_xor_si256(F, pCtx->LFSR_X[pCtx->iLFSR_X]);
+ ClockLFSR_8(pCtx);
+}
+
+/**
+*****************************************************************************
+* @description
+* This function generates 32 bytes of keystream for 8 buffers at a time
+*
+* @param[in] pCtx Context where the scheduled keys are stored
+* @param[in/out] pKeyStream Array of generated keystreams
+*
+******************************************************************************/
+static inline void snow3g_keystream_8_32(snow3gKeyState8_t *pCtx,
+ __m256i *pKeyStream)
+{
+
+ __m256i temp[8];
+
+ /** produces the next 4 bytes for each buffer */
+ int i;
+
+ /** Byte reversal on each KS */
+ __m256i mask1 = {0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL,
+ 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL};
+ /** Reversal, shifted 4 bytes right */
+ __m256i mask2 = {0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL,
+ 0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL};
+ /** Reversal, shifted 8 bytes right */
+ __m256i mask3 = {0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL,
+ 0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL};
+ /** Reversal, shifted 12 bytes right */
+ __m256i mask4 = {0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL,
+ 0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL};
+
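+ /* The byte-reversal masks above also rotate the data within each
+  * 128-bit lane by 0/4/8/12 bytes; together with the blends and dword
+  * shuffles below this transposes the eight 4-word outputs so that each
+  * 128-bit slice holds the consecutive keystream words of one buffer.
+  */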
+ snow3g_keystream_8_4(pCtx, &temp[0]);
+ snow3g_keystream_8_4(pCtx, &temp[1]);
+ snow3g_keystream_8_4(pCtx, &temp[2]);
+ snow3g_keystream_8_4(pCtx, &temp[3]);
+ snow3g_keystream_8_4(pCtx, &temp[4]);
+ snow3g_keystream_8_4(pCtx, &temp[5]);
+ snow3g_keystream_8_4(pCtx, &temp[6]);
+ snow3g_keystream_8_4(pCtx, &temp[7]);
+
+ temp[0] = _mm256_shuffle_epi8(temp[0], mask1);
+ temp[1] = _mm256_shuffle_epi8(temp[1], mask2);
+ temp[2] = _mm256_shuffle_epi8(temp[2], mask3);
+ temp[3] = _mm256_shuffle_epi8(temp[3], mask4);
+ temp[4] = _mm256_shuffle_epi8(temp[4], mask1);
+ temp[5] = _mm256_shuffle_epi8(temp[5], mask2);
+ temp[6] = _mm256_shuffle_epi8(temp[6], mask3);
+ temp[7] = _mm256_shuffle_epi8(temp[7], mask4);
+
+ __m256i blended[8];
+ /* blends KS together: 128bit slice consists
+ of 4 32-bit words for one packet */
+ blended[0] = _mm256_blend_epi32(temp[0], temp[1], 0xaa);
+ blended[1] = _mm256_blend_epi32(temp[0], temp[1], 0x55);
+ blended[2] = _mm256_blend_epi32(temp[2], temp[3], 0xaa);
+ blended[3] = _mm256_blend_epi32(temp[2], temp[3], 0x55);
+ blended[4] = _mm256_blend_epi32(temp[4], temp[5], 0xaa);
+ blended[5] = _mm256_blend_epi32(temp[4], temp[5], 0x55);
+ blended[6] = _mm256_blend_epi32(temp[6], temp[7], 0xaa);
+ blended[7] = _mm256_blend_epi32(temp[6], temp[7], 0x55);
+
+ temp[0] = _mm256_blend_epi32(blended[0], blended[2], 0xcc);
+ temp[1] = _mm256_blend_epi32(blended[1], blended[3], 0x99);
+ temp[2] = _mm256_blend_epi32(blended[0], blended[2], 0x33);
+ temp[3] = _mm256_blend_epi32(blended[1], blended[3], 0x66);
+ temp[4] = _mm256_blend_epi32(blended[4], blended[6], 0xcc);
+ temp[5] = _mm256_blend_epi32(blended[5], blended[7], 0x99);
+ temp[6] = _mm256_blend_epi32(blended[4], blended[6], 0x33);
+ temp[7] = _mm256_blend_epi32(blended[5], blended[7], 0x66);
+
+ /** sorts 32 bit words back into order */
+ blended[0] = temp[0];
+ blended[1] = _mm256_shuffle_epi32(temp[1], 0x39);
+ blended[2] = _mm256_shuffle_epi32(temp[2], 0x4e);
+ blended[3] = _mm256_shuffle_epi32(temp[3], 0x93);
+ blended[4] = temp[4];
+ blended[5] = _mm256_shuffle_epi32(temp[5], 0x39);
+ blended[6] = _mm256_shuffle_epi32(temp[6], 0x4e);
+ blended[7] = _mm256_shuffle_epi32(temp[7], 0x93);
+
+ for (i = 0; i < 4; i++) {
+ pKeyStream[i] = _mm256_permute2x128_si256(blended[i],
+ blended[i + 4], 0x20);
+ pKeyStream[i + 4] = _mm256_permute2x128_si256(
+ blended[i], blended[i + 4], 0x31);
+ }
+}
+
+#endif /* AVX2 */
+
+/**
+*******************************************************************************
+* @description
+* This function generates 4 bytes of keystream for 4 buffers at a time
+*
+* @param[in] pCtx Context where the scheduled keys are stored
+* @param[in/out] pKeyStream Pointer to generated keystream
+*
+*******************************************************************************/
+static inline void snow3g_keystream_4_4(snow3gKeyState4_t *pCtx,
+ __m128i *pKeyStream)
+{
+ __m128i F;
+
+ ClockFSM_4(pCtx, &F);
+ *pKeyStream = _mm_xor_si128(F, pCtx->LFSR_X[pCtx->iLFSR_X]);
+ ClockLFSR_4(pCtx);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function generates 8 bytes of keystream for 4 buffers at a time
+*
+* @param[in] pCtx Context where the scheduled keys are stored
+* @param[in/out] pKeyStreamLo Pointer to lower end of generated keystream
+* @param[in/out] pKeyStreamHi Pointer to higher end of generated keystream
+*
+*******************************************************************************/
+static inline void snow3g_keystream_4_8(snow3gKeyState4_t *pCtx,
+ __m128i *pKeyStreamLo,
+ __m128i *pKeyStreamHi)
+{
+ __m128i H, L;
+
+ /* first set of 4 bytes */
+ ClockFSM_4(pCtx, &L);
+ L = _mm_xor_si128(L, pCtx->LFSR_X[pCtx->iLFSR_X]);
+ ClockLFSR_4(pCtx);
+
+ /* second set of 4 bytes */
+ ClockFSM_4(pCtx, &H);
+ H = _mm_xor_si128(H, pCtx->LFSR_X[pCtx->iLFSR_X]);
+ ClockLFSR_4(pCtx);
+
+ /* merge the 2 sets */
+ *pKeyStreamLo = _mm_unpacklo_epi32(H, L);
+ *pKeyStreamHi = _mm_unpackhi_epi32(H, L);
+}
+
+/**
+*******************************************************************************
+* @description
+* This function initializes the key schedule for 4 buffers for snow3g f8/f9.
+*
+* @param [in] pCtx Context where the scheduled keys are stored
+* @param [in] pKeySched Key schedule
+* @param [in] pIV1 IV for buffer 1
+* @param [in] pIV2 IV for buffer 2
+* @param [in] pIV3 IV for buffer 3
+* @param [in] pIV4 IV for buffer 4
+*
+*******************************************************************************/
+static inline void
+snow3gStateInitialize_4(snow3gKeyState4_t *pCtx,
+ const snow3g_key_schedule_t *pKeySched,
+ const void *pIV1, const void *pIV2,
+ const void *pIV3, const void *pIV4)
+{
+ uint32_t K, L;
+ int i;
+ __m128i R, S, T, U;
+ __m128i V0, V1, T0, T1;
+
+ /* Initialize the LFSR table from constants, Keys, and IV */
+
+ /* Load complete 128b IV into register (SSE2)*/
+ uint64_t sm[2] = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL};
+ __m128i *swapMask = (__m128i *) sm;
+
+ R = _mm_loadu_si128((const __m128i *)pIV1);
+ S = _mm_loadu_si128((const __m128i *)pIV2);
+ T = _mm_loadu_si128((const __m128i *)pIV3);
+ U = _mm_loadu_si128((const __m128i *)pIV4);
+
+ /* initialize the array block (SSE4) */
+ for (i = 0; i < 4; i++) {
+ K = pKeySched->k[i];
+ L = ~K;
+ V0 = _mm_set1_epi32(K);
+ V1 = _mm_set1_epi32(L);
+ pCtx->LFSR_X[i + 4] = V0;
+ pCtx->LFSR_X[i + 12] = V0;
+ pCtx->LFSR_X[i + 0] = V1;
+ pCtx->LFSR_X[i + 8] = V1;
+ }
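+ /* This follows the SNOW 3G key loading: LFSR words 0-3 and 8-11 get
+  * the bitwise complement of the key words, words 4-7 and 12-15 get the
+  * key words themselves; the IVs are XOR'ed into words 9, 10, 12 and 15
+  * below.
+  */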
+ /* Update the schedule structure with IVs */
+ /* Store the 4 IVs in LFSR by a column/row matrix swap
+ * after endianness correction */
+
+ /* endianness swap (SSSE3) */
+ R = _mm_shuffle_epi8(R, *swapMask);
+ S = _mm_shuffle_epi8(S, *swapMask);
+ T = _mm_shuffle_epi8(T, *swapMask);
+ U = _mm_shuffle_epi8(U, *swapMask);
+
+ /* row/column dword inversion (SSE2) */
+ T0 = _mm_unpacklo_epi32(R, S);
+ R = _mm_unpackhi_epi32(R, S);
+ T1 = _mm_unpacklo_epi32(T, U);
+ T = _mm_unpackhi_epi32(T, U);
+
+ /* row/column qword inversion (SSE2) */
+ U = _mm_unpackhi_epi64(R, T);
+ T = _mm_unpacklo_epi64(R, T);
+ S = _mm_unpackhi_epi64(T0, T1);
+ R = _mm_unpacklo_epi64(T0, T1);
+
+ /*IV ^ LFSR (SSE2) */
+ pCtx->LFSR_X[15] = _mm_xor_si128(pCtx->LFSR_X[15], U);
+ pCtx->LFSR_X[12] = _mm_xor_si128(pCtx->LFSR_X[12], T);
+ pCtx->LFSR_X[10] = _mm_xor_si128(pCtx->LFSR_X[10], S);
+ pCtx->LFSR_X[9] = _mm_xor_si128(pCtx->LFSR_X[9], R);
+ pCtx->iLFSR_X = 0;
+ /* FSM initialization (SSE2) */
+ S = _mm_setzero_si128();
+ for (i = 0; i < 3; i++)
+ pCtx->FSM_X[i] = S;
+
+ /* Initialisation rounds */
+ for (i = 0; i < 32; i++) {
+ ClockFSM_4(pCtx, &S);
+ ClockLFSR_4(pCtx);
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm_xor_si128(
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], S);
+ }
+}
+
+#ifdef AVX2
+/**
+*******************************************************************************
+* @description
+* This function initializes the key schedule for 8 buffers with
+* individual keys, for snow3g f8/f9.
+*
+* @param [in] pCtx      Context where scheduled keys are stored
+* @param [in] KeySched  Array of 8 key schedule pointers, one per buffer
+* @param [in] pIV       Array of 8 IV pointers, one per buffer
+*
+*******************************************************************************/
+static inline void
+snow3gStateInitialize_8_multiKey(snow3gKeyState8_t *pCtx,
+ const snow3g_key_schedule_t * const KeySched[],
+ const void * const pIV[])
+{
+ DECLARE_ALIGNED(uint32_t k[8], 32);
+ DECLARE_ALIGNED(uint32_t l[8], 32);
+ __m256i *K = (__m256i *)k;
+ __m256i *L = (__m256i *)l;
+
+ int i, j;
+ __m256i mR, mS, mT, mU, T0, T1;
+
+ /* Initialize the LFSR table from constants, Keys, and IV */
+
+ /* Load complete 256b IV into register (SSE2)*/
+ __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
+ 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL};
+ mR = _mm256_loadu2_m128i((const __m128i *)pIV[4],
+ (const __m128i *)pIV[0]);
+ mS = _mm256_loadu2_m128i((const __m128i *)pIV[5],
+ (const __m128i *)pIV[1]);
+ mT = _mm256_loadu2_m128i((const __m128i *)pIV[6],
+ (const __m128i *)pIV[2]);
+ mU = _mm256_loadu2_m128i((const __m128i *)pIV[7],
+ (const __m128i *)pIV[3]);
+
+ /* initialize the array block (SSE4) */
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 8; j++) {
+ k[j] = KeySched[j]->k[i];
+ l[j] = ~k[j];
+ }
+
+ pCtx->LFSR_X[i + 4] = *K;
+ pCtx->LFSR_X[i + 12] = *K;
+ pCtx->LFSR_X[i + 0] = *L;
+ pCtx->LFSR_X[i + 8] = *L;
+ }
+
+ /* Update the schedule structure with IVs */
+ /* Store the 4 IVs in LFSR by a column/row matrix swap
+ * after endianness correction */
+
+ /* endianness swap (SSSE3) */
+ mR = _mm256_shuffle_epi8(mR, swapMask);
+ mS = _mm256_shuffle_epi8(mS, swapMask);
+ mT = _mm256_shuffle_epi8(mT, swapMask);
+ mU = _mm256_shuffle_epi8(mU, swapMask);
+
+ /* row/column dword inversion (SSE2) */
+ T0 = _mm256_unpacklo_epi32(mR, mS);
+ mR = _mm256_unpackhi_epi32(mR, mS);
+ T1 = _mm256_unpacklo_epi32(mT, mU);
+ mT = _mm256_unpackhi_epi32(mT, mU);
+
+ /* row/column qword inversion (SSE2) */
+ mU = _mm256_unpackhi_epi64(mR, mT);
+ mT = _mm256_unpacklo_epi64(mR, mT);
+ mS = _mm256_unpackhi_epi64(T0, T1);
+ mR = _mm256_unpacklo_epi64(T0, T1);
+
+ /*IV ^ LFSR (SSE2) */
+ pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU);
+ pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT);
+ pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS);
+ pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR);
+ pCtx->iLFSR_X = 0;
+ /* FSM initialization (SSE2) */
+ mS = _mm256_setzero_si256();
+ for (i = 0; i < 3; i++)
+ pCtx->FSM_X[i] = mS;
+
+ /* Initialisation rounds */
+ for (i = 0; i < 32; i++) {
+ ClockFSM_8(pCtx, &mS);
+ ClockLFSR_8(pCtx);
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256(
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS);
+ }
+}
+
+/**
+*******************************************************************************
+* @description
+* This function initializes the key schedule for 8 buffers for snow3g f8/f9.
+*
+* @param [in] pCtx Context where the scheduled keys are stored
+* @param [in] pKeySched Key schedule
+* @param [in] pIV1 IV for buffer 1
+* @param [in] pIV2 IV for buffer 2
+* @param [in] pIV3 IV for buffer 3
+* @param [in] pIV4 IV for buffer 4
+* @param [in] pIV5 IV for buffer 5
+* @param [in] pIV6 IV for buffer 6
+* @param [in] pIV7 IV for buffer 7
+* @param [in] pIV8 IV for buffer 8
+*
+*******************************************************************************/
+static inline void
+snow3gStateInitialize_8(snow3gKeyState8_t *pCtx,
+ const snow3g_key_schedule_t *pKeySched,
+ const void *pIV1, const void *pIV2,
+ const void *pIV3, const void *pIV4,
+ const void *pIV5, const void *pIV6,
+ const void *pIV7, const void *pIV8)
+{
+ uint32_t K, L;
+ int i;
+ __m256i mR, mS, mT, mU, V0, V1, T0, T1;
+
+ /* Initialize the LFSR table from constants, Keys, and IV */
+
+ /* Load complete 256b IV into register (SSE2)*/
+ __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
+ 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL};
+ mR = _mm256_loadu2_m128i((const __m128i *)pIV5, (const __m128i *)pIV1);
+ mS = _mm256_loadu2_m128i((const __m128i *)pIV6, (const __m128i *)pIV2);
+ mT = _mm256_loadu2_m128i((const __m128i *)pIV7, (const __m128i *)pIV3);
+ mU = _mm256_loadu2_m128i((const __m128i *)pIV8, (const __m128i *)pIV4);
+
+ /* initialize the array block (SSE4) */
+ for (i = 0; i < 4; i++) {
+ K = pKeySched->k[i];
+ L = ~K;
+ V0 = _mm256_set1_epi32(K);
+ V1 = _mm256_set1_epi32(L);
+ pCtx->LFSR_X[i + 4] = V0;
+ pCtx->LFSR_X[i + 12] = V0;
+ pCtx->LFSR_X[i + 0] = V1;
+ pCtx->LFSR_X[i + 8] = V1;
+ }
+
+ /* Update the schedule structure with IVs */
+ /* Store the 4 IVs in LFSR by a column/row matrix swap
+ * after endianness correction */
+
+ /* endianness swap (SSSE3) */
+ mR = _mm256_shuffle_epi8(mR, swapMask);
+ mS = _mm256_shuffle_epi8(mS, swapMask);
+ mT = _mm256_shuffle_epi8(mT, swapMask);
+ mU = _mm256_shuffle_epi8(mU, swapMask);
+
+ /* row/column dword inversion (SSE2) */
+ T0 = _mm256_unpacklo_epi32(mR, mS);
+ mR = _mm256_unpackhi_epi32(mR, mS);
+ T1 = _mm256_unpacklo_epi32(mT, mU);
+ mT = _mm256_unpackhi_epi32(mT, mU);
+
+ /* row/column qword inversion (SSE2) */
+ mU = _mm256_unpackhi_epi64(mR, mT);
+ mT = _mm256_unpacklo_epi64(mR, mT);
+ mS = _mm256_unpackhi_epi64(T0, T1);
+ mR = _mm256_unpacklo_epi64(T0, T1);
+
+ /*IV ^ LFSR (SSE2) */
+ pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU);
+ pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT);
+ pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS);
+ pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR);
+ pCtx->iLFSR_X = 0;
+ /* FSM initialization (SSE2) */
+ mS = _mm256_setzero_si256();
+ for (i = 0; i < 3; i++)
+ pCtx->FSM_X[i] = mS;
+
+ /* Initialisation rounds */
+ for (i = 0; i < 32; i++) {
+ ClockFSM_8(pCtx, &mS);
+ ClockLFSR_8(pCtx);
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256(
+ pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS);
+ }
+}
+#endif /* AVX2 */
+
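+/**
+*******************************************************************************
+* @description
+* Preserves the bits of the output buffer beyond the valid bit length.
+* The trailing bits of the keystream (and of the input copy, when operating
+* out-of-place) are cleared, and the existing trailing bits of the output
+* are merged into the keystream so that the final XOR leaves them unchanged.
+*
+* @param[in/out] KS          Pointer to the 8-byte keystream word
+* @param[in]     pcBufferOut Output buffer
+* @param[in]     pcBufferIn  Input buffer
+* @param[in/out] safeOutBuf  Scratch copy of the output tail
+* @param[in/out] safeInBuf   Scratch copy of the input tail
+* @param[in]     bit_len     Valid bit length within the last 8-byte block
+* @param[in]     byte_len    Number of bytes covered by bit_len
+*
+*******************************************************************************/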
+static inline void
+preserve_bits(uint64_t *KS,
+ const uint8_t *pcBufferOut, const uint8_t *pcBufferIn,
+ SafeBuf *safeOutBuf, SafeBuf *safeInBuf,
+ const uint8_t bit_len, const uint8_t byte_len)
+{
+ const uint64_t mask = UINT64_MAX << (SNOW3G_BLOCK_SIZE * 8 - bit_len);
+
+ /* Clear the last bits of the keystream and the input
+ * (input only in out-of-place case) */
+ *KS &= mask;
+ if (pcBufferIn != pcBufferOut) {
+ const uint64_t swapMask = BSWAP64(mask);
+
+ safeInBuf->b64 &= swapMask;
+
+ /*
+ * Merge the last bits from the output, to be preserved,
+ * in the keystream, to be XOR'd with the input
+ * (which last bits are 0, maintaining the output bits)
+ */
+ memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len);
+ *KS |= BSWAP64(safeOutBuf->b64 & ~swapMask);
+ }
+}
+
+/**
+*******************************************************************************
+* @description
+* This function is the core snow3g bit algorithm
+* for the 3GPP confidentiality algorithm
+*
+* @param[in] pCtx Context where the scheduled keys are stored
+* @param[in] pBufferIn Input buffer
+* @param[out] pBufferOut Output buffer
+* @param[in] cipherLengthInBits length in bits of the data to be encrypted
+* @param[in] bitOffset offset in input buffer, where data are valid
+*
+*******************************************************************************/
+static inline void f8_snow3g_bit(snow3gKeyState1_t *pCtx,
+ const void *pIn,
+ void *pOut,
+ const uint32_t lengthInBits,
+ const uint32_t offsetInBits)
+{
+ const uint8_t *pBufferIn = pIn;
+ uint8_t *pBufferOut = pOut;
+ uint32_t cipherLengthInBits = lengthInBits;
+ uint64_t shiftrem = 0;
+ uint64_t KS8, KS8bit; /* 8 bytes of keystream */
+ const uint8_t *pcBufferIn = pBufferIn + (offsetInBits / 8);
+ uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8);
+ /* Offset into the first byte (0 - 7 bits) */
+ uint32_t remainOffset = offsetInBits % 8;
+ uint32_t byteLength = (cipherLengthInBits + 7) / 8;
+ SafeBuf safeInBuf = {0};
+ SafeBuf safeOutBuf = {0};
+
+ /* Now run the block cipher */
+
+ /* Start with potential partial block (due to offset and length) */
+ snow3g_keystream_1_8(pCtx, &KS8);
+ KS8bit = KS8 >> remainOffset;
+ /* Only one block to encrypt */
+ if (cipherLengthInBits < (64 - remainOffset)) {
+ byteLength = (cipherLengthInBits + 7) / 8;
+ memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength);
+ /*
+ * If operation is Out-of-place and there is offset
+ * to be applied, "remainOffset" bits from the output buffer
+ * need to be preserved (only applicable to first byte,
+ * since remainOffset is up to 7 bits)
+ */
+ if ((pIn != pOut) && remainOffset) {
+ const uint8_t mask8 = (uint8_t)
+ (1 << (8 - remainOffset)) - 1;
+
+ safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+ (pcBufferOut[0] & ~mask8);
+ }
+ /* If last byte is a partial byte, the last bits of the output
+ * need to be preserved */
+ const uint8_t bitlen_with_off = remainOffset +
+ cipherLengthInBits;
+
+ if ((bitlen_with_off & 0x7) != 0)
+ preserve_bits(&KS8bit, pcBufferOut, pcBufferIn,
+ &safeOutBuf, &safeInBuf,
+ bitlen_with_off, byteLength);
+
+ xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit);
+ memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength);
+ return;
+ }
+ /*
+ * If operation is Out-of-place and there is offset
+ * to be applied, "remainOffset" bits from the output buffer
+ * need to be preserved (only applicable to first byte,
+ * since remainOffset is up to 7 bits)
+ */
+ if ((pIn != pOut) && remainOffset) {
+ const uint8_t mask8 = (uint8_t)(1 << (8 - remainOffset)) - 1;
+
+ memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8);
+ safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+ (pcBufferOut[0] & ~mask8);
+ xor_keystrm_rev(pcBufferOut, safeInBuf.b8, KS8bit);
+ pcBufferIn += SNOW3G_BLOCK_SIZE;
+ } else {
+ /* At least 64 bits to produce (including offset) */
+ pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, KS8bit);
+ }
+
+ if (remainOffset != 0)
+ shiftrem = KS8 << (64 - remainOffset);
+ cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8 - remainOffset;
+ pcBufferOut += SNOW3G_BLOCK_SIZE;
+
+ while (cipherLengthInBits) {
+ /* produce the next block of keystream */
+ snow3g_keystream_1_8(pCtx, &KS8);
+ KS8bit = (KS8 >> remainOffset) | shiftrem;
+ if (remainOffset != 0)
+ shiftrem = KS8 << (64 - remainOffset);
+ if (cipherLengthInBits >= SNOW3G_BLOCK_SIZE * 8) {
+ pcBufferIn = xor_keystrm_rev(pcBufferOut,
+ pcBufferIn, KS8bit);
+ cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8;
+ pcBufferOut += SNOW3G_BLOCK_SIZE;
+ /* loop variant */
+ } else {
+ /* end of the loop, handle the last bytes */
+ byteLength = (cipherLengthInBits + 7) / 8;
+ memcpy_keystrm(safeInBuf.b8, pcBufferIn,
+ byteLength);
+
+ /* If last byte is a partial byte, the last bits
+ * of the output need to be preserved */
+ if ((cipherLengthInBits & 0x7) != 0)
+ preserve_bits(&KS8bit, pcBufferOut, pcBufferIn,
+ &safeOutBuf, &safeInBuf,
+ cipherLengthInBits, byteLength);
+
+ xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit);
+ memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength);
+ cipherLengthInBits = 0;
+ }
+ }
+#ifdef SAFE_DATA
+ CLEAR_VAR(&KS8, sizeof(KS8));
+ CLEAR_VAR(&KS8bit, sizeof(KS8bit));
+ CLEAR_MEM(&safeInBuf, sizeof(safeInBuf));
+ CLEAR_MEM(&safeOutBuf, sizeof(safeOutBuf));
+#endif
+}
+
+/**
+*******************************************************************************
+* @description
+* This function is the core snow3g algorithm for
+* the 3GPP confidentiality and integrity algorithm.
+*
+* @param[in] pCtx Context where the scheduled keys are stored
+* @param[in] pIn Input buffer
+* @param[out] pOut Output buffer
+* @param[in] lengthInBytes length in bytes of the data to be encrypted
+*
+*******************************************************************************/
+static inline void f8_snow3g(snow3gKeyState1_t *pCtx,
+ const void *pIn,
+ void *pOut,
+ const uint32_t lengthInBytes)
+{
+ uint32_t qwords = lengthInBytes / SNOW3G_8_BYTES; /* number of qwords */
+ uint32_t words = lengthInBytes & 4; /* remaining word if not 0 */
+ uint32_t bytes = lengthInBytes & 3; /* remaining bytes */
+ uint32_t KS4; /* 4 bytes of keystream */
+ uint64_t KS8; /* 8 bytes of keystream */
+ const uint8_t *pBufferIn = pIn;
+ uint8_t *pBufferOut = pOut;
+
+ /* process 64 bits at a time */
+ while (qwords--) {
+ /* generate keystream 8 bytes at a time */
+ snow3g_keystream_1_8(pCtx, &KS8);
+
+ /* xor keystream 8 bytes at a time */
+ pBufferIn = xor_keystrm_rev(pBufferOut, pBufferIn, KS8);
+ pBufferOut += SNOW3G_8_BYTES;
+ }
+
+ /* check for remaining 0 to 7 bytes */
+ if (0 != words) {
+ if (bytes) {
+ /* 5 to 7 last bytes, process 8 bytes */
+ uint8_t buftemp[8];
+ uint8_t safeBuff[8];
+
+ memset(safeBuff, 0, SNOW3G_8_BYTES);
+ snow3g_keystream_1_8(pCtx, &KS8);
+ memcpy_keystrm(safeBuff, pBufferIn, 4 + bytes);
+ xor_keystrm_rev(buftemp, safeBuff, KS8);
+ memcpy_keystrm(pBufferOut, buftemp, 4 + bytes);
+#ifdef SAFE_DATA
+ CLEAR_MEM(&safeBuff, sizeof(safeBuff));
+ CLEAR_MEM(&buftemp, sizeof(buftemp));
+#endif
+ } else {
+ /* exactly 4 last bytes */
+ snow3g_keystream_1_4(pCtx, &KS4);
+ xor_keystream_reverse_32(pBufferOut, pBufferIn, KS4);
+ }
+ } else if (0 != bytes) {
+ /* 1 to 3 last bytes */
+ uint8_t buftemp[4];
+ uint8_t safeBuff[4];
+
+ memset(safeBuff, 0, SNOW3G_4_BYTES);
+ snow3g_keystream_1_4(pCtx, &KS4);
+ memcpy_keystream_32(safeBuff, pBufferIn, bytes);
+ xor_keystream_reverse_32(buftemp, safeBuff, KS4);
+ memcpy_keystream_32(pBufferOut, buftemp, bytes);
+#ifdef SAFE_DATA
+ CLEAR_MEM(&safeBuff, sizeof(safeBuff));
+ CLEAR_MEM(&buftemp, sizeof(buftemp));
+#endif
+ }
+
+#ifdef SAFE_DATA
+ CLEAR_VAR(&KS4, sizeof(KS4));
+ CLEAR_VAR(&KS8, sizeof(KS8));
+#endif
+}
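+/*
+ * Note (illustrative comment, not part of the original sources): f8_snow3g()
+ * splits the length as qwords = len / 8, words = len & 4, bytes = len & 3.
+ * For example lengthInBytes = 13 gives qwords = 1 (one full 64-bit keystream
+ * block), words = 4 and bytes = 1, so the trailing 5 bytes take the
+ * "5 to 7 last bytes" branch, which stages the data in a stack buffer to
+ * avoid reading or writing past the end of the caller's buffers.
+ */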
+
+#ifdef AVX2
+/**
+*******************************************************************************
+* @description
+* This function converts the state from an 8 buffer state structure to a 1
+* buffer state structure.
+*
+* @param[in] pSrcState Pointer to the source state
+* @param[out] pDstState Pointer to the destination state
+* @param[in] NumBuffers Index of the buffer (lane) to extract (0 to 7)
+*
+*******************************************************************************/
+static inline void snow3gStateConvert_8(snow3gKeyState8_t *pSrcState,
+ snow3gKeyState1_t *pDstState,
+ uint32_t NumBuffers)
+{
+ uint32_t T = 0, iLFSR_X = pSrcState->iLFSR_X;
+ __m256i *LFSR_X = pSrcState->LFSR_X;
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ switch (NumBuffers) {
+ case 0:
+ T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 0);
+ break;
+ case 1:
+ T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 1);
+ break;
+ case 2:
+ T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 2);
+ break;
+ case 3:
+ T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 3);
+ break;
+ case 4:
+ T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 4);
+ break;
+ case 5:
+ T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 5);
+ break;
+ case 6:
+ T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 6);
+ break;
+ case 7:
+ T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 7);
+ break;
+ }
+ pDstState->LFSR_S[i] = T;
+ }
+ i = 0;
+ switch (NumBuffers) {
+ case 0:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0);
+ break;
+ case 1:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1);
+ break;
+ case 2:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2);
+ break;
+ case 3:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3);
+ break;
+ case 4:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4);
+ break;
+ case 5:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5);
+ break;
+ case 6:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 6);
+ break;
+ case 7:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7);
+ break;
+ }
+ pDstState->FSM_R1 = T;
+
+ i = 1;
+ switch (NumBuffers) {
+ case 0:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0);
+ break;
+ case 1:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1);
+ break;
+ case 2:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2);
+ break;
+ case 3:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3);
+ break;
+ case 4:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4);
+ break;
+ case 5:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5);
+ break;
+ case 6:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 6);
+ break;
+ case 7:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7);
+ break;
+ }
+ pDstState->FSM_R2 = T;
+
+ i = 2;
+ switch (NumBuffers) {
+ case 0:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0);
+ break;
+ case 1:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1);
+ break;
+ case 2:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2);
+ break;
+ case 3:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3);
+ break;
+ case 4:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4);
+ break;
+ case 5:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5);
+ break;
+ case 6:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 6);
+ break;
+ case 7:
+ T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7);
+ break;
+ }
+ pDstState->FSM_R3 = T;
+}
+#endif /* AVX2 */
+
+/**
+*******************************************************************************
+* @description
+* This function converts the state from a 4 buffer state structure to 1
+* buffer state structure.
+*
+* @param[in] pSrcState Pointer to the source state
+* @param[out] pDstState Pointer to the destination state
+* @param[in] NumBuffers Index of the buffer (lane) to extract (0 to 3)
+*
+*******************************************************************************/
+static inline void snow3gStateConvert_4(snow3gKeyState4_t *pSrcState,
+ snow3gKeyState1_t *pDstState,
+ uint32_t NumBuffers)
+{
+ uint32_t i;
+ uint32_t T = 0, iLFSR_X = pSrcState->iLFSR_X;
+ __m128i *LFSR_X = pSrcState->LFSR_X;
+
+ for (i = 0; i < 16; i++) {
+ switch (NumBuffers) {
+ case 0:
+ T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 0);
+ break;
+ case 1:
+ T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 1);
+ break;
+ case 2:
+ T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 2);
+ break;
+ case 3:
+ T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 3);
+ break;
+ }
+ pDstState->LFSR_S[i] = T;
+ }
+
+ i = 0;
+ switch (NumBuffers) {
+ case 0:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 0);
+ break;
+ case 1:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 1);
+ break;
+ case 2:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 2);
+ break;
+ case 3:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 3);
+ break;
+ }
+ pDstState->FSM_R1 = T;
+
+ i = 1;
+ switch (NumBuffers) {
+ case 0:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 0);
+ break;
+ case 1:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 1);
+ break;
+ case 2:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 2);
+ break;
+ case 3:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 3);
+ break;
+ }
+ pDstState->FSM_R2 = T;
+
+ i = 2;
+ switch (NumBuffers) {
+ case 0:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 0);
+ break;
+ case 1:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 1);
+ break;
+ case 2:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 2);
+ break;
+ case 3:
+ T = _mm_extract_epi32(pSrcState->FSM_X[i], 3);
+ break;
+ }
+ pDstState->FSM_R3 = T;
+}
+
+/*---------------------------------------------------------
+ * f8()
+ * Initializations and Context size definitions
+ *---------------------------------------------------------*/
+size_t SNOW3G_KEY_SCHED_SIZE(void) { return sizeof(snow3g_key_schedule_t); }
+
+int SNOW3G_INIT_KEY_SCHED(const void *pKey, snow3g_key_schedule_t *pCtx)
+{
+#ifdef SAFE_PARAM
+ if ((pKey == NULL) || (pCtx == NULL))
+ return -1;
+#endif
+
+ const uint32_t *pKey32 = pKey;
+
+ pCtx->k[3] = BSWAP32(pKey32[0]);
+ pCtx->k[2] = BSWAP32(pKey32[1]);
+ pCtx->k[1] = BSWAP32(pKey32[2]);
+ pCtx->k[0] = BSWAP32(pKey32[3]);
+
+ return 0;
+}
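+/*
+ * Note (illustrative comment, not part of the original sources): the 128-bit
+ * key is stored word-reversed and byte-swapped, i.e. k[3] receives the first
+ * four key bytes interpreted as a big-endian word and k[0] the last four,
+ * which is the layout the state initialization routines expect.
+ */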
+
+/*---------------------------------------------------------
+ * @description
+ * Snow3G F8 1 buffer:
+ * Single buffer enc/dec with IV and precomputed key schedule
+ *---------------------------------------------------------*/
+void SNOW3G_F8_1_BUFFER(const snow3g_key_schedule_t *pHandle,
+ const void *pIV,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t lengthInBytes)
+{
+#ifdef SAFE_PARAM
+ if ((pHandle == NULL) || (pIV == NULL) ||
+ (pBufferIn == NULL) || (pBufferOut == NULL) ||
+ (lengthInBytes == 0) || (lengthInBytes > SNOW3G_MAX_BYTELEN))
+ return;
+#endif
+ snow3gKeyState1_t ctx;
+ uint32_t KS4; /* 4 bytes of keystream */
+
+ /* Initialize the schedule from the IV */
+ snow3gStateInitialize_1(&ctx, pHandle, pIV);
+
+ /* Clock FSM and LFSR once, ignore the keystream */
+ snow3g_keystream_1_4(&ctx, &KS4);
+
+ f8_snow3g(&ctx, pBufferIn, pBufferOut, lengthInBytes);
+
+#ifdef SAFE_DATA
+ CLEAR_VAR(&KS4, sizeof(KS4));
+ CLEAR_MEM(&ctx, sizeof(ctx));
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
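+/*
+ * Illustrative usage sketch (not part of the original sources): a minimal
+ * caller of the single-buffer F8 path defined above. Key, IV and buffer
+ * contents are placeholders; the snippet is compiled out.
+ */
+#if 0
+static void snow3g_f8_1_buffer_usage_example(void)
+{
+        const uint8_t key[SNOW3G_KEY_LEN_IN_BYTES] = {0}; /* placeholder key */
+        const uint8_t iv[SNOW3G_IV_LEN_IN_BYTES] = {0};   /* placeholder IV */
+        uint8_t in[32] = {0};
+        uint8_t out[32];
+        snow3g_key_schedule_t ks;
+
+        if (SNOW3G_INIT_KEY_SCHED(key, &ks) != 0)
+                return;
+
+        /* encryption and decryption are the same keystream XOR operation */
+        SNOW3G_F8_1_BUFFER(&ks, iv, in, out, (uint32_t) sizeof(in));
+}
+#endif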
+
+/*---------------------------------------------------------
+ * @description
+ * Snow3G F8 bit 1 buffer:
+ * Single buffer enc/dec with IV and precomputed key schedule
+ *---------------------------------------------------------*/
+void SNOW3G_F8_1_BUFFER_BIT(const snow3g_key_schedule_t *pHandle,
+ const void *pIV,
+ const void *pBufferIn,
+ void *pBufferOut,
+ const uint32_t lengthInBits,
+ const uint32_t offsetInBits)
+{
+#ifdef SAFE_PARAM
+ if ((pHandle == NULL) || (pIV == NULL) ||
+ (pBufferIn == NULL) || (pBufferOut == NULL) ||
+ (lengthInBits == 0))
+ return;
+#endif
+
+ snow3gKeyState1_t ctx;
+ uint32_t KS4; /* 4 bytes of keystream */
+
+ /* Initialize the schedule from the IV */
+ snow3gStateInitialize_1(&ctx, pHandle, pIV);
+
+ /* Clock FSM and LFSR once, ignore the keystream */
+ snow3g_keystream_1_4(&ctx, &KS4);
+
+ f8_snow3g_bit(&ctx, pBufferIn, pBufferOut, lengthInBits, offsetInBits);
+
+#ifdef SAFE_DATA
+ CLEAR_VAR(&KS4, sizeof(KS4));
+ CLEAR_MEM(&ctx, sizeof(ctx));
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ * Snow3G F8 2 buffer:
+ * Two buffers enc/dec with the same key schedule.
+ * The 2 IVs are independent and are passed as separate parameters.
+ * Each buffer and data length are separate.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_2_BUFFER(const snow3g_key_schedule_t *pHandle,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pBufIn1,
+ void *pBufOut1,
+ const uint32_t lenInBytes1,
+ const void *pBufIn2,
+ void *pBufOut2,
+ const uint32_t lenInBytes2)
+{
+#ifdef SAFE_PARAM
+ if ((pHandle == NULL) || (pIV1 == NULL) || (pIV2 == NULL) ||
+ (pBufIn1 == NULL) || (pBufOut1 == NULL) ||
+ (pBufIn2 == NULL) || (pBufOut2 == NULL) ||
+ (lenInBytes1 == 0) || (lenInBytes1 > SNOW3G_MAX_BYTELEN) ||
+ (lenInBytes2 == 0) || (lenInBytes2 > SNOW3G_MAX_BYTELEN))
+ return;
+#endif
+
+ snow3gKeyState1_t ctx1, ctx2;
+ uint32_t KS4; /* 4 bytes of keystream */
+
+ /* Initialize the schedule from the IV */
+ snow3gStateInitialize_1(&ctx1, pHandle, pIV1);
+
+ /* Clock FSM and LFSR once, ignore the keystream */
+ snow3g_keystream_1_4(&ctx1, &KS4);
+
+ /* data processing for packet 1 */
+ f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1);
+
+ /* Initialize the schedule from the IV */
+ snow3gStateInitialize_1(&ctx2, pHandle, pIV2);
+
+ /* Clock FSM and LFSR once, ignore the keystream */
+ snow3g_keystream_1_4(&ctx2, &KS4);
+
+ /* data processing for packet 2 */
+ f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2);
+
+#ifdef SAFE_DATA
+ CLEAR_VAR(&KS4, sizeof(KS4));
+ CLEAR_MEM(&ctx1, sizeof(ctx1));
+ CLEAR_MEM(&ctx2, sizeof(ctx2));
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+}
+
+/*---------------------------------------------------------
+ * @description
+ * Snow3G F8 4 buffer:
+ * Four packets enc/dec with the same key schedule.
+ * The 4 IVs are independent and are passed as separate parameters.
+ * Each buffer and data length are separate.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pIV3,
+ const void *pIV4,
+ const void *pBufferIn1,
+ void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2,
+ void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3,
+ void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4,
+ void *pBufferOut4,
+ const uint32_t lengthInBytes4)
+{
+#ifdef SAFE_PARAM
+ if ((pHandle == NULL) ||
+ (pIV1 == NULL) || (pIV2 == NULL) ||
+ (pIV3 == NULL) || (pIV4 == NULL) ||
+ (pBufferIn1 == NULL) || (pBufferOut1 == NULL) ||
+ (pBufferIn2 == NULL) || (pBufferOut2 == NULL) ||
+ (pBufferIn3 == NULL) || (pBufferOut3 == NULL) ||
+ (pBufferIn4 == NULL) || (pBufferOut4 == NULL) ||
+ (lengthInBytes1 == 0) || (lengthInBytes1 > SNOW3G_MAX_BYTELEN) ||
+ (lengthInBytes2 == 0) || (lengthInBytes2 > SNOW3G_MAX_BYTELEN) ||
+ (lengthInBytes3 == 0) || (lengthInBytes3 > SNOW3G_MAX_BYTELEN) ||
+ (lengthInBytes4 == 0) || (lengthInBytes4 > SNOW3G_MAX_BYTELEN))
+ return;
+#endif
+
+ snow3gKeyState4_t ctx;
+ __m128i H, L; /* 4 bytes of keystream */
+ uint32_t lenInBytes1 = lengthInBytes1;
+ uint32_t lenInBytes2 = lengthInBytes2;
+ uint32_t lenInBytes3 = lengthInBytes3;
+ uint32_t lenInBytes4 = lengthInBytes4;
+ uint32_t bytes1 =
+ (lenInBytes1 < lenInBytes2 ? lenInBytes1
+ : lenInBytes2); /* number of bytes */
+ uint32_t bytes2 =
+ (lenInBytes3 < lenInBytes4 ? lenInBytes3
+ : lenInBytes4); /* number of bytes */
+ /* min num of bytes */
+ uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2;
+ uint32_t qwords = bytes / SNOW3G_8_BYTES;
+ uint8_t *pBufOut1 = pBufferOut1;
+ uint8_t *pBufOut2 = pBufferOut2;
+ uint8_t *pBufOut3 = pBufferOut3;
+ uint8_t *pBufOut4 = pBufferOut4;
+ const uint8_t *pBufIn1 = pBufferIn1;
+ const uint8_t *pBufIn2 = pBufferIn2;
+ const uint8_t *pBufIn3 = pBufferIn3;
+ const uint8_t *pBufIn4 = pBufferIn4;
+
+ bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */
+
+ /* Initialize the schedule from the IV */
+ snow3gStateInitialize_4(&ctx, pHandle, pIV1, pIV2, pIV3, pIV4);
+
+ /* Clock FSM and LFSR once, ignore the keystream */
+ snow3g_keystream_4_4(&ctx, &L);
+
+ lenInBytes1 -= bytes;
+ lenInBytes2 -= bytes;
+ lenInBytes3 -= bytes;
+ lenInBytes4 -= bytes;
+
+        /* generates 8 bytes of keystream at a time on all 4 streams */
+ while (qwords--) {
+ snow3g_keystream_4_8(&ctx, &L, &H);
+ pBufIn1 = xor_keystrm_rev(pBufOut1, pBufIn1,
+ _mm_extract_epi64(L, 0));
+ pBufIn2 = xor_keystrm_rev(pBufOut2, pBufIn2,
+ _mm_extract_epi64(L, 1));
+ pBufIn3 = xor_keystrm_rev(pBufOut3, pBufIn3,
+ _mm_extract_epi64(H, 0));
+ pBufIn4 = xor_keystrm_rev(pBufOut4, pBufIn4,
+ _mm_extract_epi64(H, 1));
+
+ pBufOut1 += SNOW3G_8_BYTES;
+ pBufOut2 += SNOW3G_8_BYTES;
+ pBufOut3 += SNOW3G_8_BYTES;
+ pBufOut4 += SNOW3G_8_BYTES;
+ }
+
+        /* process the remainder of each buffer
+         * - extract the LFSR and FSM structures
+         * - continue processing 1 buffer at a time
+ */
+ if (lenInBytes1) {
+ snow3gKeyState1_t ctx1;
+
+ snow3gStateConvert_4(&ctx, &ctx1, 0);
+ f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1);
+ }
+
+ if (lenInBytes2) {
+ snow3gKeyState1_t ctx2;
+
+ snow3gStateConvert_4(&ctx, &ctx2, 1);
+ f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2);
+ }
+
+ if (lenInBytes3) {
+ snow3gKeyState1_t ctx3;
+
+ snow3gStateConvert_4(&ctx, &ctx3, 2);
+ f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3);
+ }
+
+ if (lenInBytes4) {
+ snow3gKeyState1_t ctx4;
+
+ snow3gStateConvert_4(&ctx, &ctx4, 3);
+ f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4);
+ }
+
+#ifdef SAFE_DATA
+ H = _mm_setzero_si128();
+ L = _mm_setzero_si128();
+ CLEAR_MEM(&ctx, sizeof(ctx));
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+}
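+/*
+ * Note (illustrative comment, not part of the original sources): the 4-buffer
+ * path above runs all lanes in SIMD only for the rounded-down common minimum
+ * length. For example, lengths {24, 40, 16, 33} give a minimum of 16 bytes
+ * (2 qwords), so 16 bytes of every buffer are processed together; the
+ * remaining 8, 24, 0 and 17 bytes are then finished one buffer at a time
+ * after extracting each lane's state with snow3gStateConvert_4().
+ */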
+
+#ifdef AVX2
+/*---------------------------------------------------------
+ * @description
+ * Snow3G 8 buffer ks 8 multi:
+ * Processes 8 packets 8 bytes at a time.
+ * Uses individual key schedule for each buffer.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_8_multi(uint32_t bytes,
+ const snow3g_key_schedule_t * const pKey[],
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[], const uint32_t *lengthInBytes)
+{
+ uint32_t qwords = bytes / SNOW3G_8_BYTES;
+ __m256i H, L; /* 8 bytes of keystream */
+ snow3gKeyState8_t ctx;
+ int i;
+ const uint8_t *tBufferIn[8];
+ uint8_t *tBufferOut[8];
+ uint32_t tLenInBytes[8];
+
+ bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */
+
+ for (i = 0; i < 8; i++) {
+ tBufferIn[i] = pBufferIn[i];
+ tBufferOut[i] = pBufferOut[i];
+ tLenInBytes[i] = lengthInBytes[i];
+ }
+
+ /* Initialize the schedule from the IV */
+ snow3gStateInitialize_8_multiKey(&ctx, pKey, IV);
+
+ /* Clock FSM and LFSR once, ignore the keystream */
+ snow3g_keystream_8_4(&ctx, &L);
+
+ for (i = 0; i < 8; i++)
+ tLenInBytes[i] -= bytes;
+
+        /* generates 8 bytes of keystream at a time on all 8 streams */
+ for (i = qwords; i != 0; i--) {
+ int j;
+
+ snow3g_keystream_8_8(&ctx, &L, &H);
+
+ tBufferIn[0] = xor_keystrm_rev(tBufferOut[0], tBufferIn[0],
+ _mm256_extract_epi64(L, 0));
+ tBufferIn[1] = xor_keystrm_rev(tBufferOut[1], tBufferIn[1],
+ _mm256_extract_epi64(L, 1));
+ tBufferIn[2] = xor_keystrm_rev(tBufferOut[2], tBufferIn[2],
+ _mm256_extract_epi64(H, 0));
+ tBufferIn[3] = xor_keystrm_rev(tBufferOut[3], tBufferIn[3],
+ _mm256_extract_epi64(H, 1));
+ tBufferIn[4] = xor_keystrm_rev(tBufferOut[4], tBufferIn[4],
+ _mm256_extract_epi64(L, 2));
+ tBufferIn[5] = xor_keystrm_rev(tBufferOut[5], tBufferIn[5],
+ _mm256_extract_epi64(L, 3));
+ tBufferIn[6] = xor_keystrm_rev(tBufferOut[6], tBufferIn[6],
+ _mm256_extract_epi64(H, 2));
+ tBufferIn[7] = xor_keystrm_rev(tBufferOut[7], tBufferIn[7],
+ _mm256_extract_epi64(H, 3));
+
+ for (j = 0; j < 8; j++)
+ tBufferOut[j] += SNOW3G_8_BYTES;
+ }
+
+        /* process the remainder of each buffer
+         * - extract the LFSR and FSM structures
+         * - continue processing 1 buffer at a time
+ */
+ if (tLenInBytes[0]) {
+ snow3gKeyState1_t ctx1;
+
+ snow3gStateConvert_8(&ctx, &ctx1, 0);
+ f8_snow3g(&ctx1, tBufferIn[0], tBufferOut[0], tLenInBytes[0]);
+ }
+ if (tLenInBytes[1]) {
+ snow3gKeyState1_t ctx2;
+
+ snow3gStateConvert_8(&ctx, &ctx2, 1);
+ f8_snow3g(&ctx2, tBufferIn[1], tBufferOut[1], tLenInBytes[1]);
+ }
+ if (tLenInBytes[2]) {
+ snow3gKeyState1_t ctx3;
+
+ snow3gStateConvert_8(&ctx, &ctx3, 2);
+ f8_snow3g(&ctx3, tBufferIn[2], tBufferOut[2], tLenInBytes[2]);
+ }
+ if (tLenInBytes[3]) {
+ snow3gKeyState1_t ctx4;
+
+ snow3gStateConvert_8(&ctx, &ctx4, 3);
+ f8_snow3g(&ctx4, tBufferIn[3], tBufferOut[3], tLenInBytes[3]);
+ }
+ if (tLenInBytes[4]) {
+ snow3gKeyState1_t ctx5;
+
+ snow3gStateConvert_8(&ctx, &ctx5, 4);
+ f8_snow3g(&ctx5, tBufferIn[4], tBufferOut[4], tLenInBytes[4]);
+ }
+ if (tLenInBytes[5]) {
+ snow3gKeyState1_t ctx6;
+
+ snow3gStateConvert_8(&ctx, &ctx6, 5);
+ f8_snow3g(&ctx6, tBufferIn[5], tBufferOut[5], tLenInBytes[5]);
+ }
+ if (tLenInBytes[6]) {
+ snow3gKeyState1_t ctx7;
+
+ snow3gStateConvert_8(&ctx, &ctx7, 6);
+ f8_snow3g(&ctx7, tBufferIn[6], tBufferOut[6], tLenInBytes[6]);
+ }
+ if (tLenInBytes[7]) {
+ snow3gKeyState1_t ctx8;
+
+ snow3gStateConvert_8(&ctx, &ctx8, 7);
+ f8_snow3g(&ctx8, tBufferIn[7], tBufferOut[7], tLenInBytes[7]);
+ }
+
+#ifdef SAFE_DATA
+ H = _mm256_setzero_si256();
+ L = _mm256_setzero_si256();
+ CLEAR_MEM(&ctx, sizeof(ctx));
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ * Snow3G 8 buffer ks 32 multi:
+ * Processes 8 packets 32 bytes at a time.
+ * Uses individual key schedule for each buffer.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_32_multi(uint32_t bytes,
+ const snow3g_key_schedule_t * const pKey[],
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[], const uint32_t *lengthInBytes)
+{
+
+ snow3gKeyState8_t ctx;
+ uint32_t i;
+
+ const uint8_t *tBufferIn[8];
+ uint8_t *tBufferOut[8];
+ uint32_t tLenInBytes[8];
+
+ for (i = 0; i < 8; i++) {
+ tBufferIn[i] = pBufferIn[i];
+ tBufferOut[i] = pBufferOut[i];
+ tLenInBytes[i] = lengthInBytes[i];
+ }
+
+ uint32_t blocks = bytes / 32;
+
+ bytes = blocks * 32; /* rounded down minimum length */
+
+ /* Initialize the schedule from the IV */
+ snow3gStateInitialize_8_multiKey(&ctx, pKey, IV);
+
+ /* Clock FSM and LFSR once, ignore the keystream */
+ __m256i ks[8];
+
+ snow3g_keystream_8_4(&ctx, ks);
+
+ for (i = 0; i < 8; i++)
+ tLenInBytes[i] -= bytes;
+
+ __m256i in[8];
+
+        /* generates 32 bytes of keystream at a time on all 8 streams */
+ for (i = 0; i < blocks; i++) {
+ int j;
+
+ in[0] = _mm256_loadu_si256((const __m256i *)tBufferIn[0]);
+ in[1] = _mm256_loadu_si256((const __m256i *)tBufferIn[1]);
+ in[2] = _mm256_loadu_si256((const __m256i *)tBufferIn[2]);
+ in[3] = _mm256_loadu_si256((const __m256i *)tBufferIn[3]);
+ in[4] = _mm256_loadu_si256((const __m256i *)tBufferIn[4]);
+ in[5] = _mm256_loadu_si256((const __m256i *)tBufferIn[5]);
+ in[6] = _mm256_loadu_si256((const __m256i *)tBufferIn[6]);
+ in[7] = _mm256_loadu_si256((const __m256i *)tBufferIn[7]);
+
+ snow3g_keystream_8_32(&ctx, ks);
+
+ _mm256_storeu_si256((__m256i *)tBufferOut[0],
+ _mm256_xor_si256(in[0], ks[0]));
+ _mm256_storeu_si256((__m256i *)tBufferOut[1],
+ _mm256_xor_si256(in[1], ks[1]));
+ _mm256_storeu_si256((__m256i *)tBufferOut[2],
+ _mm256_xor_si256(in[2], ks[2]));
+ _mm256_storeu_si256((__m256i *)tBufferOut[3],
+ _mm256_xor_si256(in[3], ks[3]));
+ _mm256_storeu_si256((__m256i *)tBufferOut[4],
+ _mm256_xor_si256(in[4], ks[4]));
+ _mm256_storeu_si256((__m256i *)tBufferOut[5],
+ _mm256_xor_si256(in[5], ks[5]));
+ _mm256_storeu_si256((__m256i *)tBufferOut[6],
+ _mm256_xor_si256(in[6], ks[6]));
+ _mm256_storeu_si256((__m256i *)tBufferOut[7],
+ _mm256_xor_si256(in[7], ks[7]));
+
+ for (j = 0; j < 8; j++) {
+                        tBufferIn[j] += 32;
+                        tBufferOut[j] += 32;
+ }
+ }
+
+        /* process the remainder of each buffer
+         * - extract the LFSR and FSM structures
+         * - continue processing 1 buffer at a time
+ */
+ if (tLenInBytes[0]) {
+ snow3gKeyState1_t ctx1;
+
+ snow3gStateConvert_8(&ctx, &ctx1, 0);
+ f8_snow3g(&ctx1, tBufferIn[0], tBufferOut[0], tLenInBytes[0]);
+ }
+ if (tLenInBytes[1]) {
+ snow3gKeyState1_t ctx2;
+
+ snow3gStateConvert_8(&ctx, &ctx2, 1);
+ f8_snow3g(&ctx2, tBufferIn[1], tBufferOut[1], tLenInBytes[1]);
+ }
+ if (tLenInBytes[2]) {
+ snow3gKeyState1_t ctx3;
+
+ snow3gStateConvert_8(&ctx, &ctx3, 2);
+ f8_snow3g(&ctx3, tBufferIn[2], tBufferOut[2], tLenInBytes[2]);
+ }
+ if (tLenInBytes[3]) {
+ snow3gKeyState1_t ctx4;
+
+ snow3gStateConvert_8(&ctx, &ctx4, 3);
+ f8_snow3g(&ctx4, tBufferIn[3], tBufferOut[3], tLenInBytes[3]);
+ }
+ if (tLenInBytes[4]) {
+ snow3gKeyState1_t ctx5;
+
+ snow3gStateConvert_8(&ctx, &ctx5, 4);
+ f8_snow3g(&ctx5, tBufferIn[4], tBufferOut[4], tLenInBytes[4]);
+ }
+ if (tLenInBytes[5]) {
+ snow3gKeyState1_t ctx6;
+
+ snow3gStateConvert_8(&ctx, &ctx6, 5);
+ f8_snow3g(&ctx6, tBufferIn[5], tBufferOut[5], tLenInBytes[5]);
+ }
+ if (tLenInBytes[6]) {
+ snow3gKeyState1_t ctx7;
+
+ snow3gStateConvert_8(&ctx, &ctx7, 6);
+ f8_snow3g(&ctx7, tBufferIn[6], tBufferOut[6], tLenInBytes[6]);
+ }
+ if (tLenInBytes[7]) {
+ snow3gKeyState1_t ctx8;
+
+ snow3gStateConvert_8(&ctx, &ctx8, 7);
+ f8_snow3g(&ctx8, tBufferIn[7], tBufferOut[7], tLenInBytes[7]);
+ }
+
+#ifdef SAFE_DATA
+ CLEAR_MEM(&ctx, sizeof(ctx));
+ CLEAR_MEM(&ks, sizeof(ks));
+ CLEAR_MEM(&in, sizeof(in));
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ * Snow3G 8 buffer ks 8:
+ * Processes 8 packets 8 bytes at a time.
+ * Uses same key schedule for each buffer.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_8(uint32_t bytes,
+ const snow3g_key_schedule_t *pHandle,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pIV3,
+ const void *pIV4,
+ const void *pIV5,
+ const void *pIV6,
+ const void *pIV7,
+ const void *pIV8,
+ const void *pBufferIn1, void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3, void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4, void *pBufferOut4,
+ const uint32_t lengthInBytes4,
+ const void *pBufferIn5, void *pBufferOut5,
+ const uint32_t lengthInBytes5,
+ const void *pBufferIn6, void *pBufferOut6,
+ const uint32_t lengthInBytes6,
+ const void *pBufferIn7, void *pBufferOut7,
+ const uint32_t lengthInBytes7,
+ const void *pBufferIn8, void *pBufferOut8,
+ const uint32_t lengthInBytes8)
+{
+
+ uint32_t qwords = bytes / SNOW3G_8_BYTES;
+ __m256i H, L; /* 8 bytes of keystream */
+ snow3gKeyState8_t ctx;
+ int i;
+ uint32_t lenInBytes1 = lengthInBytes1;
+ uint32_t lenInBytes2 = lengthInBytes2;
+ uint32_t lenInBytes3 = lengthInBytes3;
+ uint32_t lenInBytes4 = lengthInBytes4;
+ uint32_t lenInBytes5 = lengthInBytes5;
+ uint32_t lenInBytes6 = lengthInBytes6;
+ uint32_t lenInBytes7 = lengthInBytes7;
+ uint32_t lenInBytes8 = lengthInBytes8;
+ uint8_t *pBufOut1 = pBufferOut1;
+ uint8_t *pBufOut2 = pBufferOut2;
+ uint8_t *pBufOut3 = pBufferOut3;
+ uint8_t *pBufOut4 = pBufferOut4;
+ uint8_t *pBufOut5 = pBufferOut5;
+ uint8_t *pBufOut6 = pBufferOut6;
+ uint8_t *pBufOut7 = pBufferOut7;
+ uint8_t *pBufOut8 = pBufferOut8;
+ const uint8_t *pBufIn1 = pBufferIn1;
+ const uint8_t *pBufIn2 = pBufferIn2;
+ const uint8_t *pBufIn3 = pBufferIn3;
+ const uint8_t *pBufIn4 = pBufferIn4;
+ const uint8_t *pBufIn5 = pBufferIn5;
+ const uint8_t *pBufIn6 = pBufferIn6;
+ const uint8_t *pBufIn7 = pBufferIn7;
+ const uint8_t *pBufIn8 = pBufferIn8;
+
+ bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */
+
+ /* Initialize the schedule from the IV */
+ snow3gStateInitialize_8(&ctx, pHandle, pIV1, pIV2, pIV3,
+ pIV4, pIV5, pIV6, pIV7, pIV8);
+
+ /* Clock FSM and LFSR once, ignore the keystream */
+ snow3g_keystream_8_4(&ctx, &L);
+
+ lenInBytes1 -= bytes;
+ lenInBytes2 -= bytes;
+ lenInBytes3 -= bytes;
+ lenInBytes4 -= bytes;
+ lenInBytes5 -= bytes;
+ lenInBytes6 -= bytes;
+ lenInBytes7 -= bytes;
+ lenInBytes8 -= bytes;
+
+        /* generates 8 bytes of keystream at a time on all 8 streams */
+ for (i = qwords; i != 0; i--) {
+ snow3g_keystream_8_8(&ctx, &L, &H);
+
+ pBufIn1 = xor_keystrm_rev(pBufOut1, pBufIn1,
+ _mm256_extract_epi64(L, 0));
+ pBufIn2 = xor_keystrm_rev(pBufOut2, pBufIn2,
+ _mm256_extract_epi64(L, 1));
+ pBufIn3 = xor_keystrm_rev(pBufOut3, pBufIn3,
+ _mm256_extract_epi64(H, 0));
+ pBufIn4 = xor_keystrm_rev(pBufOut4, pBufIn4,
+ _mm256_extract_epi64(H, 1));
+ pBufIn5 = xor_keystrm_rev(pBufOut5, pBufIn5,
+ _mm256_extract_epi64(L, 2));
+ pBufIn6 = xor_keystrm_rev(pBufOut6, pBufIn6,
+ _mm256_extract_epi64(L, 3));
+ pBufIn7 = xor_keystrm_rev(pBufOut7, pBufIn7,
+ _mm256_extract_epi64(H, 2));
+ pBufIn8 = xor_keystrm_rev(pBufOut8, pBufIn8,
+ _mm256_extract_epi64(H, 3));
+
+ pBufOut1 += SNOW3G_8_BYTES;
+ pBufOut2 += SNOW3G_8_BYTES;
+ pBufOut3 += SNOW3G_8_BYTES;
+ pBufOut4 += SNOW3G_8_BYTES;
+ pBufOut5 += SNOW3G_8_BYTES;
+ pBufOut6 += SNOW3G_8_BYTES;
+ pBufOut7 += SNOW3G_8_BYTES;
+ pBufOut8 += SNOW3G_8_BYTES;
+ }
+
+        /* process the remainder of each buffer
+         * - extract the LFSR and FSM structures
+         * - continue processing 1 buffer at a time
+ */
+ if (lenInBytes1) {
+ snow3gKeyState1_t ctx1;
+
+ snow3gStateConvert_8(&ctx, &ctx1, 0);
+ f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1);
+ }
+
+ if (lenInBytes2) {
+ snow3gKeyState1_t ctx2;
+
+ snow3gStateConvert_8(&ctx, &ctx2, 1);
+ f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2);
+ }
+
+ if (lenInBytes3) {
+ snow3gKeyState1_t ctx3;
+
+ snow3gStateConvert_8(&ctx, &ctx3, 2);
+ f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3);
+ }
+
+ if (lenInBytes4) {
+ snow3gKeyState1_t ctx4;
+
+ snow3gStateConvert_8(&ctx, &ctx4, 3);
+ f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4);
+ }
+
+ if (lenInBytes5) {
+ snow3gKeyState1_t ctx5;
+
+ snow3gStateConvert_8(&ctx, &ctx5, 4);
+ f8_snow3g(&ctx5, pBufIn5, pBufOut5, lenInBytes5);
+ }
+
+ if (lenInBytes6) {
+ snow3gKeyState1_t ctx6;
+
+ snow3gStateConvert_8(&ctx, &ctx6, 5);
+ f8_snow3g(&ctx6, pBufIn6, pBufOut6, lenInBytes6);
+ }
+
+ if (lenInBytes7) {
+ snow3gKeyState1_t ctx7;
+
+ snow3gStateConvert_8(&ctx, &ctx7, 6);
+ f8_snow3g(&ctx7, pBufIn7, pBufOut7, lenInBytes7);
+ }
+
+ if (lenInBytes8) {
+ snow3gKeyState1_t ctx8;
+
+ snow3gStateConvert_8(&ctx, &ctx8, 7);
+ f8_snow3g(&ctx8, pBufIn8, pBufOut8, lenInBytes8);
+ }
+
+#ifdef SAFE_DATA
+ H = _mm256_setzero_si256();
+ L = _mm256_setzero_si256();
+ CLEAR_MEM(&ctx, sizeof(ctx));
+#endif /* SAFE_DATA */
+}
+
+/*---------------------------------------------------------
+ * @description
+ * Snow3G 8 buffer ks 32:
+ * Processes 8 packets 32 bytes at a time.
+ * Uses same key schedule for each buffer.
+ *---------------------------------------------------------*/
+static inline void
+snow3g_8_buffer_ks_32(uint32_t bytes,
+ const snow3g_key_schedule_t *pKey,
+ const void *pIV1, const void *pIV2,
+ const void *pIV3, const void *pIV4,
+ const void *pIV5, const void *pIV6,
+ const void *pIV7, const void *pIV8,
+ const void *pBufferIn1, void *pBufferOut1,
+ const uint32_t lengthInBytes1,
+ const void *pBufferIn2, void *pBufferOut2,
+ const uint32_t lengthInBytes2,
+ const void *pBufferIn3, void *pBufferOut3,
+ const uint32_t lengthInBytes3,
+ const void *pBufferIn4, void *pBufferOut4,
+ const uint32_t lengthInBytes4,
+ const void *pBufferIn5, void *pBufferOut5,
+ const uint32_t lengthInBytes5,
+ const void *pBufferIn6, void *pBufferOut6,
+ const uint32_t lengthInBytes6,
+ const void *pBufferIn7, void *pBufferOut7,
+ const uint32_t lengthInBytes7,
+ const void *pBufferIn8, void *pBufferOut8,
+ const uint32_t lengthInBytes8)
+{
+ snow3gKeyState8_t ctx;
+ uint32_t i;
+ uint32_t lenInBytes1 = lengthInBytes1;
+ uint32_t lenInBytes2 = lengthInBytes2;
+ uint32_t lenInBytes3 = lengthInBytes3;
+ uint32_t lenInBytes4 = lengthInBytes4;
+ uint32_t lenInBytes5 = lengthInBytes5;
+ uint32_t lenInBytes6 = lengthInBytes6;
+ uint32_t lenInBytes7 = lengthInBytes7;
+ uint32_t lenInBytes8 = lengthInBytes8;
+ uint8_t *pBufOut1 = pBufferOut1;
+ uint8_t *pBufOut2 = pBufferOut2;
+ uint8_t *pBufOut3 = pBufferOut3;
+ uint8_t *pBufOut4 = pBufferOut4;
+ uint8_t *pBufOut5 = pBufferOut5;
+ uint8_t *pBufOut6 = pBufferOut6;
+ uint8_t *pBufOut7 = pBufferOut7;
+ uint8_t *pBufOut8 = pBufferOut8;
+ const uint8_t *pBufIn1 = pBufferIn1;
+ const uint8_t *pBufIn2 = pBufferIn2;
+ const uint8_t *pBufIn3 = pBufferIn3;
+ const uint8_t *pBufIn4 = pBufferIn4;
+ const uint8_t *pBufIn5 = pBufferIn5;
+ const uint8_t *pBufIn6 = pBufferIn6;
+ const uint8_t *pBufIn7 = pBufferIn7;
+ const uint8_t *pBufIn8 = pBufferIn8;
+
+ uint32_t blocks = bytes / 32;
+
+ bytes = blocks * 32; /* rounded down minimum length */
+
+ /* Initialize the schedule from the IV */
+ snow3gStateInitialize_8(&ctx, pKey, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6,
+ pIV7, pIV8);
+
+ /* Clock FSM and LFSR once, ignore the keystream */
+ __m256i ks[8];
+
+ snow3g_keystream_8_4(&ctx, ks);
+
+ lenInBytes1 -= bytes;
+ lenInBytes2 -= bytes;
+ lenInBytes3 -= bytes;
+ lenInBytes4 -= bytes;
+ lenInBytes5 -= bytes;
+ lenInBytes6 -= bytes;
+ lenInBytes7 -= bytes;
+ lenInBytes8 -= bytes;
+
+ __m256i in[8];
+
+        /* generates 32 bytes of keystream at a time on all 8 streams */
+ for (i = 0; i < blocks; i++) {
+
+ in[0] = _mm256_loadu_si256((const __m256i *)pBufIn1);
+ in[1] = _mm256_loadu_si256((const __m256i *)pBufIn2);
+ in[2] = _mm256_loadu_si256((const __m256i *)pBufIn3);
+ in[3] = _mm256_loadu_si256((const __m256i *)pBufIn4);
+ in[4] = _mm256_loadu_si256((const __m256i *)pBufIn5);
+ in[5] = _mm256_loadu_si256((const __m256i *)pBufIn6);
+ in[6] = _mm256_loadu_si256((const __m256i *)pBufIn7);
+ in[7] = _mm256_loadu_si256((const __m256i *)pBufIn8);
+
+ snow3g_keystream_8_32(&ctx, ks);
+
+ _mm256_storeu_si256((__m256i *)pBufOut1,
+ _mm256_xor_si256(in[0], ks[0]));
+ _mm256_storeu_si256((__m256i *)pBufOut2,
+ _mm256_xor_si256(in[1], ks[1]));
+ _mm256_storeu_si256((__m256i *)pBufOut3,
+ _mm256_xor_si256(in[2], ks[2]));
+ _mm256_storeu_si256((__m256i *)pBufOut4,
+ _mm256_xor_si256(in[3], ks[3]));
+ _mm256_storeu_si256((__m256i *)pBufOut5,
+ _mm256_xor_si256(in[4], ks[4]));
+ _mm256_storeu_si256((__m256i *)pBufOut6,
+ _mm256_xor_si256(in[5], ks[5]));
+ _mm256_storeu_si256((__m256i *)pBufOut7,
+ _mm256_xor_si256(in[6], ks[6]));
+ _mm256_storeu_si256((__m256i *)pBufOut8,
+ _mm256_xor_si256(in[7], ks[7]));
+
+ pBufIn1 += 32;
+ pBufIn2 += 32;
+ pBufIn3 += 32;
+ pBufIn4 += 32;
+ pBufIn5 += 32;
+ pBufIn6 += 32;
+ pBufIn7 += 32;
+ pBufIn8 += 32;
+
+ pBufOut1 += 32;
+ pBufOut2 += 32;
+ pBufOut3 += 32;
+ pBufOut4 += 32;
+ pBufOut5 += 32;
+ pBufOut6 += 32;
+ pBufOut7 += 32;
+ pBufOut8 += 32;
+ }
+
+        /* process the remainder of each buffer
+         * - extract the LFSR and FSM structures
+         * - continue processing 1 buffer at a time
+ */
+ if (lenInBytes1) {
+ snow3gKeyState1_t ctx1;
+
+ snow3gStateConvert_8(&ctx, &ctx1, 0);
+ f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1);
+ }
+
+ if (lenInBytes2) {
+ snow3gKeyState1_t ctx2;
+
+ snow3gStateConvert_8(&ctx, &ctx2, 1);
+ f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2);
+ }
+
+ if (lenInBytes3) {
+ snow3gKeyState1_t ctx3;
+
+ snow3gStateConvert_8(&ctx, &ctx3, 2);
+ f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3);
+ }
+
+ if (lenInBytes4) {
+ snow3gKeyState1_t ctx4;
+
+ snow3gStateConvert_8(&ctx, &ctx4, 3);
+ f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4);
+ }
+
+ if (lenInBytes5) {
+ snow3gKeyState1_t ctx5;
+
+ snow3gStateConvert_8(&ctx, &ctx5, 4);
+ f8_snow3g(&ctx5, pBufIn5, pBufOut5, lenInBytes5);
+ }
+
+ if (lenInBytes6) {
+ snow3gKeyState1_t ctx6;
+
+ snow3gStateConvert_8(&ctx, &ctx6, 5);
+ f8_snow3g(&ctx6, pBufIn6, pBufOut6, lenInBytes6);
+ }
+
+ if (lenInBytes7) {
+ snow3gKeyState1_t ctx7;
+
+ snow3gStateConvert_8(&ctx, &ctx7, 6);
+ f8_snow3g(&ctx7, pBufIn7, pBufOut7, lenInBytes7);
+ }
+
+ if (lenInBytes8) {
+ snow3gKeyState1_t ctx8;
+
+ snow3gStateConvert_8(&ctx, &ctx8, 7);
+ f8_snow3g(&ctx8, pBufIn8, pBufOut8, lenInBytes8);
+ }
+
+#ifdef SAFE_DATA
+ CLEAR_MEM(&ctx, sizeof(ctx));
+ CLEAR_MEM(&ks, sizeof(ks));
+ CLEAR_MEM(&in, sizeof(in));
+#endif /* SAFE_DATA */
+}
+#endif /* AVX2 */
+
+/*---------------------------------------------------------
+ * @description
+ * Snow3G F8 8 buffer, multi-key:
+ * Eight packets enc/dec with eight respective key schedules.
+ * The 8 IVs are independent and are passed as an array of pointers.
+ * Each buffer and data length are separate.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[],
+ const void * const IV[],
+ const void * const BufferIn[],
+ void *BufferOut[],
+ const uint32_t lengthInBytes[])
+{
+ int i;
+
+#ifdef SAFE_PARAM
+ if ((pKey == NULL) || (IV == NULL) || (BufferIn == NULL) ||
+ (BufferOut == NULL) || (lengthInBytes == NULL))
+ return;
+
+ for (i = 0; i < 8; i++)
+ if ((pKey[i] == NULL) || (IV[i] == NULL) ||
+ (BufferIn[i] == NULL) || (BufferOut[i] == NULL) ||
+ (lengthInBytes[i] == 0) ||
+ (lengthInBytes[i] > SNOW3G_MAX_BYTELEN))
+ return;
+#endif
+
+#ifndef AVX2
+        /* no non-AVX2 8-buffer path available: fall back to 1-buffer processing */
+ for (i = 0; i < 8; i++)
+ SNOW3G_F8_1_BUFFER(pKey[i], IV[i], BufferIn[i], BufferOut[i],
+ lengthInBytes[i]);
+#else
+ uint32_t bytes = lengthInBytes[0];
+
+        /* find min byte length */
+ for (i = 1; i < 8; i++)
+ if (lengthInBytes[i] < bytes)
+ bytes = lengthInBytes[i];
+
+ if (bytes % 32) {
+ snow3g_8_buffer_ks_8_multi(bytes, pKey, IV, BufferIn, BufferOut,
+ lengthInBytes);
+ } else {
+ snow3g_8_buffer_ks_32_multi(bytes, pKey, IV, BufferIn,
+ BufferOut, lengthInBytes);
+ }
+#ifdef SAFE_DATA
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#endif /* AVX2 */
+}
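+/*
+ * Note (illustrative comment, not part of the original sources): the AVX2
+ * dispatch above keys off the minimum buffer length across the 8 packets.
+ * If that minimum is a multiple of 32 (e.g. lengths {64, 96, 128, ...} with
+ * minimum 64) the 32-bytes-per-round path is used, otherwise (e.g. minimum
+ * 40) the 8-bytes-per-round path is used; per-buffer remainders are always
+ * finished with the single-buffer code.
+ */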
+
+/*---------------------------------------------------------
+ * @description
+ * Snow3G F8 8 buffer:
+ * Eight packets enc/dec with the same key schedule.
+ * The 8 IVs are independent and are passed as separate parameters.
+ * Each buffer and data length are separate.
+ * Uses AVX instructions.
+ *---------------------------------------------------------*/
+void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle,
+ const void *pIV1,
+ const void *pIV2,
+ const void *pIV3,
+ const void *pIV4,
+ const void *pIV5,
+ const void *pIV6,
+ const void *pIV7,
+ const void *pIV8,
+ const void *pBufIn1,
+ void *pBufOut1,
+ const uint32_t lenInBytes1,
+ const void *pBufIn2,
+ void *pBufOut2,
+ const uint32_t lenInBytes2,
+ const void *pBufIn3,
+ void *pBufOut3,
+ const uint32_t lenInBytes3,
+ const void *pBufIn4,
+ void *pBufOut4,
+ const uint32_t lenInBytes4,
+ const void *pBufIn5,
+ void *pBufOut5,
+ const uint32_t lenInBytes5,
+ const void *pBufIn6,
+ void *pBufOut6,
+ const uint32_t lenInBytes6,
+ const void *pBufIn7,
+ void *pBufOut7,
+ const uint32_t lenInBytes7,
+ const void *pBufIn8,
+ void *pBufOut8,
+ const uint32_t lenInBytes8)
+{
+#ifdef SAFE_PARAM
+ if ((pHandle == NULL) ||
+ (pIV1 == NULL) || (pIV2 == NULL) ||
+ (pIV3 == NULL) || (pIV4 == NULL) ||
+ (pIV5 == NULL) || (pIV6 == NULL) ||
+ (pIV7 == NULL) || (pIV8 == NULL) ||
+ (pBufIn1 == NULL) || (pBufOut1 == NULL) ||
+ (pBufIn2 == NULL) || (pBufOut2 == NULL) ||
+ (pBufIn3 == NULL) || (pBufOut3 == NULL) ||
+ (pBufIn4 == NULL) || (pBufOut4 == NULL) ||
+ (pBufIn5 == NULL) || (pBufOut5 == NULL) ||
+ (pBufIn6 == NULL) || (pBufOut6 == NULL) ||
+ (pBufIn7 == NULL) || (pBufOut7 == NULL) ||
+ (pBufIn8 == NULL) || (pBufOut8 == NULL) ||
+ (lenInBytes1 == 0) || (lenInBytes1 > SNOW3G_MAX_BYTELEN) ||
+ (lenInBytes2 == 0) || (lenInBytes2 > SNOW3G_MAX_BYTELEN) ||
+ (lenInBytes3 == 0) || (lenInBytes3 > SNOW3G_MAX_BYTELEN) ||
+ (lenInBytes4 == 0) || (lenInBytes4 > SNOW3G_MAX_BYTELEN) ||
+ (lenInBytes5 == 0) || (lenInBytes5 > SNOW3G_MAX_BYTELEN) ||
+ (lenInBytes6 == 0) || (lenInBytes6 > SNOW3G_MAX_BYTELEN) ||
+ (lenInBytes7 == 0) || (lenInBytes7 > SNOW3G_MAX_BYTELEN) ||
+ (lenInBytes8 == 0) || (lenInBytes8 > SNOW3G_MAX_BYTELEN))
+ return;
+#endif
+
+#ifdef AVX2
+ uint32_t bytes1 =
+ (lenInBytes1 < lenInBytes2 ? lenInBytes1
+ : lenInBytes2); /* number of bytes */
+ uint32_t bytes2 =
+ (lenInBytes3 < lenInBytes4 ? lenInBytes3
+ : lenInBytes4); /* number of bytes */
+ uint32_t bytes3 =
+ (lenInBytes5 < lenInBytes6 ? lenInBytes5
+ : lenInBytes6); /* number of bytes */
+ uint32_t bytes4 =
+ (lenInBytes7 < lenInBytes8 ? lenInBytes7
+ : lenInBytes8); /* number of bytes */
+ uint32_t bytesq1 =
+ (bytes1 < bytes2) ? bytes1 : bytes2; /* min number of bytes */
+ uint32_t bytesq2 = (bytes3 < bytes4) ? bytes3 : bytes4;
+ uint32_t bytes = (bytesq1 < bytesq2) ? bytesq1 : bytesq2;
+
+ if (bytes % 32) {
+ snow3g_8_buffer_ks_8(
+ bytes, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6,
+ pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2,
+ pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3,
+ pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5,
+ lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7,
+ pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8);
+ } else {
+ snow3g_8_buffer_ks_32(
+ bytes, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6,
+ pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2,
+ pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3,
+ pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5,
+ lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7,
+ pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8);
+ }
+#ifdef SAFE_DATA
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#else /* ~AVX2 */
+ SNOW3G_F8_2_BUFFER(pHandle, pIV1, pIV2, pBufIn1, pBufOut1, lenInBytes1,
+ pBufIn2, pBufOut2, lenInBytes2);
+
+ SNOW3G_F8_2_BUFFER(pHandle, pIV3, pIV4, pBufIn3, pBufOut3, lenInBytes3,
+ pBufIn4, pBufOut4, lenInBytes4);
+
+ SNOW3G_F8_2_BUFFER(pHandle, pIV5, pIV6, pBufIn5, pBufOut5, lenInBytes5,
+ pBufIn6, pBufOut6, lenInBytes6);
+
+ SNOW3G_F8_2_BUFFER(pHandle, pIV7, pIV8, pBufIn7, pBufOut7, lenInBytes7,
+ pBufIn8, pBufOut8, lenInBytes8);
+#endif /* AVX2 */
+}
+
+/******************************************************************************
+ * @description
+ * Snow3G F8 multi packet:
+ * Performs F8 enc/dec on [n] packets. The operation is performed in-place.
+ * The input IV's are passed in Little Endian format.
+ * The KeySchedule is in Little Endian format.
+ ******************************************************************************/
+void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx,
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t bufLenInBytes[],
+ const uint32_t packetCount)
+{
+#ifdef SAFE_PARAM
+ uint32_t i;
+
+ if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) ||
+ (pBufferOut == NULL) || (bufLenInBytes == NULL))
+ return;
+
+ for (i = 0; i < packetCount; i++)
+ if ((IV[i] == NULL) || (pBufferIn[i] == NULL) ||
+ (pBufferOut[i] == NULL) || (bufLenInBytes[i] == 0) ||
+ (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN))
+ return;
+#endif
+ if (packetCount > 16) {
+ pBufferOut[0] = NULL;
+                printf("packetCount too high (%u)\n", packetCount);
+ return;
+ }
+
+ uint32_t packet_index, inner_index, pktCnt = packetCount;
+ int sortNeeded = 0, tempLen = 0;
+ uint8_t *srctempbuff;
+ uint8_t *dsttempbuff;
+ uint8_t *ivtempbuff;
+ uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL};
+ uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL};
+ uint8_t *pIV[NUM_PACKETS_16] = {NULL};
+ uint32_t lensBuf[NUM_PACKETS_16] = {0};
+
+ memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t));
+ memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *));
+ memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *));
+ memcpy((void *)pIV, IV, packetCount * sizeof(void *));
+
+ packet_index = packetCount;
+
+ while (packet_index--) {
+
+ /* check if all packets are sorted by decreasing length */
+ if (packet_index > 0 && lensBuf[packet_index - 1] <
+ lensBuf[packet_index]) {
+ /* this packet array is not correctly sorted */
+ sortNeeded = 1;
+ }
+ }
+
+ if (sortNeeded) {
+
+                /* sort packets in decreasing buffer size from [0] to
+                   [n]th packet, where buffer[0] will contain the longest
+                   buffer and buffer[n] will contain the shortest buffer.
+                   4 arrays are swapped:
+ - pointers to input buffers
+ - pointers to output buffers
+ - pointers to input IV's
+ - input buffer lengths */
+ packet_index = packetCount;
+ while (packet_index--) {
+
+ inner_index = packet_index;
+ while (inner_index--) {
+
+ if (lensBuf[packet_index] >
+ lensBuf[inner_index]) {
+
+ /* swap buffers to arrange in
+ descending order from [0]. */
+ srctempbuff = pSrcBuf[packet_index];
+ dsttempbuff = pDstBuf[packet_index];
+ ivtempbuff = pIV[packet_index];
+ tempLen = lensBuf[packet_index];
+
+ pSrcBuf[packet_index] =
+ pSrcBuf[inner_index];
+ pDstBuf[packet_index] =
+ pDstBuf[inner_index];
+ pIV[packet_index] = pIV[inner_index];
+ lensBuf[packet_index] =
+ lensBuf[inner_index];
+
+ pSrcBuf[inner_index] = srctempbuff;
+ pDstBuf[inner_index] = dsttempbuff;
+ pIV[inner_index] = ivtempbuff;
+ lensBuf[inner_index] = tempLen;
+ }
+ } /* for inner packet index (inner bubble-sort) */
+ } /* for outer packet index (outer bubble-sort) */
+ } /* if sortNeeded */
+
+ packet_index = 0;
+ /* process 8 buffers at-a-time */
+#ifdef AVX2
+ while (pktCnt >= 8) {
+ pktCnt -= 8;
+ SNOW3G_F8_8_BUFFER(pCtx, pIV[packet_index],
+ pIV[packet_index + 1],
+ pIV[packet_index + 2],
+ pIV[packet_index + 3],
+ pIV[packet_index + 4],
+ pIV[packet_index + 5],
+ pIV[packet_index + 6],
+ pIV[packet_index + 7],
+ pSrcBuf[packet_index],
+ pDstBuf[packet_index],
+ lensBuf[packet_index],
+ pSrcBuf[packet_index + 1],
+ pDstBuf[packet_index + 1],
+ lensBuf[packet_index + 1],
+ pSrcBuf[packet_index + 2],
+ pDstBuf[packet_index + 2],
+ lensBuf[packet_index + 2],
+ pSrcBuf[packet_index + 3],
+ pDstBuf[packet_index + 3],
+ lensBuf[packet_index + 3],
+ pSrcBuf[packet_index + 4],
+ pDstBuf[packet_index + 4],
+ lensBuf[packet_index + 4],
+ pSrcBuf[packet_index + 5],
+ pDstBuf[packet_index + 5],
+ lensBuf[packet_index + 5],
+ pSrcBuf[packet_index + 6],
+ pDstBuf[packet_index + 6],
+ lensBuf[packet_index + 6],
+ pSrcBuf[packet_index + 7],
+ pDstBuf[packet_index + 7],
+ lensBuf[packet_index + 7]);
+ packet_index += 8;
+ }
+#endif
+ /* process 4 buffers at-a-time */
+ while (pktCnt >= 4) {
+ pktCnt -= 4;
+ SNOW3G_F8_4_BUFFER(pCtx, pIV[packet_index + 0],
+ pIV[packet_index + 1],
+ pIV[packet_index + 2],
+ pIV[packet_index + 3],
+ pSrcBuf[packet_index + 0],
+ pDstBuf[packet_index + 0],
+ lensBuf[packet_index + 0],
+ pSrcBuf[packet_index + 1],
+ pDstBuf[packet_index + 1],
+ lensBuf[packet_index + 1],
+ pSrcBuf[packet_index + 2],
+ pDstBuf[packet_index + 2],
+ lensBuf[packet_index + 2],
+ pSrcBuf[packet_index + 3],
+ pDstBuf[packet_index + 3],
+ lensBuf[packet_index + 3]);
+ packet_index += 4;
+ }
+
+ /* process 2 packets at-a-time */
+ while (pktCnt >= 2) {
+ pktCnt -= 2;
+ SNOW3G_F8_2_BUFFER(pCtx, pIV[packet_index + 0],
+ pIV[packet_index + 1],
+ pSrcBuf[packet_index + 0],
+ pDstBuf[packet_index + 0],
+ lensBuf[packet_index + 0],
+ pSrcBuf[packet_index + 1],
+ pDstBuf[packet_index + 1],
+ lensBuf[packet_index + 1]);
+ packet_index += 2;
+ }
+
+ /* remaining packets are processed 1 at a time */
+ while (pktCnt--) {
+ SNOW3G_F8_1_BUFFER(pCtx, pIV[packet_index + 0],
+ pSrcBuf[packet_index + 0],
+ pDstBuf[packet_index + 0],
+ lensBuf[packet_index + 0]);
+ packet_index++;
+ }
+}
+
+void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[],
+ const void * const IV[],
+ const void * const pBufferIn[],
+ void *pBufferOut[],
+ const uint32_t bufLenInBytes[],
+ const uint32_t packetCount)
+{
+#ifdef SAFE_PARAM
+ uint32_t i;
+
+ if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) ||
+ (pBufferOut == NULL) || (bufLenInBytes == NULL))
+ return;
+
+ for (i = 0; i < packetCount; i++)
+ if ((pCtx[i] == NULL) || (IV[i] == NULL) ||
+ (pBufferIn[i] == NULL) || (pBufferOut[i] == NULL) ||
+ (bufLenInBytes[i] == 0) ||
+ (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN))
+ return;
+#endif
+ if (packetCount > 16) {
+ pBufferOut[0] = NULL;
+                printf("packetCount too high (%u)\n", packetCount);
+ return;
+ }
+
+ uint32_t packet_index, inner_index, pktCnt = packetCount;
+ int sortNeeded = 0, tempLen = 0;
+ uint8_t *srctempbuff;
+ uint8_t *dsttempbuff;
+ uint8_t *ivtempbuff;
+ snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL};
+ uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL};
+ uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL};
+ uint8_t *pIV[NUM_PACKETS_16] = {NULL};
+ uint32_t lensBuf[NUM_PACKETS_16] = {0};
+ snow3g_key_schedule_t *tempCtx;
+
+ memcpy((void *)pCtxBuf, pCtx, packetCount * sizeof(void *));
+ memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t));
+ memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *));
+ memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *));
+ memcpy((void *)pIV, IV, packetCount * sizeof(void *));
+
+ packet_index = packetCount;
+
+ while (packet_index--) {
+
+ /* check if all packets are sorted by decreasing length */
+ if (packet_index > 0 && lensBuf[packet_index - 1] <
+ lensBuf[packet_index]) {
+ /* this packet array is not correctly sorted */
+ sortNeeded = 1;
+ }
+ }
+
+ if (sortNeeded) {
+ /* sort packets in decreasing buffer size from [0] to [n]th
+                   packet, where buffer[0] will contain the longest buffer and
+                   buffer[n] will contain the shortest buffer.
+                   5 arrays are swapped:
+                   - pointers to input buffers
+                   - pointers to output buffers
+                   - pointers to input IV's
+                   - pointers to key schedules
+                   - input buffer lengths */
+ packet_index = packetCount;
+ while (packet_index--) {
+ inner_index = packet_index;
+ while (inner_index--) {
+ if (lensBuf[packet_index] >
+ lensBuf[inner_index]) {
+ /* swap buffers to arrange in
+ descending order from [0]. */
+ srctempbuff = pSrcBuf[packet_index];
+ dsttempbuff = pDstBuf[packet_index];
+ ivtempbuff = pIV[packet_index];
+ tempLen = lensBuf[packet_index];
+ tempCtx = pCtxBuf[packet_index];
+
+ pSrcBuf[packet_index] =
+ pSrcBuf[inner_index];
+ pDstBuf[packet_index] =
+ pDstBuf[inner_index];
+ pIV[packet_index] = pIV[inner_index];
+ lensBuf[packet_index] =
+ lensBuf[inner_index];
+ pCtxBuf[packet_index] =
+ pCtxBuf[inner_index];
+
+ pSrcBuf[inner_index] = srctempbuff;
+ pDstBuf[inner_index] = dsttempbuff;
+ pIV[inner_index] = ivtempbuff;
+ lensBuf[inner_index] = tempLen;
+ pCtxBuf[inner_index] = tempCtx;
+ }
+ } /* for inner packet index (inner bubble-sort) */
+ } /* for outer packet index (outer bubble-sort) */
+ } /* if sortNeeded */
+
+ packet_index = 0;
+ /* process 8 buffers at-a-time */
+#ifdef AVX2
+ while (pktCnt >= 8) {
+ pktCnt -= 8;
+ SNOW3G_F8_8_BUFFER_MULTIKEY(
+ (const snow3g_key_schedule_t * const *)
+ &pCtxBuf[packet_index],
+ (const void * const *)&pIV[packet_index],
+ (const void * const *)&pSrcBuf[packet_index],
+ (void **)&pDstBuf[packet_index],
+ &lensBuf[packet_index]);
+ packet_index += 8;
+ }
+#endif
+ /* TODO process 4 buffers at-a-time */
+ /* TODO process 2 packets at-a-time */
+ /* remaining packets are processed 1 at a time */
+ while (pktCnt--) {
+ SNOW3G_F8_1_BUFFER(pCtxBuf[packet_index + 0],
+ pIV[packet_index + 0],
+ pSrcBuf[packet_index + 0],
+ pDstBuf[packet_index + 0],
+ lensBuf[packet_index + 0]);
+ packet_index++;
+ }
+}
+
+/*---------------------------------------------------------
+ * @description
+ * Snow3G F9 1 buffer
+ * Single buffer digest with IV and precomputed key schedule
+ *---------------------------------------------------------*/
+void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
+ const void *pIV,
+ const void *pBufferIn,
+ const uint64_t lengthInBits,
+ void *pDigest)
+{
+#ifdef SAFE_PARAM
+ if ((pHandle == NULL) || (pIV == NULL) ||
+ (pBufferIn == NULL) || (pDigest == NULL) ||
+ (lengthInBits == 0) || (lengthInBits > SNOW3G_MAX_BITLEN))
+ return;
+#endif
+ snow3gKeyState1_t ctx;
+ uint32_t z[5];
+ uint64_t lengthInQwords, E, V, P;
+ uint64_t i, rem_bits;
+ const uint64_t *inputBuffer;
+
+ inputBuffer = (const uint64_t *)pBufferIn;
+
+ /* Initialize the snow3g key schedule */
+ snow3gStateInitialize_1(&ctx, pHandle, pIV);
+
+        /* Generate 5 keystream words */
+ snow3g_f9_keystream_words(&ctx, &z[0]);
+
+ P = ((uint64_t)z[0] << 32) | ((uint64_t)z[1]);
+
+ lengthInQwords = lengthInBits / 64;
+
+ E = 0;
+ /* all blocks except the last one */
+ for (i = 0; i < lengthInQwords; i++) {
+ V = BSWAP64(inputBuffer[i]);
+ E = multiply_and_reduce64(E ^ V, P);
+ }
+
+ /* last bits of last block if any left */
+ rem_bits = lengthInBits % 64;
+ if (rem_bits) {
+ /* last bytes, do not go past end of buffer */
+ memcpy(&V, &inputBuffer[i], (rem_bits + 7) / 8);
+ V = BSWAP64(V);
+ V &= (((uint64_t)-1) << (64 - rem_bits)); /* mask extra bits */
+ E = multiply_and_reduce64(E ^ V, P);
+ }
+
+ /* Multiply by Q */
+ E = multiply_and_reduce64(E ^ lengthInBits,
+ (((uint64_t)z[2] << 32) | ((uint64_t)z[3])));
+
+ /* Final MAC */
+ *(uint32_t *)pDigest =
+ (uint32_t)BSWAP64(E ^ ((uint64_t)z[4] << 32));
+#ifdef SAFE_DATA
+ CLEAR_VAR(&E, sizeof(E));
+ CLEAR_VAR(&V, sizeof(V));
+ CLEAR_VAR(&P, sizeof(P));
+ CLEAR_MEM(&z, sizeof(z));
+ CLEAR_MEM(&ctx, sizeof(ctx));
+ CLEAR_SCRATCH_GPS();
+ CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
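+/*
+ * Note (illustrative comment, not part of the original sources): the loop in
+ * SNOW3G_F9_1_BUFFER() is a Horner-style evaluation in GF(2^64) (via
+ * multiply_and_reduce64()) of
+ *   E = ((...((M_0 * P) ^ M_1) * P ...) ^ lengthInBits) * Q
+ * where P = z[0]||z[1] and Q = z[2]||z[3] come from the five generated
+ * keystream words; the 32-bit MAC is the top half of E ^ ((uint64_t)z[4] << 32),
+ * stored big-endian in pDigest.
+ */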
+
+#endif /* SNOW3G_COMMON_H */
diff --git a/src/spdk/intel-ipsec-mb/include/snow3g_internal.h b/src/spdk/intel-ipsec-mb/include/snow3g_internal.h
new file mode 100644
index 000000000..287d60be1
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/snow3g_internal.h
@@ -0,0 +1,638 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#ifndef _SNOW3G_INTERNAL_H_
+#define _SNOW3G_INTERNAL_H_
+
+#include "intel-ipsec-mb.h"
+#include "wireless_common.h"
+#include "constant_lookup.h"
+
+#define MAX_KEY_LEN (16)
+#define SNOW3G_4_BYTES (4)
+#define SNOW3G_8_BYTES (8)
+#define SNOW3G_8_BITS (8)
+#define SNOW3G_16_BYTES (16)
+#define SNOW3G_16_BITS (16)
+
+#define SNOW3G_BLOCK_SIZE (8)
+
+#define SNOW3G_KEY_LEN_IN_BYTES (16) /* 128b */
+#define SNOW3G_IV_LEN_IN_BYTES (16) /* 128b */
+
+#define SNOW3GCONSTANT (0x1b)
+
+/* Range of input data for SNOW3G is from 1 to 2^32 bits */
+#define SNOW3G_MIN_LEN 1
+#define SNOW3G_MAX_BITLEN (UINT32_MAX)
+#define SNOW3G_MAX_BYTELEN (UINT32_MAX / 8)
+
+#define ComplementaryMask64(x) ((~(x) % 64) + 1)
+#define ComplementaryMask32(x) ((~(x) % 32) + 1)
+
+#ifndef SAFE_LOOKUP
+/* standard lookup */
+#define SNOW3G_LOOKUP_W0(table, idx, size) \
+ table[idx].w0.v
+#define SNOW3G_LOOKUP_W1(table, idx, size) \
+ table[idx].w1.v
+#define SNOW3G_LOOKUP_W2(table, idx, size) \
+ table[idx].w2.v
+#define SNOW3G_LOOKUP_W3(table, idx, size) \
+ table[idx].w3.v
+#else
+/* constant time lookup */
+#if defined (AVX) || defined (AVX2)
+#define SNOW3G_LOOKUP_W0(table, idx, size) \
+ ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 0))
+#define SNOW3G_LOOKUP_W1(table, idx, size) \
+ ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 8))
+#define SNOW3G_LOOKUP_W2(table, idx, size) \
+ ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 16))
+#define SNOW3G_LOOKUP_W3(table, idx, size) \
+ ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 24))
+#else
+#define SNOW3G_LOOKUP_W0(table, idx, size) \
+ ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 0))
+#define SNOW3G_LOOKUP_W1(table, idx, size) \
+ ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 8))
+#define SNOW3G_LOOKUP_W2(table, idx, size) \
+ ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 16))
+#define SNOW3G_LOOKUP_W3(table, idx, size) \
+ ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 24))
+#endif /* AVX || AVX2 */
+#endif /* SAFE_LOOKUP */
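+/*
+ * Note (illustrative comment, not part of the original sources): with
+ * SAFE_LOOKUP defined the S-box reads go through the LOOKUP64_* helpers from
+ * constant_lookup.h, which are expected to access the table with a
+ * data-independent memory pattern instead of indexing it directly, trading
+ * some speed for resistance to cache-timing side channels.
+ */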
+
+typedef union SafeBuffer {
+ uint64_t b64;
+ uint32_t b32[2];
+ uint8_t b8[SNOW3G_8_BYTES];
+} SafeBuf;
+
+typedef struct snow3gKeyState1_s {
+ /* 16 LFSR stages */
+ uint32_t LFSR_S[16];
+ /* 3 FSM states */
+ uint32_t FSM_R3;
+ uint32_t FSM_R2;
+ uint32_t FSM_R1;
+} DECLARE_ALIGNED(snow3gKeyState1_t, 16);
+
+typedef struct snow3gKeyState4_s {
+ /* 16 LFSR stages */
+ __m128i LFSR_X[16];
+ /* 3 FSM states */
+ __m128i FSM_X[3];
+ uint32_t iLFSR_X;
+
+} snow3gKeyState4_t;
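+/*
+ * Note (illustrative comment, not part of the original sources): in the
+ * multi-buffer states the LFSR is kept as a circular buffer; iLFSR_X is the
+ * index of logical stage 0 inside LFSR_X[], which is why the conversion
+ * routines read LFSR_X[(i + iLFSR_X) % 16] when flattening a lane into a
+ * snow3gKeyState1_t.
+ */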
+
+
+#ifdef _WIN32
+#pragma pack(push,1)
+#define DECLARE_PACKED_UINT32(x) uint32_t x
+#else
+#define DECLARE_PACKED_UINT32(x) uint32_t x __attribute__((__packed__))
+#endif
+
+typedef union snow3gTableEntry_u {
+ uint64_t v;
+ struct {
+ uint8_t shift[3];
+ DECLARE_PACKED_UINT32(v);
+ } w3;
+ struct {
+ uint8_t shift[2];
+ DECLARE_PACKED_UINT32(v);
+ } w2;
+ struct {
+ uint8_t shift[1];
+ DECLARE_PACKED_UINT32(v);
+ } w1;
+ struct {
+ uint8_t shift[4];
+ DECLARE_PACKED_UINT32(v);
+ } w0;
+} snow3gTableEntry_t;
+#ifdef _WIN32
+#pragma pack(pop)
+#endif
+
+#define rotl32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+
+#define rotr32(x, n) (((x) << (32 - (n))) | ((x) >> (n)))
+
+#define rotl8(x, n) (((x) << (n)) | ((x) >> (8 - (n))))
+
+#define rotr8(x, n) (((x) << (8 - (n))) | ((x) >> (n)))
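+/*
+ * Note (illustrative comment, not part of the original sources): as written
+ * the rotate macros assume 0 < n < width, e.g. rotl32(0x80000001, 1) is
+ * 0x00000003; a rotation by 0 or by the full width would invoke an undefined
+ * shift.
+ */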
+
+/*************************************************************************
+ * @description - snow3g internal tables
+ *************************************************************************/
+
+extern const int snow3g_table_A_mul[256];
+extern const int snow3g_table_A_div[256];
+extern snow3gTableEntry_t snow3g_table_S1[256];
+extern snow3gTableEntry_t snow3g_table_S2[256];
+extern const int S1_T0[256];
+extern const int S1_T1[256];
+extern const int S1_T2[256];
+extern const int S1_T3[256];
+extern const int S2_T0[256];
+extern const int S2_T1[256];
+extern const int S2_T2[256];
+extern const int S2_T3[256];
+
+/* -------------------------------------------------------------------
+ * combined S-Box processing for reduced instruction dependencies
+ *
+ * S1_S2_1    : 2 S-Boxes, 1 packet at a time
+ * S1_S2_S3_1 : 3 S-Boxes at the same time
+ *
+ * S1_S2_4    : 2 S-Boxes, 4 packets at a time
+ * S1_S2_8    : 2 S-Boxes, 8 packets at a time (AVX2 only)
+ *
+ * ------------------------------------------------------------------ */
+#ifdef AVX2
+#define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
+
+#ifndef _mm256_loadu2_m128i
+#define _mm256_loadu2_m128i(hi, lo) \
+ _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo)), \
+ _mm_loadu_si128((const __m128i *)hi), 1)
+#endif /* _mm256_loadu2_m128i */
+
+typedef struct snow3gKeyState8_s {
+ /* 16 LFSR stages */
+ __m256i LFSR_X[16];
+ /* 3 FSM states */
+ __m256i FSM_X[3];
+ uint32_t iLFSR_X;
+
+} snow3gKeyState8_t;
+
+/* Sbox Snow3g_S1 and Snow3g_S2 with dependency unrolling
+ * for n in [0..7]
+ *    w[n-1] = k; y[n] = Snow3g_S2(w[n]); k = Snow3g_S1(x[n])
+ */
+#define S1_S2_8(y, w, x, k, l, n) \
+ do { \
+ uint8_t w0, w1, w2, w3; \
+ uint8_t x0, x1, x2, x3; \
+ uint32_t ty = l; \
+ w3 = _mm256_extract_epi8(w, (4 * n + 0)); \
+ w2 = _mm256_extract_epi8(w, (4 * n + 1)); \
+ w1 = _mm256_extract_epi8(w, (4 * n + 2)); \
+ w0 = _mm256_extract_epi8(w, (4 * n + 3)); \
+ l = snow3g_table_S2[w3].w3.v ^ snow3g_table_S2[w2].w2.v ^ \
+ snow3g_table_S2[w1].w1.v ^ snow3g_table_S2[w0].w0.v; \
+ if (n != 0) \
+ w = _mm256_insert_epi32(w, k, (n - 1)); \
+ if (n != 0) \
+ y = _mm256_insert_epi32(y, ty, (n - 1)); \
+ x3 = _mm256_extract_epi8(x, (4 * n + 0)); \
+ x2 = _mm256_extract_epi8(x, (4 * n + 1)); \
+ x1 = _mm256_extract_epi8(x, (4 * n + 2)); \
+ x0 = _mm256_extract_epi8(x, (4 * n + 3)); \
+ k = snow3g_table_S1[x3].w3.v ^ snow3g_table_S1[x2].w2.v ^ \
+ snow3g_table_S1[x1].w1.v ^ snow3g_table_S1[x0].w0.v; \
+ if (n == 7) \
+ w = _mm256_insert_epi32(w, k, n); \
+ if (n == 7) \
+ y = _mm256_insert_epi32(y, l, n); \
+ } while (0)
+#endif /* AVX2 */
+
+
+#if defined (NO_AESNI) || defined (SAFE_LOOKUP)
+/* help compilers interleave independent operations
+ * with the table access latencies
+ */
+
+/* Sbox Snow3g_S1 and Snow3g_S2, simple C code
+ * y = Snow3g_S2(w); w = Snow3g_S1(x);
+ */
+#define S1_S2_1(y, w, x) \
+ do { \
+ uint32_t w0, w1, w2, w3; \
+ uint32_t x0, x1, x2, x3; \
+ uint32_t tw, tx; \
+ w3 = w & 0xff; \
+ x3 = x & 0xff; \
+ tw = SNOW3G_LOOKUP_W3(snow3g_table_S2, w3, \
+ sizeof(snow3g_table_S2)); \
+ tx = SNOW3G_LOOKUP_W3(snow3g_table_S1, x3, \
+ sizeof(snow3g_table_S1)); \
+ w0 = w >> 24; \
+ x0 = x >> 24; \
+ tw ^= SNOW3G_LOOKUP_W0(snow3g_table_S2, w0, \
+ sizeof(snow3g_table_S2)); \
+ tx ^= SNOW3G_LOOKUP_W0(snow3g_table_S1, x0, \
+ sizeof(snow3g_table_S1)); \
+ w1 = (w >> 16) & 0xff; \
+ x1 = (x >> 16) & 0xff; \
+ tw ^= SNOW3G_LOOKUP_W1(snow3g_table_S2, w1, \
+ sizeof(snow3g_table_S2)); \
+ tx ^= SNOW3G_LOOKUP_W1(snow3g_table_S1, x1, \
+ sizeof(snow3g_table_S1)); \
+ w2 = (w >> 8) & 0xff; \
+ x2 = (x >> 8) & 0xff; \
+ y = tw ^ SNOW3G_LOOKUP_W2(snow3g_table_S2, w2, \
+ sizeof(snow3g_table_S2)); \
+ w = tx ^ SNOW3G_LOOKUP_W2(snow3g_table_S1, x2, \
+ sizeof(snow3g_table_S1)); \
+ } while (0)
+
+/* Sbox Snow3g_S1 and Snow3g_S2, simple C code
+ * y = Snow3g_S2(w); w = Snow3g_S1(x); u = Snow3g_S1(z);
+ */
+#define S1_S2_S3_1(y, w, x, u, z) \
+ do { \
+ unsigned w0, w1, w2, w3; \
+ unsigned x0, x1, x2, x3; \
+ unsigned z0, z1, z2, z3; \
+ uint32_t tw, tx, tz; \
+ w3 = w & 0xff; \
+ x3 = x & 0xff; \
+ z3 = z & 0xff; \
+ tw = SNOW3G_LOOKUP_W3(snow3g_table_S2, w3, \
+ sizeof(snow3g_table_S2)); \
+ tx = SNOW3G_LOOKUP_W3(snow3g_table_S1, x3, \
+ sizeof(snow3g_table_S1)); \
+ tz = SNOW3G_LOOKUP_W3(snow3g_table_S1, z3, \
+ sizeof(snow3g_table_S1)); \
+ w0 = w >> 24; \
+ x0 = x >> 24; \
+ z0 = z >> 24; \
+ tw ^= SNOW3G_LOOKUP_W0(snow3g_table_S2, w0, \
+ sizeof(snow3g_table_S2)); \
+ tx ^= SNOW3G_LOOKUP_W0(snow3g_table_S1, x0, \
+ sizeof(snow3g_table_S1)); \
+ tz ^= SNOW3G_LOOKUP_W0(snow3g_table_S1, z0, \
+ sizeof(snow3g_table_S1)); \
+ w1 = (w >> 16) & 0xff; \
+ x1 = (x >> 16) & 0xff; \
+ z1 = (z >> 16) & 0xff; \
+ tw ^= SNOW3G_LOOKUP_W1(snow3g_table_S2, w1, \
+ sizeof(snow3g_table_S2)); \
+ tx ^= SNOW3G_LOOKUP_W1(snow3g_table_S1, x1, \
+ sizeof(snow3g_table_S1)); \
+ tz ^= SNOW3G_LOOKUP_W1(snow3g_table_S1, z1, \
+ sizeof(snow3g_table_S1)); \
+ w2 = (w >> 8) & 0xff; \
+ x2 = (x >> 8) & 0xff; \
+ z2 = (z >> 8) & 0xff; \
+ y = tw ^ SNOW3G_LOOKUP_W2(snow3g_table_S2, w2, \
+ sizeof(snow3g_table_S2)); \
+ w = tx ^ SNOW3G_LOOKUP_W2(snow3g_table_S1, x2, \
+ sizeof(snow3g_table_S1)); \
+ u = tz ^ SNOW3G_LOOKUP_W2(snow3g_table_S1, z2, \
+ sizeof(snow3g_table_S1)); \
+ } while (0)
+
+/* Sbox Snow3g_S1 and Snow3g_S2 with dependency unrolling
+ * for n in [0..3]
+ * w[n-1] = k; y[n] = Snow3g_S2(w[n]); k = Snow3g_S1(x[n])
+ *
+ *
+ */
+#define S1_S2_4(y, w, x, k, l, n) \
+ do { \
+ unsigned w0, w1, w2, w3; \
+ unsigned x0, x1, x2, x3; \
+ uint32_t ty = l; \
+ w3 = _mm_extract_epi8(w, (4 * n + 0)); \
+ w2 = _mm_extract_epi8(w, (4 * n + 1)); \
+ w1 = _mm_extract_epi8(w, (4 * n + 2)); \
+ w0 = _mm_extract_epi8(w, (4 * n + 3)); \
+ l = SNOW3G_LOOKUP_W3(snow3g_table_S2, w3, \
+ sizeof(snow3g_table_S2)) ^ \
+ SNOW3G_LOOKUP_W2(snow3g_table_S2, w2, \
+ sizeof(snow3g_table_S2)) ^ \
+ SNOW3G_LOOKUP_W1(snow3g_table_S2, w1, \
+ sizeof(snow3g_table_S2)) ^ \
+ SNOW3G_LOOKUP_W0(snow3g_table_S2, w0, \
+ sizeof(snow3g_table_S2)); \
+ if (n != 0) \
+ w = _mm_insert_epi32(w, k, (n - 1)); \
+ if (n != 0) \
+ y = _mm_insert_epi32(y, ty, (n - 1)); \
+ x3 = _mm_extract_epi8(x, (4 * n + 0)); \
+ x2 = _mm_extract_epi8(x, (4 * n + 1)); \
+ x1 = _mm_extract_epi8(x, (4 * n + 2)); \
+ x0 = _mm_extract_epi8(x, (4 * n + 3)); \
+ k = SNOW3G_LOOKUP_W3(snow3g_table_S1, x3, \
+ sizeof(snow3g_table_S1)) ^ \
+ SNOW3G_LOOKUP_W2(snow3g_table_S1, x2, \
+ sizeof(snow3g_table_S1)) ^ \
+ SNOW3G_LOOKUP_W1(snow3g_table_S1, x1, \
+ sizeof(snow3g_table_S1)) ^ \
+ SNOW3G_LOOKUP_W0(snow3g_table_S1, x0, \
+ sizeof(snow3g_table_S1)); \
+ if (n == 3) \
+ w = _mm_insert_epi32(w, k, n); \
+ if (n == 3) \
+ y = _mm_insert_epi32(y, l, n); \
+ } while (0)
+
+#else /* SSE/AVX */
+
+/* use AES-NI Rijndael for Snow3G Sbox, overlap the latency
+ * of AESENC with Snow3g_S2 sbox calculations
+ */
+
+/* Sbox Snow3g_S2 in C and Snow3g_S1 via one AES-NI round
+ * y = Snow3g_S2(w); w = rijndael Snow3g_S1(x);
+ */
+#define S1_S2_1(y, w, x) \
+ do { \
+ __m128i m10, m11; \
+ m11 = _mm_cvtsi32_si128(x); \
+ m10 = _mm_setzero_si128(); \
+ m11 = _mm_shuffle_epi32(m11, 0x0); \
+ m11 = _mm_aesenc_si128(m11, m10); \
+ y = Snow3g_S2(w); \
+ w = _mm_cvtsi128_si32(m11); \
+ } while (0)
+
+/* Sbox Snow3g_S1 and Snow3g_S2
+ * y = Snow3g_S2(w); w = rijndael Snow3g_S1(x); v = rijndael Snow3g_S1(z);
+ */
+#define S1_S2_S3_1(y, w, x, v, z) \
+ do { \
+ __m128i m10, m11, m12; \
+ m11 = _mm_cvtsi32_si128(x); \
+ m10 = _mm_setzero_si128(); \
+ m11 = _mm_shuffle_epi32(m11, 0x0); \
+ m11 = _mm_aesenc_si128(m11, m10); \
+ m12 = _mm_cvtsi32_si128(z); \
+ m12 = _mm_shuffle_epi32(m12, 0x0); \
+ m12 = _mm_aesenc_si128(m12, m10); \
+ y = Snow3g_S2(w); \
+ w = _mm_cvtsi128_si32(m11); \
+ v = _mm_cvtsi128_si32(m12); \
+ } while (0)
+/* Sbox Snow3g_S1 and Snow3g_S2
+ * for n in [0..3]
+ * extract packet data
+ * y = Snow3g_S2(w); w = rijndael Snow3g_S1(x)
+ * insert the result data
+ */
+#define S1_S2_4(y, w, x, k, n) \
+ do { \
+ uint32_t ty; \
+ unsigned w0, w1, w2, w3; \
+ __m128i m10, m11; \
+ m10 = _mm_setzero_si128(); \
+ m11 = _mm_shuffle_epi32( \
+ x, ((n << 6) | (n << 4) | (n << 2) | (n << 0))); \
+ m11 = _mm_aesenc_si128(m11, m10); \
+ w3 = _mm_extract_epi8(w, (4 * n + 0)); \
+ w2 = _mm_extract_epi8(w, (4 * n + 1)); \
+ w1 = _mm_extract_epi8(w, (4 * n + 2)); \
+ w0 = _mm_extract_epi8(w, (4 * n + 3)); \
+ ty = snow3g_table_S2[w3].w3.v ^ snow3g_table_S2[w1].w1.v ^ \
+ snow3g_table_S2[w2].w2.v ^ snow3g_table_S2[w0].w0.v; \
+ if (n != 0) \
+ w = _mm_insert_epi32(w, k, (n - 1)); \
+ k = _mm_cvtsi128_si32(m11); \
+ if (n == 3) \
+ w = _mm_insert_epi32(w, k, n); \
+ y = _mm_insert_epi32(y, ty, n); \
+ } while (0)
+
+#endif /* NO_AESNI || SAFE_LOOKUP */
+
+/* -------------------------------------------------------------------
+ * Sbox Snow3g_S1 maps a 32bit input to a 32bit output
+ * ------------------------------------------------------------------ */
+static inline uint32_t Snow3g_S1(uint32_t w)
+{
+ uint32_t w0, w1, w2, w3;
+
+ w3 = w & 0xff;
+ w1 = (w >> 16) & 0xff;
+ w2 = (w >> 8) & 0xff;
+ w0 = w >> 24;
+ return snow3g_table_S1[w3].w3.v ^ snow3g_table_S1[w1].w1.v ^
+ snow3g_table_S1[w2].w2.v ^ snow3g_table_S1[w0].w0.v;
+}
+
+/* -------------------------------------------------------------------
+ * Sbox Snow3g_S2 maps a 32bit input to a 32bit output
+ * ------------------------------------------------------------------ */
+static inline uint32_t Snow3g_S2(uint32_t w)
+{
+ uint32_t w0, w1, w2, w3;
+
+ w3 = w & 0xff;
+ w1 = (w >> 16) & 0xff;
+ w2 = (w >> 8) & 0xff;
+ w0 = w >> 24;
+
+ return snow3g_table_S2[w3].w3.v ^ snow3g_table_S2[w1].w1.v ^
+ snow3g_table_S2[w2].w2.v ^ snow3g_table_S2[w0].w0.v;
+}
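+/*
+ * Note on the lookups above: each 64-bit table entry appears to pack the
+ * per-byte 32-bit S-box contribution at several byte offsets, so that the
+ * wN view selects the shift matching byte position N.  XOR-ing the four
+ * reads then assembles the full 32-bit S1/S2 output in one pass, much like
+ * the classic AES T-table construction.
+ */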
+
+/* -------------------------------------------------------------------
+ * LFSR array shift by 1 position
+ * ------------------------------------------------------------------ */
+static inline void ShiftLFSR_1(snow3gKeyState1_t *pCtx)
+{
+ uint32_t i;
+
+ for (i = 0; i < 15; i++)
+ pCtx->LFSR_S[i] = pCtx->LFSR_S[i + 1];
+}
+
+/* -------------------------------------------------------------------
+ * LFSR array shift by 2 positions
+ * ------------------------------------------------------------------ */
+static inline void ShiftTwiceLFSR_1(snow3gKeyState1_t *pCtx)
+{
+ int i;
+
+ for (i = 0; i < 14; i++)
+ pCtx->LFSR_S[i] = pCtx->LFSR_S[i + 2];
+}
+
+/* -------------------------------------------------------------------
+ * ClockFSM function as defined in snow3g standard
+ * The FSM takes 2 input words, S5 and S15, from the LFSR
+ * and produces a 32-bit output word F
+ * ------------------------------------------------------------------ */
+static inline void ClockFSM_1(snow3gKeyState1_t *pCtx, uint32_t *data)
+{
+ uint32_t F, R;
+
+ F = pCtx->LFSR_S[15] + pCtx->FSM_R1;
+ R = pCtx->FSM_R3 ^ pCtx->LFSR_S[5];
+ *data = F ^ pCtx->FSM_R2;
+ R += pCtx->FSM_R2;
+ S1_S2_1(pCtx->FSM_R3, pCtx->FSM_R2, pCtx->FSM_R1);
+ pCtx->FSM_R1 = R;
+}
+
+/* -------------------------------------------------------------------
+ * ClockLFSR function as defined in snow3g standard
+ * ------------------------------------------------------------------ */
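+/*
+ * The update below implements the keystream-mode feedback
+ *     s15' = (alpha * s0) ^ s2 ^ (alpha^-1 * s11)
+ * where alpha * s0 is computed as (s0 << 8) ^ snow3g_table_A_mul[s0 >> 24]
+ * and alpha^-1 * s11 as (s11 >> 8) ^ snow3g_table_A_div[s11 & 0xff].
+ */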
+static inline void ClockLFSR_1(snow3gKeyState1_t *pCtx)
+{
+ uint32_t V = pCtx->LFSR_S[2];
+ uint32_t S0 = pCtx->LFSR_S[0];
+ uint32_t S11 = pCtx->LFSR_S[11];
+
+ V ^= snow3g_table_A_mul[S0 >> 24];
+ V ^= snow3g_table_A_div[S11 & 0xff];
+ V ^= S0 << 8;
+ V ^= S11 >> 8;
+
+ ShiftLFSR_1(pCtx);
+
+ pCtx->LFSR_S[15] = V;
+}
+
+/**
+ *******************************************************************************
+ * @description
+ * This function initializes the LFSR and FSM state for 1 buffer for snow3g f8/f9.
+ *
+ * @param[in]      pCtx         Context where the initialized state is stored
+ * @param[in]      pKeySched    Key schedule
+ * @param[in]      pIV          IV
+ *
+ ******************************************************************************/
+static inline void
+snow3gStateInitialize_1(snow3gKeyState1_t *pCtx,
+ const snow3g_key_schedule_t *pKeySched,
+ const void *pIV)
+{
+ uint32_t K, L;
+ int i;
+ uint32_t V0, V1;
+ uint32_t F0, F1;
+ uint32_t L0, L1, L11, L12;
+ uint32_t R0, R1;
+ uint32_t FSM2, FSM3, FSM4;
+ const uint32_t *pIV32 = pIV;
+
+ /* LFSR initialisation */
+ for (i = 0; i < 4; i++) {
+ K = pKeySched->k[i];
+ L = ~K;
+ pCtx->LFSR_S[i + 4] = K;
+ pCtx->LFSR_S[i + 12] = K;
+ pCtx->LFSR_S[i + 0] = L;
+ pCtx->LFSR_S[i + 8] = L;
+ }
+
+ pCtx->LFSR_S[15] ^= BSWAP32(pIV32[3]);
+ pCtx->LFSR_S[12] ^= BSWAP32(pIV32[2]);
+ pCtx->LFSR_S[10] ^= BSWAP32(pIV32[1]);
+ pCtx->LFSR_S[9] ^= BSWAP32(pIV32[0]);
+
+        /* FSM initialization */
+ FSM2 = 0x0;
+ FSM3 = 0x0;
+ FSM4 = 0x0;
+ R1 = 0x0;
+ V1 = pCtx->LFSR_S[15];
+
+ for (i = 0; i < 16; i++) {
+ /* clock FSM + clock LFSR + clockFSM + clock LFSR */
+ L0 = pCtx->LFSR_S[0];
+ L1 = pCtx->LFSR_S[1];
+ V0 = pCtx->LFSR_S[2];
+ F0 = V1 + R1; /** (s15 + R1) **/
+ V1 = pCtx->LFSR_S[3];
+ V0 ^= snow3g_table_A_mul[L0 >> 24]; /* MUL(s0,0 ) */
+ F0 ^= FSM2; /** (s15 + R1) ^ R2 **/
+ V1 ^= snow3g_table_A_mul[L1 >> 24];
+ L11 = pCtx->LFSR_S[11];
+ L12 = pCtx->LFSR_S[12];
+ R0 = FSM3 ^ pCtx->LFSR_S[5]; /*** (R3 ^ s5 ) ***/
+ V0 ^= snow3g_table_A_div[L11 & 0xff]; /* DIV(s11,3 )*/
+ R0 += FSM2; /*** R2 + (R3 ^ s5 ) ***/
+ V1 ^= snow3g_table_A_div[L12 & 0xff];
+ V0 ^= L0 << 8; /* (s0,1 || s0,2 || s0,3 || 0x00) */
+ V1 ^= L1 << 8;
+ V0 ^= L11 >> 8; /* (0x00 || s11,0 || s11,1 || s11,2 ) */
+ V1 ^= L12 >> 8;
+ S1_S2_S3_1(FSM3, FSM2, R1, FSM4, R0);
+ V0 ^= F0; /* ^F */
+ R1 = FSM3 ^ pCtx->LFSR_S[6];
+ F1 = V0 + R0;
+ F1 ^= FSM2;
+ R1 += FSM2;
+ FSM3 = Snow3g_S2(FSM2);
+ FSM2 = FSM4;
+ V1 ^= F1;
+
+ /* shift LFSR twice */
+ ShiftTwiceLFSR_1(pCtx);
+
+ pCtx->LFSR_S[14] = V0;
+ pCtx->LFSR_S[15] = V1;
+ }
+
+ /* set FSM into scheduling structure */
+ pCtx->FSM_R3 = FSM3;
+ pCtx->FSM_R2 = FSM2;
+ pCtx->FSM_R1 = R1;
+}
+
+/**
+ *******************************************************************************
+ * @description
+ * This function generates 5 words of keystream used in the initial stages
+ * of snow3g F9.
+ *
+ * @param[in]        pCtx          Context where the state is stored
+ * @param[in/out]    pKeyStream    Pointer to the generated keystream
+ *
+ ******************************************************************************/
+static inline void snow3g_f9_keystream_words(snow3gKeyState1_t *pCtx,
+ uint32_t *pKeyStream)
+{
+ uint32_t F, XX;
+ int i;
+
+ ClockFSM_1(pCtx, &XX);
+ ClockLFSR_1(pCtx);
+
+ for (i = 0; i < 5; i++) {
+ ClockFSM_1(pCtx, &F);
+ pKeyStream[i] = F ^ pCtx->LFSR_S[0];
+ ClockLFSR_1(pCtx);
+ }
+}
+
+#endif /* _SNOW3G_INTERNAL_H_ */
diff --git a/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm b/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm
new file mode 100644
index 000000000..fed12cf4b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm
@@ -0,0 +1,218 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef _TRANSPOSE_AVX2_ASM_
+%define _TRANSPOSE_AVX2_ASM_
+
+%include "include/reg_sizes.asm"
+
+; LOAD ALL 8 LANES FOR 8x8 32-BIT TRANSPOSE
+;
+; r0-r7 [out] ymm registers which will contain the data to be transposed
+; addr0-addr7 [in] pointers to the next 32-byte block of data to be fetched for all 8 lanes
+; ptr_offset [in] offset to be applied on all pointers (addr0-addr7)
+%macro TRANSPOSE8_U32_LOAD8 17
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%addr0 %9
+%define %%addr1 %10
+%define %%addr2 %11
+%define %%addr3 %12
+%define %%addr4 %13
+%define %%addr5 %14
+%define %%addr6 %15
+%define %%addr7 %16
+%define %%ptr_offset %17
+
+; Expected output data
+;
+; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
+; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
+; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
+; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
+; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
+; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
+; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
+
+ vmovups XWORD(%%r0),[%%addr0+%%ptr_offset]
+ vmovups XWORD(%%r1),[%%addr1+%%ptr_offset]
+ vmovups XWORD(%%r2),[%%addr2+%%ptr_offset]
+ vmovups XWORD(%%r3),[%%addr3+%%ptr_offset]
+ vmovups XWORD(%%r4),[%%addr0+%%ptr_offset+16]
+ vmovups XWORD(%%r5),[%%addr1+%%ptr_offset+16]
+ vmovups XWORD(%%r6),[%%addr2+%%ptr_offset+16]
+ vmovups XWORD(%%r7),[%%addr3+%%ptr_offset+16]
+
+ vinserti128 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01
+ vinserti128 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01
+ vinserti128 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01
+ vinserti128 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01
+ vinserti128 %%r4, %%r4, [%%addr4+%%ptr_offset+16], 0x01
+ vinserti128 %%r5, %%r5, [%%addr5+%%ptr_offset+16], 0x01
+ vinserti128 %%r6, %%r6, [%%addr6+%%ptr_offset+16], 0x01
+ vinserti128 %%r7, %%r7, [%%addr7+%%ptr_offset+16], 0x01
+
+%endmacro
+
+; 8x8 32-BIT TRANSPOSE
+;
+; Before calling this macro, TRANSPOSE8_U32_LOAD8 must be called.
+;
+; r0-r3 [in/out] ymm registers containing bytes 0-15 of each 32B block (e.g. ymm0 = [e3-e0 a3-a0])
+; r4-r7 [in/out] ymm registers containing bytes 16-31 of each 32B block (e.g. ymm4 = [e4-e7 a4-a7])
+; t0-t1 [clobbered] ymm temporary registers
+%macro TRANSPOSE8_U32 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
+; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
+; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
+; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
+; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
+; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
+; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+;
+ ; process top half (r0..r3)
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {f1 f0 e1 e0 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {f3 f2 e3 e2 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {h1 h0 g1 g0 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {h3 h2 g3 g2 d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+ vshufps %%r2, %%r0, %%r2, 0x88 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+ vshufps %%r0, %%t0, %%t1, 0x88 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+
+ ;; process bottom half (r4..r7)
+ vshufps %%t0, %%r4, %%r5, 0x44 ; t0 = {f5 f4 e5 e4 b5 b4 a5 a4}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 b7 b6 a7 a6}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 d5 d4 c5 c4}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 d7 d6 c7 c6}
+
+ vshufps %%r5, %%t0, %%t1, 0xDD ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+ vshufps %%r7, %%r4, %%r6, 0xDD ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+ vshufps %%r6, %%r4, %%r6, 0x88 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+ vshufps %%r4, %%t0, %%t1, 0x88 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+%endmacro
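+; Illustrative usage sketch (hypothetical register and pointer assignment):
+; load one 32-byte block from each of 8 lane pointers, then transpose, e.g.
+;
+;     TRANSPOSE8_U32_LOAD8 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, \
+;                          r8, r9, r10, r11, r12, r13, r14, r15, 0
+;     TRANSPOSE8_U32 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9
+;
+; After the second macro, ymm0..ymm7 hold rows 0..7 of the transposed
+; 8x8 32-bit matrix (one dword per lane).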
+
+; LOAD ALL 4 LANES FOR 4x4 64-BIT TRANSPOSE
+;
+; r0-r3 [out] ymm registers which will contain the data to be transposed
+; addr0-addr3 [in] pointers to the next 32-byte block of data to be fetched for the 4 lanes
+; ptr_offset [in] offset to be applied on all pointers (addr0-addr3)
+%macro TRANSPOSE4_U64_LOAD4 9
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%addr0 %5
+%define %%addr1 %6
+%define %%addr2 %7
+%define %%addr3 %8
+%define %%ptr_offset %9
+
+; Expected output data
+;
+; r0 = {c1 c0 a1 a0}
+; r1 = {d1 d0 b1 b0}
+; r2 = {c3 c2 a3 a2}
+; r3 = {d3 d2 b3 b2}
+
+ vmovupd XWORD(%%r0),[%%addr0+%%ptr_offset]
+ vmovupd XWORD(%%r1),[%%addr1+%%ptr_offset]
+ vmovupd XWORD(%%r2),[%%addr0+%%ptr_offset+16]
+ vmovupd XWORD(%%r3),[%%addr1+%%ptr_offset+16]
+
+ vinserti128 %%r0, %%r0, [%%addr2+%%ptr_offset], 0x01
+ vinserti128 %%r1, %%r1, [%%addr3+%%ptr_offset], 0x01
+ vinserti128 %%r2, %%r2, [%%addr2+%%ptr_offset+16], 0x1
+ vinserti128 %%r3, %%r3, [%%addr3+%%ptr_offset+16], 0x01
+
+%endmacro
+
+; 4x4 64-BIT TRANSPOSE
+;
+; Before calling this macro, TRANSPOSE4_U64_LOAD4 must be called.
+;
+; This macro takes 4 registers as input (r0-r3)
+; and transposes their content (64-bit elements),
+; outputting the data in registers (o0,r1,o2,r3),
+; where o0 and o2 are two additional output registers
+%macro TRANSPOSE4_U64 6
+%define %%r0 %1 ; [in] ymm register for row 0 input (c0-c1 a1-a0)
+%define %%r1 %2 ; [in/out] ymm register for row 1 input (d0-d1 b1-b0) and output
+%define %%r2 %3 ; [in] ymm register for row 2 input (c3-c2 a3-a2)
+%define %%r3 %4 ; [in/out] ymm register for row 3 input (d3-d2 b3-b2) and output
+%define %%o0 %5 ; [out] ymm register for row 0 output
+%define %%o2 %6 ; [out] ymm register for row 2 output
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {c1 c0 a1 a0}
+; r1 = {d1 d0 b1 b0}
+; r2 = {c3 c2 a3 a2}
+; r3 = {d3 d2 b3 b2}
+;
+; output looks like: {o0 r1 o2 r3}
+; o0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; o2 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+	; vshufps does not cross the 128-bit lane boundary and hence is cheaper
+ vshufps %%o0, %%r0, %%r1, 0x44 ; o0 = {d0 c0 b0 a0}
+ vshufps %%r1, %%r0, %%r1, 0xEE ; r1 = {d1 d0 b1 b0}
+
+	vshufps	%%o2, %%r2, %%r3, 0x44	; o2 = {d2 c2 b2 a2}
+ vshufps %%r3, %%r2, %%r3, 0xEE ; r3 = {d3 c3 b3 a3}
+%endmacro
+
+%endif ;; _TRANSPOSE_AVX2_ASM_
diff --git a/src/spdk/intel-ipsec-mb/include/transpose_avx512.asm b/src/spdk/intel-ipsec-mb/include/transpose_avx512.asm
new file mode 100644
index 000000000..6937ceb00
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/transpose_avx512.asm
@@ -0,0 +1,497 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef _TRANSPOSE_AVX512_ASM_
+%define _TRANSPOSE_AVX512_ASM_
+
+%include "include/reg_sizes.asm"
+
+section .data
+default rel
+align 64
+PSHUFFLE_TRANSPOSE_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+align 64
+PSHUFFLE_TRANSPOSE_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+
+; LOAD FIRST 8 LANES FOR 16x16 32-BIT TRANSPOSE
+;
+; r0-r15 [out] zmm registers which will contain the data to be transposed
+; addr0-addr7 [in] pointers to the next 64-byte block of data to be fetched for the first 8 lanes
+; ptr_offset [in] offset to be applied on all pointers (addr0-addr7)
+%macro TRANSPOSE16_U32_LOAD_FIRST8 25
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%addr0 %17
+%define %%addr1 %18
+%define %%addr2 %19
+%define %%addr3 %20
+%define %%addr4 %21
+%define %%addr5 %22
+%define %%addr6 %23
+%define %%addr7 %24
+%define %%ptr_offset %25
+
+; Expected output data
+;
+; r0 = {X X X X X X X X a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {X X X X X X X X b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {X X X X X X X X c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {X X X X X X X X d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {X X X X X X X X e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {X X X X X X X X f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {X X X X X X X X g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {X X X X X X X X h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {X X X X X X X X a15 a14 a13 a12 a11 a10 a9 a8}
+; r9 = {X X X X X X X X b15 b14 b13 b12 b11 b10 b9 b8}
+; r10 = {X X X X X X X X c15 c14 c13 c12 c11 c10 c9 c8}
+; r11 = {X X X X X X X X d15 d14 d13 d12 d11 d10 d9 d8}
+; r12 = {X X X X X X X X e15 e14 e13 e12 e11 e10 e9 e8}
+; r13 = {X X X X X X X X f15 f14 f13 f12 f11 f10 f9 f8}
+; r14 = {X X X X X X X X g15 g14 g13 g12 g11 g10 g9 g8}
+; r15 = {X X X X X X X X h15 h14 h13 h12 h11 h10 h9 h8}
+ vmovups YWORD(%%r0),[%%addr0+%%ptr_offset]
+ vmovups YWORD(%%r1),[%%addr1+%%ptr_offset]
+ vmovups YWORD(%%r2),[%%addr2+%%ptr_offset]
+ vmovups YWORD(%%r3),[%%addr3+%%ptr_offset]
+ vmovups YWORD(%%r4),[%%addr4+%%ptr_offset]
+ vmovups YWORD(%%r5),[%%addr5+%%ptr_offset]
+ vmovups YWORD(%%r6),[%%addr6+%%ptr_offset]
+ vmovups YWORD(%%r7),[%%addr7+%%ptr_offset]
+ vmovups YWORD(%%r8),[%%addr0+%%ptr_offset+32]
+ vmovups YWORD(%%r9),[%%addr1+%%ptr_offset+32]
+ vmovups YWORD(%%r10),[%%addr2+%%ptr_offset+32]
+ vmovups YWORD(%%r11),[%%addr3+%%ptr_offset+32]
+ vmovups YWORD(%%r12),[%%addr4+%%ptr_offset+32]
+ vmovups YWORD(%%r13),[%%addr5+%%ptr_offset+32]
+ vmovups YWORD(%%r14),[%%addr6+%%ptr_offset+32]
+ vmovups YWORD(%%r15),[%%addr7+%%ptr_offset+32]
+
+%endmacro
+
+; LOAD LAST 8 LANES FOR 16x16 32-BIT TRANSPOSE
+;
+; r0-r15 [in/out] zmm registers which will contain the data to be transposed
+; addr0-addr7 [in] pointers to the next 64-byte block of data to be fetched for the last 8 lanes
+; ptr_offset [in] offset to be applied on all pointers (addr0-addr7)
+%macro TRANSPOSE16_U32_LOAD_LAST8 25
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%addr0 %17
+%define %%addr1 %18
+%define %%addr2 %19
+%define %%addr3 %20
+%define %%addr4 %21
+%define %%addr5 %22
+%define %%addr6 %23
+%define %%addr7 %24
+%define %%ptr_offset %25
+
+; Expected output data
+;
+; r0 = {i7 i6 i5 i4 i3 i2 i1 i0 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {j7 j6 j5 j4 j3 j2 j1 j0 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {k7 k6 k5 k4 k3 k2 k1 k0 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {l7 l6 l5 l4 l3 l2 l1 l0 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {m7 m6 m5 m4 m3 m2 m1 m0 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {n7 n6 n5 n4 n3 n2 n1 n0 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {o7 o6 o5 o4 o3 o2 o1 o0 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {p7 p6 p5 p4 p3 p2 p1 p0 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 a15 a14 a13 a12 a11 a10 a9 a8}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 b15 b14 b13 b12 b11 b10 b9 b8}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 c15 c14 c13 c12 c11 c10 c9 c8}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 d15 d14 d13 d12 d11 d10 d9 d8}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 e15 e14 e13 e12 e11 e10 e9 e8}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 f15 f14 f13 f12 f11 f10 f9 f8}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 g15 g14 g13 g12 g11 g10 g9 g8}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 h15 h14 h13 h12 h11 h10 h9 h8}
+
+ vinserti64x4 %%r0, %%r0, [%%addr0+%%ptr_offset], 0x01
+ vinserti64x4 %%r1, %%r1, [%%addr1+%%ptr_offset], 0x01
+ vinserti64x4 %%r2, %%r2, [%%addr2+%%ptr_offset], 0x01
+ vinserti64x4 %%r3, %%r3, [%%addr3+%%ptr_offset], 0x01
+ vinserti64x4 %%r4, %%r4, [%%addr4+%%ptr_offset], 0x01
+ vinserti64x4 %%r5, %%r5, [%%addr5+%%ptr_offset], 0x01
+ vinserti64x4 %%r6, %%r6, [%%addr6+%%ptr_offset], 0x01
+ vinserti64x4 %%r7, %%r7, [%%addr7+%%ptr_offset], 0x01
+ vinserti64x4 %%r8, %%r8, [%%addr0+%%ptr_offset+32], 0x01
+ vinserti64x4 %%r9, %%r9, [%%addr1+%%ptr_offset+32], 0x01
+ vinserti64x4 %%r10, %%r10, [%%addr2+%%ptr_offset+32], 0x01
+ vinserti64x4 %%r11, %%r11, [%%addr3+%%ptr_offset+32], 0x01
+ vinserti64x4 %%r12, %%r12, [%%addr4+%%ptr_offset+32], 0x01
+ vinserti64x4 %%r13, %%r13, [%%addr5+%%ptr_offset+32], 0x01
+ vinserti64x4 %%r14, %%r14, [%%addr6+%%ptr_offset+32], 0x01
+ vinserti64x4 %%r15, %%r15, [%%addr7+%%ptr_offset+32], 0x01
+
+%endmacro
+
+; 16x16 32-BIT TRANSPOSE
+;
+; Before calling this macro, TRANSPOSE16_U32_LOAD_FIRST8 and TRANSPOSE16_U32_LOAD_LAST8
+; must be called.
+;
+; r0-r7 [in/out] zmm registers containing bytes 0-31 of each 64B block (e.g. zmm0 = [i7-i0 a7-a0])
+; r8-r15 [in/out] zmm registers containing bytes 32-63 of each 64B block (e.g. zmm8 = [i15-i8 a15-a8])
+; t0-t1 [clobbered] zmm temporary registers
+; m0-m1 [clobbered] zmm registers for shuffle mask storing
+%macro TRANSPOSE16_U32 20
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+%define %%m0 %19
+%define %%m1 %20
+
+; Input data
+;
+; r0 = {i7 i6 i5 i4 i3 i2 i1 i0 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {j7 j6 j5 j4 j3 j2 j1 j0 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {k7 k6 k5 k4 k3 k2 k1 k0 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {l7 l6 l5 l4 l3 l2 l1 l0 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {m7 m6 m5 m4 m3 m2 m1 m0 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {n7 n6 n5 n4 n3 n2 n1 n0 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {o7 o6 o5 o4 o3 o2 o1 o0 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {p7 p6 p5 p4 p3 p2 p1 p0 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 a15 a14 a13 a12 a11 a10 a9 a8}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 b15 b14 b13 b12 b11 b10 b9 b8}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 c15 c14 c13 c12 c11 c10 c9 c8}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 d15 d14 d13 d12 d11 d10 d9 d8}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 e15 e14 e13 e12 e11 e10 e9 e8}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 f15 f14 f13 f12 f11 f10 f9 f8}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 g15 g14 g13 g12 g11 g10 g9 g8}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 h15 h14 h13 h12 h11 h10 h9 h8}
+
+; Expected output data
+;
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+
+
+ ; process first 4 rows (r0..r3)
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {j5 j4 i5 i4 j1 j0 i1 i0 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {j7 j6 i7 i6 j3 j2 i3 i2 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {l5 l4 k5 k4 l1 l0 k1 k0 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {l7 l6 k7 k6 l3 l2 k3 k2 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {l5 k5 j5 i5 l1 k1 j1 i1 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {l6 k6 j6 i6 l2 k2 j2 i2 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {l7 k7 j7 i7 l3 k3 j3 i3 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {l4 k4 j4 i4 l0 k0 j0 i0 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; Load permute masks
+ vmovdqa64 %%m0, [PSHUFFLE_TRANSPOSE_MASK1]
+ vmovdqa64 %%m1, [PSHUFFLE_TRANSPOSE_MASK2]
+
+ ; process second 4 rows (r4..r7)
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {n5 n4 m5 m4 n1 n0 m1 m0 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {n7 n6 m7 m6 n3 n2 m3 m2 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {p5 p4 o5 o4 p1 p0 o1 o0 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {p7 p6 o7 o6 p3 p2 o3 o2 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {p5 o5 n5 m5 p1 o1 n1 m1 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {p6 o6 n6 m6 p2 o2 n2 m2 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {p7 o7 n7 m7 p3 o3 n3 m3 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {p4 o4 n4 m4 p0 o0 n0 m0 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; process third 4 rows (r8..r11)
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 b13 b12 a13 a12 b9 b8 a9 a8 }
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 b15 b14 a15 a14 b11 b10 a11 a10}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 d13 d12 c13 c12 d9 d8 c9 c8 }
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 d15 d14 c15 c14 d11 d10 c11 c10}
+
+ vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 d13 c13 b13 a13 d9 c9 b9 a9 }
+ vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 d14 c14 b14 a14 d10 c10 b10 a10}
+ vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 d15 c15 b15 a15 d11 c11 b11 a11}
+ vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 d12 c12 b12 a12 d8 c8 b8 a8 }
+
+ ; process fourth 4 rows (r12..r15)
+ vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 f13 f12 e13 e12 f9 f8 e9 e8 }
+ vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 f15 f14 e15 e14 f11 f10 e11 e10}
+ vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 h13 h12 g13 g12 h9 h8 g9 g8 }
+ vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 h15 h14 g15 g14 h11 h10 g11 g10}
+
+ vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 h13 g13 f13 e13 h9 g9 f9 e9 }
+ vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 h14 g14 f14 e14 h10 g10 f10 e10}
+ vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 h15 g15 f15 e15 h11 g11 f11 e11}
+ vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 h12 g12 f12 e12 h8 g8 f8 e8 }
+
+ ; perform final shuffles on bottom half, producing r8-r15
+ vmovdqu32 %%t1, %%m0
+ vpermi2q %%t1, %%r9, %%r13 ; t1 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vmovdqu32 %%r14, %%m1
+ vpermi2q %%r14, %%r9, %%r13 ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+
+ vmovdqu32 %%r9, %%m0
+ vpermi2q %%r9, %%r11, %%r15 ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vmovdqu32 %%r13, %%m1
+ vpermi2q %%r13, %%r11, %%r15 ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+
+ vmovdqu32 %%r11, %%m0
+ vpermi2q %%r11, %%r8, %%r12 ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vmovdqu32 %%r15, %%m1
+ vpermi2q %%r15, %%r8, %%r12 ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+
+ vmovdqu32 %%r8, %%m0
+ vpermi2q %%r8, %%r6, %%r10 ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vmovdqu32 %%r12, %%m1
+ vpermi2q %%r12, %%r6, %%r10 ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+
+ vmovdqu32 %%r10, %%t1 ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+
+ ; perform final shuffles on top half, producing r0-r7
+ vmovdqu32 %%t1, %%m0
+ vpermi2q %%t1, %%r1, %%r5 ; t1 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqu32 %%r6, %%m1
+ vpermi2q %%r6, %%r1, %%r5 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqu32 %%r1, %%m0
+ vpermi2q %%r1, %%r3, %%r7 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqu32 %%r5, %%m1
+ vpermi2q %%r5, %%r3, %%r7 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqu32 %%r3, %%m0
+ vpermi2q %%r3, %%r0, %%r4 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqu32 %%r7, %%m1
+ vpermi2q %%r7, %%r0, %%r4 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqu32 %%r0, %%m0
+ vpermi2q %%r0, %%t0, %%r2 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqu32 %%r4, %%m1
+ vpermi2q %%r4, %%t0, %%r2 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqu32 %%r2, %%t1 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+%endmacro
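+; Illustrative usage sketch (hypothetical register assignment; the 16 lane
+; pointers would normally come from a per-lane pointer array):
+;
+;     TRANSPOSE16_U32_LOAD_FIRST8 zmm0, ..., zmm15, <ptrs lanes 0-7>,  0
+;     TRANSPOSE16_U32_LOAD_LAST8  zmm0, ..., zmm15, <ptrs lanes 8-15>, 0
+;     TRANSPOSE16_U32             zmm0, ..., zmm15, zmm16, zmm17, zmm18, zmm19
+;
+; zmm0..zmm15 then hold rows 0..15 of the transposed 16x16 32-bit matrix.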
+
+; LOAD ALL 8 LANES FOR 8x8 64-BIT TRANSPOSE
+;
+; r0-r7 [out] zmm registers which will contain the data to be transposed
+; addr0-addr7 [in] pointers to the next 64-byte block of data to be fetched for all 8 lanes
+; ptr_offset [in] offset to be applied on all pointers (addr0-addr7)
+%macro TRANSPOSE8_U64_LOAD8 17
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%addr0 %9
+%define %%addr1 %10
+%define %%addr2 %11
+%define %%addr3 %12
+%define %%addr4 %13
+%define %%addr5 %14
+%define %%addr6 %15
+%define %%addr7 %16
+%define %%ptr_offset %17
+
+; Expected output data
+;
+; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
+; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
+; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
+; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
+; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
+; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
+; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
+
+ vmovups YWORD(%%r0),[%%addr0+%%ptr_offset]
+ vmovups YWORD(%%r1),[%%addr1+%%ptr_offset]
+ vmovups YWORD(%%r2),[%%addr2+%%ptr_offset]
+ vmovups YWORD(%%r3),[%%addr3+%%ptr_offset]
+ vmovups YWORD(%%r4),[%%addr0+%%ptr_offset+32]
+ vmovups YWORD(%%r5),[%%addr1+%%ptr_offset+32]
+ vmovups YWORD(%%r6),[%%addr2+%%ptr_offset+32]
+ vmovups YWORD(%%r7),[%%addr3+%%ptr_offset+32]
+
+ vinserti64x4 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01
+ vinserti64x4 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01
+ vinserti64x4 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01
+ vinserti64x4 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01
+ vinserti64x4 %%r4, %%r4, [%%addr4+%%ptr_offset+32], 0x01
+ vinserti64x4 %%r5, %%r5, [%%addr5+%%ptr_offset+32], 0x01
+ vinserti64x4 %%r6, %%r6, [%%addr6+%%ptr_offset+32], 0x01
+ vinserti64x4 %%r7, %%r7, [%%addr7+%%ptr_offset+32], 0x01
+
+%endmacro
+
+; 8x8 64-BIT TRANSPOSE
+;
+; Before calling this macro, TRANSPOSE8_U64_LOAD8 must be called.
+;
+; r0-r3 [in/out] zmm registers containing bytes 0-31 of each 64B block (e.g. zmm0 = [e3-e0 a3-a0])
+; r4-r7 [in/out] zmm registers containing bytes 32-63 of each 64B block (e.g. zmm4 = [e4-e7 a4-a7])
+; t0-t1 [clobbered] zmm temporary registers
+; PERM_INDEX1-2 [clobbered] zmm registers for shuffle mask storing
+%macro TRANSPOSE8_U64 12
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+%define %%PERM_INDEX1 %11
+%define %%PERM_INDEX2 %12
+
+; each element x(i) is 64 bits; 8 * 64 = 512 bits fill a full zmm register (8 qwords)
+
+; Input data
+;
+; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
+; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
+; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
+; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
+; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
+; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
+; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
+;
+; Expected output data
+;
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+
+	;; load the permute indices; these registers are not clobbered below
+	vmovdqa32 %%PERM_INDEX1, [PSHUFFLE_TRANSPOSE_MASK1]
+	vmovdqa32 %%PERM_INDEX2, [PSHUFFLE_TRANSPOSE_MASK2]
+
+ ; process top half (r0..r3)
+ vshufpd %%t0, %%r0, %%r1, 0x00 ; t0 = {f2 e2 f0 e0 b2 a2 b0 a0}
+	vshufpd	%%r1, %%r0, %%r1, 0xFF	; r1 = {f3 e3 f1 e1 b3 a3 b1 a1}
+ vshufpd %%t1, %%r2, %%r3, 0x00 ; t1 = {h2 g2 h0 g0 d2 c2 d0 c0}
+ vshufpd %%r2, %%r2, %%r3, 0xFF ; r2 = {h3 g3 h1 g1 d3 c3 d1 c1}
+
+ vmovdqa32 %%r3, %%r1
+ vpermt2q %%r1, %%PERM_INDEX1,%%r2 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+ vpermt2q %%r3, %%PERM_INDEX2,%%r2 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vmovdqa32 %%r0, %%t0
+ vmovdqa32 %%r2, %%t0
+ vpermt2q %%r0, %%PERM_INDEX1,%%t1 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+ vpermt2q %%r2, %%PERM_INDEX2,%%t1 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+
+	; process bottom half (r4..r7)
+	vshufpd	%%t0, %%r4, %%r5, 0x00	; t0 = {f6 e6 f4 e4 b6 a6 b4 a4}
+	vshufpd	%%r5, %%r4, %%r5, 0xFF	; r5 = {f7 e7 f5 e5 b7 a7 b5 a5}
+	vshufpd	%%t1, %%r6, %%r7, 0x00	; t1 = {h6 g6 h4 g4 d6 c6 d4 c4}
+	vshufpd	%%r6, %%r6, %%r7, 0xFF	; r6 = {h7 g7 h5 g5 d7 c7 d5 c5}
+
+ vmovdqa32 %%r7, %%r5
+ vpermt2q %%r5, %%PERM_INDEX1,%%r6 ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+ vpermt2q %%r7, %%PERM_INDEX2,%%r6 ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r4, %%t0
+ vmovdqa32 %%r6, %%t0
+ vpermt2q %%r4, %%PERM_INDEX1,%%t1 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+ vpermt2q %%r6, %%PERM_INDEX2,%%t1 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+%endmacro
+
+%endif ;; _TRANSPOSE_AVX512_ASM_
diff --git a/src/spdk/intel-ipsec-mb/include/wireless_common.asm b/src/spdk/intel-ipsec-mb/include/wireless_common.asm
new file mode 100644
index 000000000..811c2c256
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/wireless_common.asm
@@ -0,0 +1,128 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+
+section .data
+default rel
+align 16
+swap_mask:
+db 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04
+db 0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c
+
+section .text
+
+; Function which XOR's 64 bytes of the input buffer with 64 bytes of the
+; KeyStream, placing the result in the output buffer.
+; KeyStream bytes are byte-swapped within each 32-bit word before the XOR
+%macro xor_keystream 1
+%define %%SIMDTYPE %1 ; "SSE" or "AVX"
+
+%ifidn %%SIMDTYPE, AVX
+ %define %%MOVDQU vmovdqu
+ %define %%MOVDQA vmovdqa
+ %define %%PXOR vpxor
+ %define %%PSHUFB vpshufb
+%else
+ %define %%MOVDQU movdqu
+ %define %%MOVDQA movdqa
+ %define %%PXOR pxor
+ %define %%PSHUFB pshufb
+%endif
+%ifdef LINUX
+ %define %%pIn rdi
+ %define %%pOut rsi
+ %define %%pKS rdx
+%else
+ %define %%pIn rcx
+ %define %%pOut rdx
+ %define %%pKS r8
+
+ mov rax, rsp
+ sub rsp, 48
+ and rsp, ~15
+ %%MOVDQA [rsp], xmm6
+ %%MOVDQA [rsp + 16], xmm7
+ %%MOVDQA [rsp + 32], xmm8
+%endif
+ %define XKEY0 xmm0
+ %define XKEY1 xmm1
+ %define XKEY2 xmm2
+ %define XKEY3 xmm3
+ %define XIN0 xmm4
+ %define XIN1 xmm5
+ %define XIN2 xmm6
+ %define XIN3 xmm7
+ %define XSHUF xmm8
+
+ %%MOVDQA XSHUF, [rel swap_mask]
+ %%MOVDQA XKEY0, [%%pKS]
+ %%MOVDQA XKEY1, [%%pKS + 16]
+ %%MOVDQA XKEY2, [%%pKS + 32]
+ %%MOVDQA XKEY3, [%%pKS + 48]
+
+ %%PSHUFB XKEY0, XSHUF
+ %%PSHUFB XKEY1, XSHUF
+ %%PSHUFB XKEY2, XSHUF
+ %%PSHUFB XKEY3, XSHUF
+
+ %%MOVDQU XIN0, [%%pIn]
+ %%MOVDQU XIN1, [%%pIn + 16]
+ %%MOVDQU XIN2, [%%pIn + 32]
+ %%MOVDQU XIN3, [%%pIn + 48]
+
+ %%PXOR XKEY0, XIN0
+ %%PXOR XKEY1, XIN1
+ %%PXOR XKEY2, XIN2
+ %%PXOR XKEY3, XIN3
+
+ %%MOVDQU [%%pOut], XKEY0
+ %%MOVDQU [%%pOut + 16], XKEY1
+ %%MOVDQU [%%pOut + 32], XKEY2
+ %%MOVDQU [%%pOut + 48], XKEY3
+
+%ifndef LINUX
+ %%MOVDQA xmm6, [rsp]
+ %%MOVDQA xmm7, [rsp + 16]
+ %%MOVDQA xmm8, [rsp + 32]
+ mov rsp,rax
+%endif
+%endmacro
+
+MKGLOBAL(asm_XorKeyStream64B_avx,function,internal)
+asm_XorKeyStream64B_avx:
+ xor_keystream AVX
+ ret
+
+MKGLOBAL(asm_XorKeyStream64B_sse,function,internal)
+asm_XorKeyStream64B_sse:
+ xor_keystream SSE
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/include/wireless_common.h b/src/spdk/intel-ipsec-mb/include/wireless_common.h
new file mode 100644
index 000000000..a0ba60019
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/wireless_common.h
@@ -0,0 +1,216 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#ifndef _WIRELESS_COMMON_H_
+#define _WIRELESS_COMMON_H_
+
+#include <string.h>
+#ifdef LINUX
+#include <x86intrin.h>
+#else
+#include <intrin.h>
+#endif
+
+#define NUM_PACKETS_1 1
+#define NUM_PACKETS_2 2
+#define NUM_PACKETS_3 3
+#define NUM_PACKETS_4 4
+#define NUM_PACKETS_8 8
+#define NUM_PACKETS_16 16
+
+#ifdef LINUX
+#define BSWAP32 __builtin_bswap32
+#define BSWAP64 __builtin_bswap64
+#else
+#define BSWAP32 _byteswap_ulong
+#define BSWAP64 _byteswap_uint64
+#endif
+
+typedef union _m128_u {
+ uint8_t byte[16];
+ uint16_t word[8];
+ uint32_t dword[4];
+ uint64_t qword[2];
+ __m128i m;
+} m128_t;
+
+typedef union _m64_u {
+ uint8_t byte[8];
+ uint16_t word[4];
+ uint32_t dword[2];
+ uint64_t m;
+} m64_t;
+
+static inline uint32_t bswap4(const uint32_t val)
+{
+ return ((val >> 24) | /**< A*/
+ ((val & 0xff0000) >> 8) | /**< B*/
+ ((val & 0xff00) << 8) | /**< C*/
+ (val << 24)); /**< D*/
+}
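+/*
+ * Worked example for bswap4() above:
+ *   bswap4(0x11223344) == 0x44332211 (full byte reverse of the 32-bit word)
+ */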
+
+/*************************************************************************
+* @description - this function is used to copy the right number of bytes
+* from the source to destination buffer
+*
+* @param pSrc [IN] - pointer to an input Byte array (at least len bytes
+* available)
+* @param pDst [IN] - pointer to the output buffer (at least len bytes available)
+* @param len [IN] - length in bytes to copy (0 to 4)
+*
+*************************************************************************/
+static inline void memcpy_keystream_32(uint8_t *pDst,
+ const uint8_t *pSrc,
+ const uint32_t len)
+{
+ switch (len) {
+ case 4:
+ *(uint32_t *)pDst = *(const uint32_t *)pSrc;
+ break;
+ case 3:
+ pDst[2] = pSrc[2];
+ /* fall-through */
+ case 2:
+ pDst[1] = pSrc[1];
+ /* fall-through */
+ case 1:
+ pDst[0] = pSrc[0];
+ /* fall-through */
+ }
+}
+
+/*************************************************************************
+* @description - this function is used to XOR the right number of bytes
+*    from a keystream and a source buffer into a destination buffer
+*
+* @param pSrc [IN] - pointer to an input Byte array (at least 4 bytes available)
+* @param pDst [IN] - pointer to the output buffer (at least 4 bytes available)
+* @param KS [IN] - 4 bytes of keystream; reversed into network byte order
+*    before the XOR
+*
+*************************************************************************/
+static inline void xor_keystream_reverse_32(uint8_t *pDst,
+ const uint8_t *pSrc,
+ const uint32_t KS)
+{
+ *(uint32_t *)pDst = (*(const uint32_t *)pSrc) ^ BSWAP32(KS);
+}
+
+/******************************************************************************
+ * @description - this function XORs 8 bytes of input with the byte-reversed
+ *                keystream word and returns the advanced source pointer
+ * @param pSrc [IN] - pointer to an input Byte array (at least 8 bytes
+ * available)
+ * @param pDst [IN] - pointer to the output buffer (at least 8 bytes available)
+ * @param keyStream [IN] - the Keystream value (8 bytes)
+ ******************************************************************************/
+static inline const uint8_t *
+xor_keystrm_rev(uint8_t *pDst, const uint8_t *pSrc, uint64_t keyStream)
+{
+ /* default: XOR ONLY, read the input buffer, update the output buffer */
+ const uint64_t *pSrc64 = (const uint64_t *)pSrc;
+ uint64_t *pDst64 = (uint64_t *)pDst;
+ *pDst64 = *pSrc64 ^ BSWAP64(keyStream);
+ return (const uint8_t *)(pSrc64 + 1);
+}
+
+/******************************************************************************
+ * @description - this function is used to copy the right number of bytes
+ * from the source to destination buffer
+ * @param pSrc [IN] - pointer to an input Byte array (at least len bytes
+ * available)
+ * @param pDst [IN] - pointer to the output buffer (at least len bytes
+ * available)
+ * @param len [IN] - length in bytes to copy
+ ******************************************************************************/
+static inline void
+memcpy_keystrm(uint8_t *pDst, const uint8_t *pSrc, const uint32_t len)
+{
+ switch (len) {
+ case 8:
+ *(uint64_t *)pDst = *(const uint64_t *)pSrc;
+ break;
+ case 7:
+ pDst[6] = pSrc[6];
+ /* fall-through */
+ case 6:
+ pDst[5] = pSrc[5];
+ /* fall-through */
+ case 5:
+ pDst[4] = pSrc[4];
+ /* fall-through */
+ case 4:
+ *(uint32_t *)pDst = *(const uint32_t *)pSrc;
+ break;
+ case 3:
+ pDst[2] = pSrc[2];
+ /* fall-through */
+ case 2:
+ pDst[1] = pSrc[1];
+ /* fall-through */
+ case 1:
+ pDst[0] = pSrc[0];
+ /* fall-through */
+ }
+}
+
+/**
+ ******************************************************************************
+ *
+ * @description
+ * Definition of the external SSE function that XOR's 64 bytes of input
+ * with 64 bytes of keystream, swapping keystream bytes every 4 bytes.
+ *
+ * @param[in] pIn Pointer to the input buffer
+ * @param[out] pOut Pointer to the output buffer
+ * @param[in] pKey Pointer to the new 64 byte keystream
+ *
+ * @pre
+ * None
+ *
+ *****************************************************************************/
+IMB_DLL_LOCAL void asm_XorKeyStream64B_sse(const void *pIn, void *pOut,
+ const void *pKey);
+
+/**
+ ******************************************************************************
+ *
+ * @description
+ * Definition of the external AVX function that XOR's 64 bytes of input
+ * with 64 bytes of keystream, swapping keystream bytes every 4 bytes.
+ *
+ * @param[in] pIn Pointer to the input buffer
+ * @param[out] pOut Pointer to the output buffer
+ * @param[in] pKey Pointer to the new 64 byte keystream
+ *
+ * @pre
+ * None
+ *
+ *****************************************************************************/
+IMB_DLL_LOCAL void asm_XorKeyStream64B_avx(const void *pIn, void *pOut,
+ const void *pKey);
+
+#endif /* _WIRELESS_COMMON_H_ */
diff --git a/src/spdk/intel-ipsec-mb/include/zuc_common.asm b/src/spdk/intel-ipsec-mb/include/zuc_common.asm
new file mode 100644
index 000000000..4b9cdd3ec
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/zuc_common.asm
@@ -0,0 +1,740 @@
+;;
+;; Copyright (c) 2009-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+
+extern lookup_8bit_sse
+
+
+section .data
+default rel
+align 64
+S0:
+db 0x3e,0x72,0x5b,0x47,0xca,0xe0,0x00,0x33,0x04,0xd1,0x54,0x98,0x09,0xb9,0x6d,0xcb
+db 0x7b,0x1b,0xf9,0x32,0xaf,0x9d,0x6a,0xa5,0xb8,0x2d,0xfc,0x1d,0x08,0x53,0x03,0x90
+db 0x4d,0x4e,0x84,0x99,0xe4,0xce,0xd9,0x91,0xdd,0xb6,0x85,0x48,0x8b,0x29,0x6e,0xac
+db 0xcd,0xc1,0xf8,0x1e,0x73,0x43,0x69,0xc6,0xb5,0xbd,0xfd,0x39,0x63,0x20,0xd4,0x38
+db 0x76,0x7d,0xb2,0xa7,0xcf,0xed,0x57,0xc5,0xf3,0x2c,0xbb,0x14,0x21,0x06,0x55,0x9b
+db 0xe3,0xef,0x5e,0x31,0x4f,0x7f,0x5a,0xa4,0x0d,0x82,0x51,0x49,0x5f,0xba,0x58,0x1c
+db 0x4a,0x16,0xd5,0x17,0xa8,0x92,0x24,0x1f,0x8c,0xff,0xd8,0xae,0x2e,0x01,0xd3,0xad
+db 0x3b,0x4b,0xda,0x46,0xeb,0xc9,0xde,0x9a,0x8f,0x87,0xd7,0x3a,0x80,0x6f,0x2f,0xc8
+db 0xb1,0xb4,0x37,0xf7,0x0a,0x22,0x13,0x28,0x7c,0xcc,0x3c,0x89,0xc7,0xc3,0x96,0x56
+db 0x07,0xbf,0x7e,0xf0,0x0b,0x2b,0x97,0x52,0x35,0x41,0x79,0x61,0xa6,0x4c,0x10,0xfe
+db 0xbc,0x26,0x95,0x88,0x8a,0xb0,0xa3,0xfb,0xc0,0x18,0x94,0xf2,0xe1,0xe5,0xe9,0x5d
+db 0xd0,0xdc,0x11,0x66,0x64,0x5c,0xec,0x59,0x42,0x75,0x12,0xf5,0x74,0x9c,0xaa,0x23
+db 0x0e,0x86,0xab,0xbe,0x2a,0x02,0xe7,0x67,0xe6,0x44,0xa2,0x6c,0xc2,0x93,0x9f,0xf1
+db 0xf6,0xfa,0x36,0xd2,0x50,0x68,0x9e,0x62,0x71,0x15,0x3d,0xd6,0x40,0xc4,0xe2,0x0f
+db 0x8e,0x83,0x77,0x6b,0x25,0x05,0x3f,0x0c,0x30,0xea,0x70,0xb7,0xa1,0xe8,0xa9,0x65
+db 0x8d,0x27,0x1a,0xdb,0x81,0xb3,0xa0,0xf4,0x45,0x7a,0x19,0xdf,0xee,0x78,0x34,0x60
+
+S1:
+db 0x55,0xc2,0x63,0x71,0x3b,0xc8,0x47,0x86,0x9f,0x3c,0xda,0x5b,0x29,0xaa,0xfd,0x77
+db 0x8c,0xc5,0x94,0x0c,0xa6,0x1a,0x13,0x00,0xe3,0xa8,0x16,0x72,0x40,0xf9,0xf8,0x42
+db 0x44,0x26,0x68,0x96,0x81,0xd9,0x45,0x3e,0x10,0x76,0xc6,0xa7,0x8b,0x39,0x43,0xe1
+db 0x3a,0xb5,0x56,0x2a,0xc0,0x6d,0xb3,0x05,0x22,0x66,0xbf,0xdc,0x0b,0xfa,0x62,0x48
+db 0xdd,0x20,0x11,0x06,0x36,0xc9,0xc1,0xcf,0xf6,0x27,0x52,0xbb,0x69,0xf5,0xd4,0x87
+db 0x7f,0x84,0x4c,0xd2,0x9c,0x57,0xa4,0xbc,0x4f,0x9a,0xdf,0xfe,0xd6,0x8d,0x7a,0xeb
+db 0x2b,0x53,0xd8,0x5c,0xa1,0x14,0x17,0xfb,0x23,0xd5,0x7d,0x30,0x67,0x73,0x08,0x09
+db 0xee,0xb7,0x70,0x3f,0x61,0xb2,0x19,0x8e,0x4e,0xe5,0x4b,0x93,0x8f,0x5d,0xdb,0xa9
+db 0xad,0xf1,0xae,0x2e,0xcb,0x0d,0xfc,0xf4,0x2d,0x46,0x6e,0x1d,0x97,0xe8,0xd1,0xe9
+db 0x4d,0x37,0xa5,0x75,0x5e,0x83,0x9e,0xab,0x82,0x9d,0xb9,0x1c,0xe0,0xcd,0x49,0x89
+db 0x01,0xb6,0xbd,0x58,0x24,0xa2,0x5f,0x38,0x78,0x99,0x15,0x90,0x50,0xb8,0x95,0xe4
+db 0xd0,0x91,0xc7,0xce,0xed,0x0f,0xb4,0x6f,0xa0,0xcc,0xf0,0x02,0x4a,0x79,0xc3,0xde
+db 0xa3,0xef,0xea,0x51,0xe6,0x6b,0x18,0xec,0x1b,0x2c,0x80,0xf7,0x74,0xe7,0xff,0x21
+db 0x5a,0x6a,0x54,0x1e,0x41,0x31,0x92,0x35,0xc4,0x33,0x07,0x0a,0xba,0x7e,0x0e,0x34
+db 0x88,0xb1,0x98,0x7c,0xf3,0x3d,0x60,0x6c,0x7b,0xca,0xd3,0x1f,0x32,0x65,0x04,0x28
+db 0x64,0xbe,0x85,0x9b,0x2f,0x59,0x8a,0xd7,0xb0,0x25,0xac,0xaf,0x12,0x03,0xe2,0xf2
+
+EK_d:
+dw 0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF,
+dw 0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .text
+
+%define OFFSET_FR1 (16*4)
+%define OFFSET_FR2 (17*4)
+%define OFFSET_BRC_X0 (18*4)
+%define OFFSET_BRC_X1 (19*4)
+%define OFFSET_BRC_X2 (20*4)
+%define OFFSET_BRC_X3 (21*4)
+
+;
+; BITS_REORG()
+;
+; params
+; %1 - round number
+; uses
+; eax, ebx, ecx, edx
+; return
+; updates r12d, r13d, r14d, r15d
+;
+%macro BITS_REORG 1
+ ;
+ ; r12d = LFSR_S15
+ ; eax = LFSR_S14
+ ; r13d = LFSR_S11
+ ; ebx = LFSR_S9
+ ; r14d = LFSR_S7
+ ; ecx = LFSR_S5
+ ; r15d = LFSR_S2
+ ; edx = LFSR_S0
+
+ mov r12d, [rsi + ((15 + %1) % 16)*4]
+ mov eax, [rsi + ((14 + %1) % 16)*4]
+ mov r13d, [rsi + ((11 + %1) % 16)*4]
+ mov ebx, [rsi + (( 9 + %1) % 16)*4]
+ mov r14d, [rsi + (( 7 + %1) % 16)*4]
+ mov ecx, [rsi + (( 5 + %1) % 16)*4]
+ mov r15d, [rsi + (( 2 + %1) % 16)*4]
+ mov edx, [rsi + (( 0 + %1) % 16)*4]
+
+ shr r12d, 15
+ shl eax, 16
+ shl ebx, 1
+ shl ecx, 1
+ shl edx, 1
+ shld r12d, eax, 16 ; BRC_X0
+ shld r13d, ebx, 16 ; BRC_X1
+ shld r14d, ecx, 16 ; BRC_X2
+ shld r15d, edx, 16 ; BRC_X3
+%endmacro
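
The macro above implements the ZUC bit-reorganization step. A rough C model of the same extraction (our own sketch, written for round 0; the macro instead rotates the LFSR indices by the round number):

#include <stdint.h>

/* Illustrative C model of BITS_REORG. Each lfsr[] entry is a 31-bit
 * value; H = bits 30..15, L = bits 15..0 of that value. */
static void bits_reorg_ref(const uint32_t lfsr[16], uint32_t *x0,
                           uint32_t *x1, uint32_t *x2, uint32_t *x3)
{
        *x0 = ((lfsr[15] >> 15) << 16) | (lfsr[14] & 0xFFFF);   /* s15H || s14L */
        *x1 = ((lfsr[11] & 0xFFFF) << 16) | (lfsr[9] >> 15);    /* s11L || s9H  */
        *x2 = ((lfsr[7] & 0xFFFF) << 16) | (lfsr[5] >> 15);     /* s7L  || s5H  */
        *x3 = ((lfsr[2] & 0xFFFF) << 16) | (lfsr[0] >> 15);     /* s2L  || s0H  */
}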
+
+%macro lookup_single_sbox 3
+%define %%table %1 ; [in] Pointer to table to look up
+%define %%idx %2 ; [in] Index to look up
+%define %%value %3 ; [out] Returned value from lookup function (rcx, rdx, r8, r9)
+
+%ifdef SAFE_LOOKUP
+ ;; Save all registers used in lookup_8bit (xmm0-5, r9,r10)
+ ;; and registers for param passing and return (4 regs, OS dependent)
+ ;; (6*16 + 6*8 = 144 bytes)
+ sub rsp, 144
+
+ movdqu [rsp], xmm0
+ movdqu [rsp + 16], xmm1
+ movdqu [rsp + 32], xmm2
+ movdqu [rsp + 48], xmm3
+ movdqu [rsp + 64], xmm4
+ movdqu [rsp + 80], xmm5
+ mov [rsp + 96], r9
+ mov [rsp + 104], r10
+
+%ifdef LINUX
+ mov [rsp + 112], rdi
+ mov [rsp + 120], rsi
+ mov [rsp + 128], rdx
+
+ mov rdi, %%table
+ mov rsi, %%idx
+ mov rdx, 256
+%else
+ mov [rsp + 112], rcx
+ mov [rsp + 120], rdx
+ mov [rsp + 128], r8
+ mov rcx, %%table
+ mov rdx, %%idx
+ mov r8, 256
+%endif
+ mov [rsp + 136], rax
+
+ call lookup_8bit_sse
+
+ ;; Restore all registers
+ movdqu xmm0, [rsp]
+ movdqu xmm1, [rsp + 16]
+ movdqu xmm2, [rsp + 32]
+ movdqu xmm3, [rsp + 48]
+ movdqu xmm4, [rsp + 64]
+ movdqu xmm5, [rsp + 80]
+ mov r9, [rsp + 96]
+ mov r10, [rsp + 104]
+
+%ifdef LINUX
+ mov rdi, [rsp + 112]
+ mov rsi, [rsp + 120]
+ mov rdx, [rsp + 128]
+%else
+ mov rcx, [rsp + 112]
+ mov rdx, [rsp + 120]
+ mov r8, [rsp + 128]
+%endif
+
+ ;; Move returned value from lookup function, before restoring rax
+ mov DWORD(%%value), eax
+ mov rax, [rsp + 136]
+
+ add rsp, 144
+
+%else ;; SAFE_LOOKUP
+
+ movzx DWORD(%%value), BYTE [%%table + %%idx]
+
+%endif ;; SAFE_LOOKUP
+%endmacro
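
When SAFE_LOOKUP is defined, the byte is fetched through lookup_8bit_sse, which scans the whole 256-byte table so that the memory access pattern does not depend on the secret index. A minimal scalar C sketch of that idea (our own, not the SSE routine):

#include <stdint.h>

/* Constant-time style lookup sketch: read every table entry and
 * select the wanted one with a mask. */
static uint8_t lookup_8bit_ref(const uint8_t table[256], const uint32_t idx)
{
        uint32_t i;
        uint8_t ret = 0;

        for (i = 0; i < 256; i++) {
                /* mask is 0xFF when i == idx, 0 otherwise */
                const uint8_t mask = (uint8_t) -(uint8_t) (i == idx);

                ret |= table[i] & mask;
        }
        return ret;
}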
+
+;
+; NONLIN_FUN()
+;
+; params
+; %1 == 1, then calculate W
+; uses
+; rdi, rsi, eax, ebx, ecx, edx
+; r8d, r9d
+; return
+; eax = W value
+; r10d = F_R1
+; r11d = F_R2
+;
+%macro NONLIN_FUN 1
+
+%if (%1 == 1)
+ mov eax, r12d
+ xor eax, r10d
+ add eax, r11d ; W = (BRC_X0 ^ F_R1) + F_R2
+%endif
+ lea rdi, [rel S0]
+ lea rsi, [rel S1]
+
+ add r10d, r13d ; W1= F_R1 + BRC_X1
+ xor r11d, r14d ; W2= F_R2 ^ BRC_X2
+
+ mov rdx, r10
+ shld edx, r11d, 16 ; P = (W1 << 16) | (W2 >> 16)
+ shld r11d, r10d, 16 ; Q = (W2 << 16) | (W1 >> 16)
+
+ mov ebx, edx
+ mov ecx, edx
+ mov r8d, edx
+ mov r9d, edx
+
+ rol ebx, 2
+ rol ecx, 10
+ rol r8d, 18
+ rol r9d, 24
+ xor edx, ebx
+ xor edx, ecx
+ xor edx, r8d
+ xor edx, r9d ; U = L1(P) = EDX, hi(RDX)=0
+ ;
+ xor r10, r10
+ shld ebx, edx, 24
+ shld r8d, edx, 16
+ shld r9d, edx, 8
+ and rdx, 0xFF
+ lookup_single_sbox rsi, rdx, rdx
+ and rbx, 0xFF
+ lookup_single_sbox rdi, rbx, rbx
+ and r8, 0xFF
+ lookup_single_sbox rsi, r8, r8
+ and r9, 0xFF
+ lookup_single_sbox rdi, r9, r9
+ shrd r10d, edx, 8
+ shrd r10d, ebx, 8
+ shrd r10d, r8d, 8
+ shrd r10d, r9d, 8
+ ;
+ mov ebx, r11d
+ mov ecx, r11d
+ mov r8d, r11d
+ mov r9d, r11d
+ rol ebx, 8
+ rol ecx, 14
+ rol r8d, 22
+ rol r9d, 30
+ xor r11d, ebx
+ xor r11d, ecx
+ xor r11d, r8d
+ xor r11d, r9d ; V = L2(Q) = R11D, hi(R11)=0
+ ;
+ shld ebx, r11d, 24
+ shld r8d, r11d, 16
+ shld r9d, r11d, 8
+ and r11, 0xFF
+
+ lookup_single_sbox rsi, r11, r11
+ and rbx, 0xFF
+ lookup_single_sbox rdi, rbx, rbx
+ and r8, 0xFF
+ lookup_single_sbox rsi, r8, r8
+ and r9, 0xFF
+ lookup_single_sbox rdi, r9, r9
+
+ shrd r11d, r11d, 8
+
+ shrd r11d, ebx, 8
+ shrd r11d, r8d, 8
+ shrd r11d, r9d, 8
+%endmacro
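
The macro above is the ZUC nonlinear function F. A compact C model of the same computation (our own reference sketch; helper names are ours, and S0/S1 are the 256-byte tables defined in the .data section above and declared in zuc_internal.h):

#include <stdint.h>

extern const uint8_t S0[256];
extern const uint8_t S1[256];

static uint32_t rotl32(const uint32_t x, const int n)
{
        return (x << n) | (x >> (32 - n));
}

/* ZUC linear transforms L1/L2 and the S-box layer S0|S1|S0|S1 */
static uint32_t zuc_L1(const uint32_t x)
{
        return x ^ rotl32(x, 2) ^ rotl32(x, 10) ^ rotl32(x, 18) ^ rotl32(x, 24);
}

static uint32_t zuc_L2(const uint32_t x)
{
        return x ^ rotl32(x, 8) ^ rotl32(x, 14) ^ rotl32(x, 22) ^ rotl32(x, 30);
}

static uint32_t sbox_layer(const uint32_t x)
{
        return ((uint32_t) S0[x >> 24] << 24) |
               ((uint32_t) S1[(x >> 16) & 0xFF] << 16) |
               ((uint32_t) S0[(x >> 8) & 0xFF] << 8) |
               (uint32_t) S1[x & 0xFF];
}

/* F: mixes X0..X2 with R1/R2, updates R1/R2 and returns W */
static uint32_t nonlin_fun_ref(const uint32_t x0, const uint32_t x1,
                               const uint32_t x2, uint32_t *r1, uint32_t *r2)
{
        const uint32_t w = (x0 ^ *r1) + *r2;    /* addition mod 2^32 */
        const uint32_t w1 = *r1 + x1;
        const uint32_t w2 = *r2 ^ x2;

        *r1 = sbox_layer(zuc_L1((w1 << 16) | (w2 >> 16)));
        *r2 = sbox_layer(zuc_L2((w2 << 16) | (w1 >> 16)));
        return w;
}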
+
+
+;
+; LFSR_UPDT()
+;
+; params
+; %1 - round number
+; uses
+; rax as input (ZERO or W)
+; return
+;
+%macro LFSR_UPDT 1
+ ;
+ ; ebx = LFSR_S0
+ ; ecx = LFSR_S4
+ ; edx = LFSR_S10
+ ; r8d = LFSR_S13
+ ; r9d = LFSR_S15
+ ;lea rsi, [LFSR_STA] ; moved to calling function
+
+ mov ebx, [rsi + (( 0 + %1) % 16)*4]
+ mov ecx, [rsi + (( 4 + %1) % 16)*4]
+ mov edx, [rsi + ((10 + %1) % 16)*4]
+ mov r8d, [rsi + ((13 + %1) % 16)*4]
+ mov r9d, [rsi + ((15 + %1) % 16)*4]
+
+ ; Calculate 64-bit LFSR feedback
+ add rax, rbx
+ shl rbx, 8
+ shl rcx, 20
+ shl rdx, 21
+ shl r8, 17
+ shl r9, 15
+ add rax, rbx
+ add rax, rcx
+ add rax, rdx
+ add rax, r8
+ add rax, r9
+
+ ; Reduce it to 31-bit value
+ mov rbx, rax
+ and rax, 0x7FFFFFFF
+ shr rbx, 31
+ add rax, rbx
+
+ mov rbx, rax
+ sub rbx, 0x7FFFFFFF
+ cmovns rax, rbx
+
+
+ ; LFSR_S16 = (LFSR_S15++) = eax
+ mov [rsi + (( 0 + %1) % 16)*4], eax
+%endmacro
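
Continuing the same C model, one LFSR step can be sketched as below (our own sketch, written for round 0; the assembly keeps the array in place and rotates indices via the round number instead of shifting):

/* One LFSR step, mod (2^31 - 1); "in" is W >> 1 during initialization
 * and 0 during keystream generation. */
static void lfsr_update_ref(uint32_t lfsr[16], const uint32_t in)
{
        uint64_t f = (uint64_t) in + lfsr[0] +
                     ((uint64_t) lfsr[0] << 8) + ((uint64_t) lfsr[4] << 20) +
                     ((uint64_t) lfsr[10] << 21) + ((uint64_t) lfsr[13] << 17) +
                     ((uint64_t) lfsr[15] << 15);
        uint32_t i;

        /* fold down to 31 bits (2^31 == 1 mod 2^31 - 1), then one
         * conditional subtraction, as in the macro above */
        f = (f & 0x7FFFFFFF) + (f >> 31);
        if (f >= 0x7FFFFFFF)
                f -= 0x7FFFFFFF;

        /* shift the register: s0 is discarded, the new value becomes s15 */
        for (i = 0; i < 15; i++)
                lfsr[i] = lfsr[i + 1];
        lfsr[15] = (uint32_t) f;
}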
+
+
+;
+; make_u31()
+;
+%macro make_u31 4
+
+%define %%Rt %1
+%define %%Ke %2
+%define %%Ek %3
+%define %%Iv %4
+ xor %%Rt, %%Rt
+ shrd %%Rt, %%Iv, 8
+ shrd %%Rt, %%Ek, 15
+ shrd %%Rt, %%Ke, 9
+%endmacro
+
+
+;
+; key_expand()
+;
+%macro key_expand 1
+ movzx r8d, byte [pKe + (%1 + 0)]
+ movzx r9d, word [rbx + ((%1 + 0)*2)]
+ movzx r10d, byte [pIv + (%1 + 0)]
+ make_u31 r11d, r8d, r9d, r10d
+ mov [rax + ((%1 + 0)*4)], r11d
+
+ movzx r12d, byte [pKe + (%1 + 1)]
+ movzx r13d, word [rbx + ((%1 + 1)*2)]
+ movzx r14d, byte [pIv + (%1 + 1)]
+ make_u31 r15d, r12d, r13d, r14d
+ mov [rax + ((%1 + 1)*4)], r15d
+%endmacro
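
The two macros above load the LFSR for the initialization stage: each 31-bit entry is built as key byte (8 bits) || D constant (15 bits) || IV byte (8 bits). A C sketch of the same loading (names are ours; EK_d is the constant table in the .data section above):

static uint32_t make_u31_ref(const uint8_t k, const uint16_t d, const uint8_t iv)
{
        /* s_i = k_i (bits 30..23) || d_i (bits 22..8) || iv_i (bits 7..0) */
        return ((uint32_t) k << 23) | ((uint32_t) (d & 0x7FFF) << 8) | iv;
}

static void key_expand_ref(const uint8_t key[16], const uint8_t iv[16],
                           const uint16_t ek_d[16], uint32_t lfsr[16])
{
        int i;

        for (i = 0; i < 16; i++)
                lfsr[i] = make_u31_ref(key[i], ek_d[i], iv[i]);
}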
+
+
+
+;----------------------------------------------------------------------------------------
+;;
+;;extern void asm_ZucInitialization(uint8_t *pKey, uint8_t *pIv, uint32_t *pState)
+;;
+;; WIN64
+;; RCX - pKey
+;; RDX - pIV
+;; R8 - pState
+;; LIN64
+;; RDI - pKey
+;; RSI - pIV
+;; RDX - pState
+;;
+align 16
+MKGLOBAL(asm_ZucInitialization,function,internal)
+asm_ZucInitialization:
+
+%ifdef LINUX
+ %define pKe rdi
+ %define pIv rsi
+ %define pState rdx
+%else
+ %define pKe rcx
+ %define pIv rdx
+ %define pState r8
+%endif
+
+ ; save the base pointer
+ push rbp
+
+ ; load the stack pointer into rbp and reserve stack memory
+ mov rbp, rsp
+ sub rsp, 196
+
+ ; Save non-volatile registers
+ mov [rbp - 8], rbx
+ mov [rbp - 32], r12
+ mov [rbp - 40], r13
+ mov [rbp - 48], r14
+ mov [rbp - 56], r15
+%ifndef LINUX
+ mov [rbp - 64], rdi
+ mov [rbp - 72], rsi
+%endif
+
+ lea rbx, [rel EK_d] ; load pointer to D
+ lea rax, [pState] ; load pointer to pState
+ mov [rbp - 88], pState ; save pointer to pState
+
+ ; Expand key
+ key_expand 0
+ key_expand 2
+ key_expand 4
+ key_expand 6
+ key_expand 8
+ key_expand 10
+ key_expand 12
+ key_expand 14
+
+ ; Set R1 and R2 to zero
+ xor r10, r10
+ xor r11, r11
+
+ ; Shift LFSR 32-times, update state variables
+%assign N 0
+%rep 32
+ mov rdx, [rbp - 88] ; load pointer to pState
+ lea rsi, [rdx]
+
+ BITS_REORG N
+
+ NONLIN_FUN 1
+ shr eax, 1
+
+ mov rdx, [rbp - 88] ; re-load pointer to pState
+ lea rsi, [rdx]
+
+ LFSR_UPDT N
+
+%assign N N+1
+%endrep
+
+ ; And once more, initial round from keygen phase = 33 times
+ mov rdx, [rbp - 88] ; load pointer to pState
+ lea rsi, [rdx]
+
+
+ BITS_REORG 0
+ NONLIN_FUN 0
+ xor rax, rax
+
+ mov rdx, [rbp - 88] ; load pointer to pState
+ lea rsi, [rdx]
+
+ LFSR_UPDT 0
+
+ mov rdx, [rbp - 88] ; load pointer to pState
+ lea rsi, [rdx]
+
+ ; Save ZUC's state variables
+ mov [rsi + (16*4)],r10d ;F_R1
+ mov [rsi + (17*4)],r11d ;F_R2
+ mov [rsi + (18*4)],r12d ;BRC_X0
+ mov [rsi + (19*4)],r13d ;BRC_X1
+ mov [rsi + (20*4)],r14d ;BRC_X2
+ mov [rsi + (21*4)],r15d ;BRC_X3
+
+
+ ; Restore non-volatile registers
+ mov rbx, [rbp - 8]
+ mov r12, [rbp - 32]
+ mov r13, [rbp - 40]
+ mov r14, [rbp - 48]
+ mov r15, [rbp - 56]
+%ifndef LINUX
+ mov rdi, [rbp - 64]
+ mov rsi, [rbp - 72]
+%endif
+
+ ; restore base pointer
+ mov rsp, rbp
+ pop rbp
+
+ ret
+
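
Putting the pieces together, the initialization routine above can be summarised with the following C sketch (our own model, reusing the key_expand_ref, bits_reorg_ref, nonlin_fun_ref and lfsr_update_ref helpers sketched earlier; not the assembly itself):

static void zuc_init_ref(const uint8_t key[16], const uint8_t iv[16],
                         const uint16_t ek_d[16], uint32_t lfsr[16],
                         uint32_t *r1, uint32_t *r2)
{
        uint32_t x0, x1, x2, x3, w;
        int i;

        key_expand_ref(key, iv, ek_d, lfsr);
        *r1 = 0;
        *r2 = 0;

        /* 32 initialization rounds: W >> 1 is fed back into the LFSR */
        for (i = 0; i < 32; i++) {
                bits_reorg_ref(lfsr, &x0, &x1, &x2, &x3);
                w = nonlin_fun_ref(x0, x1, x2, r1, r2);
                lfsr_update_ref(lfsr, w >> 1);
        }

        /* one extra round: run F (its output is discarded) and step the
         * LFSR with 0, leaving the state ready for keystream generation */
        bits_reorg_ref(lfsr, &x0, &x1, &x2, &x3);
        (void) nonlin_fun_ref(x0, x1, x2, r1, r2);
        (void) x3;      /* X3 is only consumed when producing keystream */
        lfsr_update_ref(lfsr, 0);
}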
+
+;;
+;; void asm_ZucGenKeystream8B(void *pKeystream, ZucState_t *pState);
+;;
+;; WIN64
+;; RCX - KS (key stream pointer)
+;; RDX - STATE (state pointer)
+;; LIN64
+;; RDI - KS (key stream pointer)
+;; RSI - STATE (state pointer)
+;;
+align 16
+MKGLOBAL(asm_ZucGenKeystream8B,function,internal)
+asm_ZucGenKeystream8B:
+
+%ifdef LINUX
+ %define pKS rdi
+ %define pState rsi
+%else
+ %define pKS rcx
+ %define pState rdx
+%endif
+ ; save the base pointer
+ push rbp
+
+ ; load the stack pointer into rbp and reserve stack memory
+ mov rbp, rsp
+ sub rsp, 196
+
+ ; Save non-volatile registers
+ mov [rbp - 8], rbx
+ mov [rbp - 32], r12
+ mov [rbp - 40], r13
+ mov [rbp - 48], r14
+ mov [rbp - 56], r15
+%ifndef LINUX
+ mov [rbp - 64], rdi
+ mov [rbp - 72], rsi
+%endif
+
+
+ ; Load input keystream pointer parameter in RAX
+ mov rax, pKS
+
+ ; Restore ZUC's state variables
+ xor r10, r10
+ xor r11, r11
+ mov r10d, [pState + OFFSET_FR1]
+ mov r11d, [pState + OFFSET_FR2]
+ mov r12d, [pState + OFFSET_BRC_X0]
+ mov r13d, [pState + OFFSET_BRC_X1]
+ mov r14d, [pState + OFFSET_BRC_X2]
+ mov r15d, [pState + OFFSET_BRC_X3]
+
+ ; Store keystream pointer
+ mov [rbp - 80], rax
+
+ ; Store ZUC State Pointer
+ mov [rbp - 88], pState
+
+ ; Generate 8B of keystream in 2 rounds
+%assign N 1
+%rep 2
+
+ mov rdx, [rbp - 88] ; load *pState
+ lea rsi, [rdx]
+
+ BITS_REORG N
+ NONLIN_FUN 1
+
+ ;Store the keystream
+ mov rbx, [rbp - 80] ; load *pkeystream
+ xor eax, r15d
+ mov [rbx], eax
+ add rbx, 4 ; increment the pointer
+ mov [rbp - 80], rbx ; save pkeystream
+
+ xor rax, rax
+
+ mov rdx, [rbp - 88] ; load *pState
+ lea rsi, [rdx]
+
+ LFSR_UPDT N
+
+%assign N N+1
+%endrep
+
+ mov rsi, [rbp - 88] ; load pState
+
+
+ ; Save ZUC's state variables
+ mov [rsi + OFFSET_FR1], r10d
+ mov [rsi + OFFSET_FR2], r11d
+ mov [rsi + OFFSET_BRC_X0], r12d
+ mov [rsi + OFFSET_BRC_X1], r13d
+ mov [rsi + OFFSET_BRC_X2], r14d
+ mov [rsi + OFFSET_BRC_X3], r15d
+
+ ; Restore non-volatile registers
+ mov rbx, [rbp - 8]
+ mov r12, [rbp - 32]
+ mov r13, [rbp - 40]
+ mov r14, [rbp - 48]
+ mov r15, [rbp - 56]
+%ifndef LINUX
+ mov rdi, [rbp - 64]
+ mov rsi, [rbp - 72]
+%endif
+
+ mov rsp, rbp
+ pop rbp
+
+ ret
+
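
Each keystream round above follows the same pattern: bit reorganization, the nonlinear function F, XOR with X3, then an LFSR step with zero input. A rough C model (our own sketch, reusing the helpers from earlier; n is 2 for the 8-byte variant and 16 for the 64-byte variant below):

/* Generate n 32-bit keystream words from an initialized state. */
static void zuc_keygen_ref(uint32_t lfsr[16], uint32_t *r1, uint32_t *r2,
                           uint32_t *ks, const unsigned int n)
{
        uint32_t x0, x1, x2, x3;
        unsigned int i;

        for (i = 0; i < n; i++) {
                bits_reorg_ref(lfsr, &x0, &x1, &x2, &x3);
                ks[i] = nonlin_fun_ref(x0, x1, x2, r1, r2) ^ x3;
                lfsr_update_ref(lfsr, 0);
        }
}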
+
+;;
+;; void asm_ZucGenKeystream64B(uint32_t *pKeystream, ZucState_t *pState);
+;;
+;; WIN64
+;; RCX - KS (key stream pointer)
+;; RDX - STATE (state pointer)
+;; LIN64
+;; RDI - KS (key stream pointer)
+;; RSI - STATE (state pointer)
+;;
+align 16
+MKGLOBAL(asm_ZucGenKeystream64B,function,internal)
+asm_ZucGenKeystream64B:
+
+%ifdef LINUX
+ %define pKS rdi
+ %define pState rsi
+%else
+ %define pKS rcx
+ %define pState rdx
+%endif
+ ; save the base pointer
+ push rbp
+
+ ; load the stack pointer into rbp and reserve stack memory
+ mov rbp, rsp
+ sub rsp, 196
+
+ ; Save non-volatile registers
+ mov [rbp - 8], rbx
+ mov [rbp - 32], r12
+ mov [rbp - 40], r13
+ mov [rbp - 48], r14
+ mov [rbp - 56], r15
+%ifndef LINUX
+ mov [rbp - 64], rdi
+ mov [rbp - 72], rsi
+%endif
+
+
+ ; Load input keystream pointer parameter in RAX
+ mov rax, pKS
+
+ ; Restore ZUC's state variables
+ xor r10, r10
+ xor r11, r11
+ mov r10d, [pState + OFFSET_FR1]
+ mov r11d, [pState + OFFSET_FR2]
+ mov r12d, [pState + OFFSET_BRC_X0]
+ mov r13d, [pState + OFFSET_BRC_X1]
+ mov r14d, [pState + OFFSET_BRC_X2]
+ mov r15d, [pState + OFFSET_BRC_X3]
+
+ ; Store keystream pointer
+ mov [rbp - 80], rax
+
+ ; Store ZUC State Pointer
+ mov [rbp - 88], pState
+
+ ; Generate 64B of keystream in 16 rounds
+%assign N 1
+%rep 16
+
+ mov rdx, [rbp - 88] ; load *pState
+ lea rsi, [rdx]
+
+ BITS_REORG N
+ NONLIN_FUN 1
+
+ ;Store the keystream
+ mov rbx, [rbp - 80] ; load *pkeystream
+ xor eax, r15d
+ mov [rbx], eax
+ add rbx, 4 ; increment the pointer
+ mov [rbp - 80], rbx ; save pkeystream
+
+ xor rax, rax
+
+ mov rdx, [rbp - 88] ; load *pState
+ lea rsi, [rdx]
+
+ LFSR_UPDT N
+
+%assign N N+1
+%endrep
+
+ mov rsi, [rbp - 88] ; load pState
+
+
+ ; Save ZUC's state variables
+ mov [rsi + OFFSET_FR1], r10d
+ mov [rsi + OFFSET_FR2], r11d
+ mov [rsi + OFFSET_BRC_X0], r12d
+ mov [rsi + OFFSET_BRC_X1], r13d
+ mov [rsi + OFFSET_BRC_X2], r14d
+ mov [rsi + OFFSET_BRC_X3], r15d
+
+ ; Restore non-volatile registers
+ mov rbx, [rbp - 8]
+ mov r12, [rbp - 32]
+ mov r13, [rbp - 40]
+ mov r14, [rbp - 48]
+ mov r15, [rbp - 56]
+%ifndef LINUX
+ mov rdi, [rbp - 64]
+ mov rsi, [rbp - 72]
+%endif
+
+ mov rsp, rbp
+ pop rbp
+
+ ret
+
+
diff --git a/src/spdk/intel-ipsec-mb/include/zuc_internal.h b/src/spdk/intel-ipsec-mb/include/zuc_internal.h
new file mode 100755
index 000000000..525a1604c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/zuc_internal.h
@@ -0,0 +1,432 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/**
+ ******************************************************************************
+ * @file zuc_internal.h
+ *
+ * @description
+ * This header file defines the internal APIs and data types for the
+ * 3GPP algorithm ZUC.
+ *
+ *****************************************************************************/
+
+#ifndef ZUC_INTERNAL_H_
+#define ZUC_INTERNAL_H_
+
+#include <stdio.h>
+#include <stdint.h>
+
+#include "intel-ipsec-mb.h"
+#include "immintrin.h"
+#include "include/wireless_common.h"
+
+/* 64 bytes of Keystream will be generated */
+#define ZUC_KEYSTR_LEN (64)
+#define NUM_LFSR_STATES (16)
+#define ZUC_WORD (32)
+
+/* Range of input data for ZUC is from 1 to 65504 bits */
+#define ZUC_MIN_LEN 1
+#define ZUC_MAX_LEN 65504
+
+#ifdef DEBUG
+#ifdef _WIN32
+#define DEBUG_PRINT(_fmt, ...) \
+ fprintf(stderr, "%s()::%d " _fmt , __FUNCTION__, __LINE__, __VA_ARGS__)
+#else
+#define DEBUG_PRINT(_fmt, ...) \
+ fprintf(stderr, "%s()::%d " _fmt , __func__, __LINE__, __VA_ARGS__)
+#endif
+#else
+#define DEBUG_PRINT(_fmt, ...)
+#endif
+
+/**
+ ******************************************************************************
+ * @description
+ * Macro will loop through the 64-byte keystream and XOR it with the
+ * input buffer, placing the result in the output buffer.
+ * Keystream bytes must be swapped on a 32-bit boundary before this operation.
+ *
+ *****************************************************************************/
+#define ZUC_XOR_KEYSTREAM(pIn64, pOut64, pKeyStream64) \
+{ \
+ int i =0; \
+ union SwapBytes_t { \
+ uint64_t l64; \
+ uint32_t w32[2]; \
+ }swapBytes; \
+ /* loop through the key stream and xor 64 bits at a time */ \
+ for(i =0; i < ZUC_KEYSTR_LEN/8; i++) { \
+ swapBytes.l64 = *pKeyStream64++; \
+ swapBytes.w32[0] = bswap4(swapBytes.w32[0]); \
+ swapBytes.w32[1] = bswap4(swapBytes.w32[1]); \
+ *pOut64++ = *pIn64++ ^ swapBytes.l64; \
+ } \
+}
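
A hypothetical usage sketch of the macro (the function and buffer names are ours, and the snippet assumes it is compiled in a translation unit that includes this header, which provides bswap4 via wireless_common.h):

/* XOR one 64-byte block of input with a freshly generated (not yet
 * byte-swapped) keystream block; the macro advances all three local
 * pointers by 64 bytes. */
static void xor_one_block(const uint64_t *in, uint64_t *out, const uint64_t *ks)
{
        const uint64_t *pIn64 = in;
        const uint64_t *pKeyStream64 = ks;
        uint64_t *pOut64 = out;

        ZUC_XOR_KEYSTREAM(pIn64, pOut64, pKeyStream64);
}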
+
+/**
+ *****************************************************************************
+ * @description
+ * Packed structure to store the ZUC state for a single packet.
+ *****************************************************************************/
+typedef struct zuc_state_s {
+ uint32_t lfsrState[16];
+ /**< State registers of the LFSR */
+ uint32_t fR1;
+ /**< register of F */
+ uint32_t fR2;
+ /**< register of F */
+ uint32_t bX0;
+ /**< Output X0 of the bit reorganization */
+ uint32_t bX1;
+ /**< Output X1 of the bit reorganization */
+ uint32_t bX2;
+ /**< Output X2 of the bit reorganization */
+ uint32_t bX3;
+ /**< Output X3 of the bit reorganization */
+} ZucState_t;
+
+/**
+ *****************************************************************************
+ * @description
+ * Packed structure to store the ZUC state for four packets.
+ *****************************************************************************/
+typedef struct zuc_state_4_s {
+ uint32_t lfsrState[16][4];
+ /**< State registers of the LFSR */
+ uint32_t fR1[4];
+ /**< register of F */
+ uint32_t fR2[4];
+ /**< register of F */
+ uint32_t bX0[4];
+ /**< Output X0 of the bit reorganization for 4 packets */
+ uint32_t bX1[4];
+ /**< Output X1 of the bit reorganization for 4 packets */
+ uint32_t bX2[4];
+ /**< Output X2 of the bit reorganization for 4 packets */
+ uint32_t bX3[4];
+ /**< Output X3 of the bit reorganization for 4 packets */
+} ZucState4_t;
+
+/**
+ *****************************************************************************
+ * @description
+ * Structure to store pointers to the 4 keys to be used as input to
+ * @ref asm_ZucInitialization_4 and @ref asm_ZucGenKeystream64B_4
+ *****************************************************************************/
+typedef struct zuc_key_4_s {
+ const uint8_t *pKey1;
+ /**< Pointer to 128-bit key for packet 1 */
+ const uint8_t *pKey2;
+ /**< Pointer to 128-bit key for packet 2 */
+ const uint8_t *pKey3;
+ /**< Pointer to 128-bit key for packet 3 */
+ const uint8_t *pKey4;
+ /**< Pointer to 128-bit key for packet 4 */
+} ZucKey4_t;
+
+/**
+ *****************************************************************************
+ * @description
+ * Structure to store pointers to the 4 IV's to be used as input to
+ * @ref asm_ZucInitialization_4 and @ref asm_ZucGenKeystream64B_4
+ *****************************************************************************/
+typedef struct zuc_iv_4_s {
+ const uint8_t *pIv1;
+ /**< Pointer to 128-bit initialization vector for packet 1 */
+ const uint8_t *pIv2;
+ /**< Pointer to 128-bit initialization vector for packet 2 */
+ const uint8_t *pIv3;
+ /**< Pointer to 128-bit initialization vector for packet 3 */
+ const uint8_t *pIv4;
+ /**< Pointer to 128-bit initialization vector for packet 4 */
+} ZucIv4_t;
+
+/**
+ ******************************************************************************
+ *
+ * @description
+ * Definition of the external function that implements the initialization
+ * stage of the ZUC algorithm. The function will initialize the state
+ * for a single packet operation.
+ *
+ * @param[in] pKey Pointer to the 128-bit initial key that
+ * will be used when initializing the ZUC
+ * state.
+ * @param[in] pIv Pointer to the 128-bit initial vector that
+ * will be used when initializing the ZUC
+ * state.
+ * @param[in,out] pState Pointer to a ZUC state structure of type
+ * @ref ZucState_t that will be populated
+ * with the initialized ZUC state.
+ *
+ * @pre
+ * None
+ *
+ *****************************************************************************/
+IMB_DLL_LOCAL void asm_ZucInitialization(const void *pKey,
+ const void *pIv,
+ ZucState_t *pState);
+
+/**
+ ******************************************************************************
+ * @description
+ * Definition of the external function that implements the initialization
+ * stage of the ZUC algorithm for 4 packets. The function will initialize
+ * the state for 4 individual packets.
+ *
+ * @param[in] pKeys Pointer to a @ref ZucKey4_t structure with
+ * pointers to the four 128-bit keys that will
+ * be used when initializing the ZUC states.
+ * @param[in] pIvs Pointer to a @ref ZucIv4_t structure with
+ * pointers to the four 128-bit initialization
+ * vectors for the four packets.
+ * @param[in,out] pState Pointer to a ZUC state structure of type
+ * @ref ZucState4_t that will be populated
+ * with the initialized ZUC state.
+ *
+ * @pre
+ * None
+ *
+ *****************************************************************************/
+IMB_DLL_LOCAL void asm_ZucInitialization_4_sse(ZucKey4_t *pKeys,
+ ZucIv4_t *pIvs,
+ ZucState4_t *pState);
+
+IMB_DLL_LOCAL void asm_ZucInitialization_4_avx(ZucKey4_t *pKeys,
+ ZucIv4_t *pIvs,
+ ZucState4_t *pState);
+
+/**
+ ******************************************************************************
+ *
+ * @description
+ * Definition of the external function that implements the working
+ * stage of the ZUC algorithm. The function will generate 64 bytes of
+ * keystream.
+ *
+ * @param[in,out] pKeystream Pointer to an output buffer that will
+ * contain the generated keystream.
+ *
+ * @param[in] pState Pointer to a ZUC state structure of type
+ * @ref ZucState_t
+ *
+ * @pre
+ * A successful call to @ref asm_ZucInitialization to initialize the ZUC
+ * state.
+ *
+ *****************************************************************************/
+IMB_DLL_LOCAL void asm_ZucGenKeystream64B(uint32_t *pKeystream,
+ ZucState_t *pState);
+
+/**
+ ******************************************************************************
+ *
+ * @description
+ * Definition of the external function that implements the working
+ * stage of the ZUC algorithm. The function will generate 8 bytes of
+ * keystream.
+ *
+ * @param[in,out] pKeystream Pointer to an output buffer that will
+ * contain the generated keystream.
+ *
+ * @param[in] pState Pointer to a ZUC state structure of type
+ * @ref ZucState_t
+ *
+ * @pre
+ * A successful call to @ref asm_ZucInitialization to initialize the ZUC
+ * state.
+ *
+ *****************************************************************************/
+IMB_DLL_LOCAL void asm_ZucGenKeystream8B(void *pKeystream,
+ ZucState_t *pState);
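
A hypothetical single-packet flow through the internal entry points declared above (the key/IV buffers are assumed to be caller-provided; the public zuc_eea3 and zuc_eia3 wrappers further below are the normal way in):

/* Initialize one ZUC state, then pull 64 + 8 bytes of keystream. */
static void zuc_keystream_example(const uint8_t key[16], const uint8_t iv[16],
                                  uint32_t ks64[16], uint32_t ks8[2])
{
        ZucState_t state;

        asm_ZucInitialization(key, iv, &state);  /* key/IV -> ZUC state   */
        asm_ZucGenKeystream64B(ks64, &state);    /* 16 keystream words    */
        asm_ZucGenKeystream8B(ks8, &state);      /* 2 more keystream words */
}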
+
+/**
+ ******************************************************************************
+ *
+ * @description
+ * Definition of the external function that implements the working
+ * stage of the ZUC algorithm. The function will generate 64 bytes of
+ * keystream for four packets in parallel.
+ *
+ * @param[in] pState Pointer to a ZUC state structure of type
+ * @ref ZucState4_t
+ *
+ * @param[in,out] pKeyStr1 Pointer to an input buffer that will
+ * contain the generated keystream for packet
+ * one.
+ * @param[in,out] pKeyStr2 Pointer to an input buffer that will
+ * contain the generated keystream for packet
+ * two.
+ * @param[in,out] pKeyStr3 Pointer to an input buffer that will
+ * contain the generated keystream for packet
+ * three.
+ * @param[in,out] pKeyStr4 Pointer to an input buffer that will
+ * contain the generated keystream for packet
+ * four.
+ *
+ * @pre
+ * A successful call to @ref asm_ZucInitialization_4 to initialize the ZUC
+ * state.
+ *
+ *****************************************************************************/
+IMB_DLL_LOCAL void asm_ZucGenKeystream64B_4_sse(ZucState4_t *pState,
+ uint32_t *pKeyStr1,
+ uint32_t *pKeyStr2,
+ uint32_t *pKeyStr3,
+ uint32_t *pKeyStr4);
+
+IMB_DLL_LOCAL void asm_ZucGenKeystream64B_4_avx(ZucState4_t *pState,
+ uint32_t *pKeyStr1,
+ uint32_t *pKeyStr2,
+ uint32_t *pKeyStr3,
+ uint32_t *pKeyStr4);
+
+/**
+ ******************************************************************************
+ * @description
+ * Definition of the external function to update the authentication tag
+ * based on keystream and data (SSE variant)
+ *
+ * @param[in] T Authentication tag
+ *
+ * @param[in] ks Pointer to key stream
+ *
+ * @param[in] data Pointer to the data
+ *
+ * @pre
+ * None
+ *
+ *****************************************************************************/
+IMB_DLL_LOCAL uint32_t asm_Eia3Round64BSSE(uint32_t T, const void *ks,
+ const void *data);
+
+/**
+ ******************************************************************************
+ * @description
+ * Definition of the external function to return the authentication
+ * update value to be XORed with the current authentication tag (SSE variant)
+ *
+ * @param[in] ks Pointer to key stream
+ *
+ * @param[in] data Pointer to the data
+ *
+ * @param[in] n_words Number of data bits to be processed
+ *
+ * @pre
+ * None
+ *
+ *****************************************************************************/
+IMB_DLL_LOCAL uint32_t asm_Eia3RemainderSSE(const void *ks, const void *data,
+ const uint64_t n_words);
+
+/**
+ ******************************************************************************
+ * @description
+ * Definition of the external function to update the authentication tag
+ * based on keystream and data (AVX variant)
+ *
+ * @param[in] T Authentication tag
+ *
+ * @param[in] ks Pointer to key stream
+ *
+ * @param[in] data Pointer to the data
+ *
+ * @pre
+ * None
+ *
+ *****************************************************************************/
+IMB_DLL_LOCAL uint32_t asm_Eia3Round64BAVX(uint32_t T, const void *ks,
+ const void *data);
+
+/**
+ ******************************************************************************
+ * @description
+ * Definition of the external function to return the authentication
+ * update value to be XORed with the current authentication tag (AVX variant)
+ *
+ * @param[in] ks Pointer to key stream
+ *
+ * @param[in] data Pointer to the data
+ *
+ * @param[in] n_words Number of data bits to be processed
+ *
+ * @pre
+ * None
+ *
+ *****************************************************************************/
+IMB_DLL_LOCAL uint32_t asm_Eia3RemainderAVX(const void *ks, const void *data,
+ const uint64_t n_words);
+
+
+/* the s-boxes */
+extern const uint8_t S0[256];
+extern const uint8_t S1[256];
+
+void zuc_eea3_1_buffer_sse(const void *pKey, const void *pIv,
+ const void *pBufferIn, void *pBufferOut,
+ const uint32_t lengthInBytes);
+
+void zuc_eea3_4_buffer_sse(const void * const pKey[4],
+ const void * const pIv[4],
+ const void * const pBufferIn[4],
+ void *pBufferOut[4],
+ const uint32_t lengthInBytes[4]);
+
+void zuc_eea3_n_buffer_sse(const void * const pKey[], const void * const pIv[],
+ const void * const pBufferIn[], void *pBufferOut[],
+ const uint32_t lengthInBytes[],
+ const uint32_t numBuffers);
+
+void zuc_eia3_1_buffer_sse(const void *pKey, const void *pIv,
+ const void *pBufferIn, const uint32_t lengthInBits,
+ uint32_t *pMacI);
+
+void zuc_eea3_1_buffer_avx(const void *pKey, const void *pIv,
+ const void *pBufferIn, void *pBufferOut,
+ const uint32_t lengthInBytes);
+
+void zuc_eea3_4_buffer_avx(const void * const pKey[4],
+ const void * const pIv[4],
+ const void * const pBufferIn[4],
+ void *pBufferOut[4],
+ const uint32_t lengthInBytes[4]);
+
+void zuc_eea3_n_buffer_avx(const void * const pKey[], const void * const pIv[],
+ const void * const pBufferIn[], void *pBufferOut[],
+ const uint32_t lengthInBytes[],
+ const uint32_t numBuffers);
+
+void zuc_eia3_1_buffer_avx(const void *pKey, const void *pIv,
+ const void *pBufferIn, const uint32_t lengthInBits,
+ uint32_t *pMacI);
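
A hypothetical use of the single-buffer SSE paths declared above (buffer names and sizing are ours; in practice the cipher and integrity operations would normally use their own keys and IVs):

/* Encrypt/decrypt lengthInBytes bytes (EEA3) and compute a 32-bit
 * MAC over lengthInBits bits (EIA3) of the same input buffer. */
static void zuc_single_buffer_example(const uint8_t key[16], const uint8_t iv[16],
                                      const void *in, void *out,
                                      const uint32_t lengthInBytes,
                                      const uint32_t lengthInBits, uint32_t *pMac)
{
        zuc_eea3_1_buffer_sse(key, iv, in, out, lengthInBytes);
        zuc_eia3_1_buffer_sse(key, iv, in, lengthInBits, pMac);
}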
+
+
+#endif /* ZUC_INTERNAL_H_ */
+