From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/spdk/intel-ipsec-mb/include/aes_common.asm | 375 +++ src/spdk/intel-ipsec-mb/include/aesni_emu.h | 120 + src/spdk/intel-ipsec-mb/include/aesni_emu.inc | 247 ++ src/spdk/intel-ipsec-mb/include/clear_regs.asm | 196 ++ src/spdk/intel-ipsec-mb/include/clear_regs_mem.h | 53 + .../intel-ipsec-mb/include/clear_regs_mem_fns.asm | 124 + src/spdk/intel-ipsec-mb/include/const.inc | 163 ++ .../intel-ipsec-mb/include/constant_lookup.asm | 561 ++++ src/spdk/intel-ipsec-mb/include/constant_lookup.h | 173 ++ src/spdk/intel-ipsec-mb/include/cpu_feature.h | 52 + src/spdk/intel-ipsec-mb/include/datastruct.asm | 235 ++ src/spdk/intel-ipsec-mb/include/dbgprint.asm | 413 +++ src/spdk/intel-ipsec-mb/include/des_utils.h | 134 + src/spdk/intel-ipsec-mb/include/gcm.h | 428 +++ src/spdk/intel-ipsec-mb/include/gcm_defines.asm | 272 ++ .../include/gcm_keys_avx2_avx512.asm | 52 + .../intel-ipsec-mb/include/gcm_keys_sse_avx.asm | 73 + .../include/gcm_keys_vaes_avx512.asm | 231 ++ src/spdk/intel-ipsec-mb/include/kasumi_internal.h | 1853 +++++++++++++ src/spdk/intel-ipsec-mb/include/memcpy.asm | 613 +++++ src/spdk/intel-ipsec-mb/include/noaesni.h | 65 + src/spdk/intel-ipsec-mb/include/os.asm | 58 + src/spdk/intel-ipsec-mb/include/reg_sizes.asm | 300 +++ src/spdk/intel-ipsec-mb/include/save_xmms.asm | 132 + src/spdk/intel-ipsec-mb/include/save_xmms.h | 39 + src/spdk/intel-ipsec-mb/include/snow3g.h | 511 ++++ src/spdk/intel-ipsec-mb/include/snow3g_common.h | 2840 ++++++++++++++++++++ src/spdk/intel-ipsec-mb/include/snow3g_internal.h | 638 +++++ src/spdk/intel-ipsec-mb/include/transpose_avx2.asm | 218 ++ .../intel-ipsec-mb/include/transpose_avx512.asm | 497 ++++ .../intel-ipsec-mb/include/wireless_common.asm | 128 + src/spdk/intel-ipsec-mb/include/wireless_common.h | 216 ++ src/spdk/intel-ipsec-mb/include/zuc_common.asm | 740 +++++ src/spdk/intel-ipsec-mb/include/zuc_internal.h | 432 +++ 34 files changed, 13182 insertions(+) create mode 100644 src/spdk/intel-ipsec-mb/include/aes_common.asm create mode 100644 src/spdk/intel-ipsec-mb/include/aesni_emu.h create mode 100644 src/spdk/intel-ipsec-mb/include/aesni_emu.inc create mode 100644 src/spdk/intel-ipsec-mb/include/clear_regs.asm create mode 100644 src/spdk/intel-ipsec-mb/include/clear_regs_mem.h create mode 100644 src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm create mode 100644 src/spdk/intel-ipsec-mb/include/const.inc create mode 100644 src/spdk/intel-ipsec-mb/include/constant_lookup.asm create mode 100644 src/spdk/intel-ipsec-mb/include/constant_lookup.h create mode 100644 src/spdk/intel-ipsec-mb/include/cpu_feature.h create mode 100644 src/spdk/intel-ipsec-mb/include/datastruct.asm create mode 100644 src/spdk/intel-ipsec-mb/include/dbgprint.asm create mode 100644 src/spdk/intel-ipsec-mb/include/des_utils.h create mode 100644 src/spdk/intel-ipsec-mb/include/gcm.h create mode 100644 src/spdk/intel-ipsec-mb/include/gcm_defines.asm create mode 100644 src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm create mode 100644 src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm create mode 100644 src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm create mode 100755 src/spdk/intel-ipsec-mb/include/kasumi_internal.h create mode 100644 src/spdk/intel-ipsec-mb/include/memcpy.asm create mode 100644 src/spdk/intel-ipsec-mb/include/noaesni.h create mode 100644 
src/spdk/intel-ipsec-mb/include/os.asm create mode 100644 src/spdk/intel-ipsec-mb/include/reg_sizes.asm create mode 100644 src/spdk/intel-ipsec-mb/include/save_xmms.asm create mode 100644 src/spdk/intel-ipsec-mb/include/save_xmms.h create mode 100644 src/spdk/intel-ipsec-mb/include/snow3g.h create mode 100644 src/spdk/intel-ipsec-mb/include/snow3g_common.h create mode 100644 src/spdk/intel-ipsec-mb/include/snow3g_internal.h create mode 100644 src/spdk/intel-ipsec-mb/include/transpose_avx2.asm create mode 100644 src/spdk/intel-ipsec-mb/include/transpose_avx512.asm create mode 100644 src/spdk/intel-ipsec-mb/include/wireless_common.asm create mode 100644 src/spdk/intel-ipsec-mb/include/wireless_common.h create mode 100644 src/spdk/intel-ipsec-mb/include/zuc_common.asm create mode 100755 src/spdk/intel-ipsec-mb/include/zuc_internal.h (limited to 'src/spdk/intel-ipsec-mb/include') diff --git a/src/spdk/intel-ipsec-mb/include/aes_common.asm b/src/spdk/intel-ipsec-mb/include/aes_common.asm new file mode 100644 index 000000000..5c8cbb48c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/aes_common.asm @@ -0,0 +1,375 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef _AES_COMMON_ASM_ +%define _AES_COMMON_ASM_ + +%include "include/reg_sizes.asm" + +;; ============================================================================= +;; Generic macro to produce code that executes %%OPCODE instruction +;; on selected number of AES blocks (16 bytes long ) between 0 and 16. +;; All three operands of the instruction come from registers. 
+;; Note: if 3 blocks are left at the end instruction is produced to operate all +;; 4 blocks (full width of ZMM) + +%macro ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 14 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OPCODE %2 ; [in] instruction name +%define %%DST0 %3 ; [out] destination ZMM register +%define %%DST1 %4 ; [out] destination ZMM register +%define %%DST2 %5 ; [out] destination ZMM register +%define %%DST3 %6 ; [out] destination ZMM register +%define %%SRC1_0 %7 ; [in] source 1 ZMM register +%define %%SRC1_1 %8 ; [in] source 1 ZMM register +%define %%SRC1_2 %9 ; [in] source 1 ZMM register +%define %%SRC1_3 %10 ; [in] source 1 ZMM register +%define %%SRC2_0 %11 ; [in] source 2 ZMM register +%define %%SRC2_1 %12 ; [in] source 2 ZMM register +%define %%SRC2_2 %13 ; [in] source 2 ZMM register +%define %%SRC2_3 %14 ; [in] source 2 ZMM register + +%assign reg_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%DSTREG %%DST %+ reg_idx +%xdefine %%SRC1REG %%SRC1_ %+ reg_idx +%xdefine %%SRC2REG %%SRC2_ %+ reg_idx + %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG +%undef %%DSTREG +%undef %%SRC1REG +%undef %%SRC2REG +%assign reg_idx (reg_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep + +%xdefine %%DSTREG %%DST %+ reg_idx +%xdefine %%SRC1REG %%SRC1_ %+ reg_idx +%xdefine %%SRC2REG %%SRC2_ %+ reg_idx + +%if blocks_left == 1 + %%OPCODE XWORD(%%DSTREG), XWORD(%%SRC1REG), XWORD(%%SRC2REG) +%elif blocks_left == 2 + %%OPCODE YWORD(%%DSTREG), YWORD(%%SRC1REG), YWORD(%%SRC2REG) +%elif blocks_left == 3 + %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG +%endif + +%endmacro + +;; ============================================================================= +;; Loads specified number of AES blocks into ZMM registers +;; %%FLAGS are optional and only affect behavior when 3 trailing blocks are left +;; - if %%FlAGS not provided then exactly 3 blocks are loaded (move and insert) +;; - if "load_4_instead_of_3" option is passed then 4 blocks are loaded +%macro ZMM_LOAD_BLOCKS_0_16 7-8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%INP %2 ; [in] input data pointer to read from +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%DST0 %4 ; [out] ZMM register with loaded data +%define %%DST1 %5 ; [out] ZMM register with loaded data +%define %%DST2 %6 ; [out] ZMM register with loaded data +%define %%DST3 %7 ; [out] ZMM register with loaded data +%define %%FLAGS %8 ; [in] optional "load_4_instead_of_3" + +%assign src_offset 0 +%assign dst_idx 0 + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%DSTREG %%DST %+ dst_idx + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%undef %%DSTREG +%assign src_offset (src_offset + 64) +%assign dst_idx (dst_idx + 1) +%endrep + +%assign blocks_left (%%NUM_BLOCKS % 4) +%xdefine %%DSTREG %%DST %+ dst_idx + +%if blocks_left == 1 + vmovdqu8 XWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left == 2 + vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left == 3 +%ifidn %%FLAGS, load_4_instead_of_3 + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%else + vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] + vinserti64x2 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset + 32], 2 +%endif +%endif + +%endmacro + +;; ============================================================================= +;; Loads specified number of AES blocks into ZMM registers using mask register +;; for the last 
loaded register (xmm, ymm or zmm). +;; Loads take place at 1 byte granularity. +%macro ZMM_LOAD_MASKED_BLOCKS_0_16 8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%INP %2 ; [in] input data pointer to read from +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%DST0 %4 ; [out] ZMM register with loaded data +%define %%DST1 %5 ; [out] ZMM register with loaded data +%define %%DST2 %6 ; [out] ZMM register with loaded data +%define %%DST3 %7 ; [out] ZMM register with loaded data +%define %%MASK %8 ; [in] mask register + +%assign src_offset 0 +%assign dst_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%if %%NUM_BLOCKS > 0 +%rep (((%%NUM_BLOCKS + 3) / 4) - 1) +%xdefine %%DSTREG %%DST %+ dst_idx + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%undef %%DSTREG +%assign src_offset (src_offset + 64) +%assign dst_idx (dst_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep +%endif ; %if %%NUM_BLOCKS > 0 + +%xdefine %%DSTREG %%DST %+ dst_idx + +%if blocks_left == 1 + vmovdqu8 XWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left == 2 + vmovdqu8 YWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%elif (blocks_left == 3 || blocks_left == 4) + vmovdqu8 %%DSTREG{%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%endif + +%endmacro + +;; ============================================================================= +;; Stores specified number of AES blocks from ZMM registers +%macro ZMM_STORE_BLOCKS_0_16 7 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OUTP %2 ; [in] output data pointer to write to +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%SRC0 %4 ; [in] ZMM register with data to store +%define %%SRC1 %5 ; [in] ZMM register with data to store +%define %%SRC2 %6 ; [in] ZMM register with data to store +%define %%SRC3 %7 ; [in] ZMM register with data to store + +%assign dst_offset 0 +%assign src_idx 0 + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%SRCREG %%SRC %+ src_idx + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG +%undef %%SRCREG +%assign dst_offset (dst_offset + 64) +%assign src_idx (src_idx + 1) +%endrep + +%assign blocks_left (%%NUM_BLOCKS % 4) +%xdefine %%SRCREG %%SRC %+ src_idx + +%if blocks_left == 1 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], XWORD(%%SRCREG) +%elif blocks_left == 2 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG) +%elif blocks_left == 3 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG) + vextracti32x4 [%%OUTP + %%DATA_OFFSET + dst_offset + 32], %%SRCREG, 2 +%endif + +%endmacro + +;; ============================================================================= +;; Stores specified number of AES blocks from ZMM registers with mask register +;; for the last loaded register (xmm, ymm or zmm). +;; Stores take place at 1 byte granularity. 
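;; ----------------------------------------------------------------------------
;; Editorial note: the lines below are an illustrative sketch only and are not
;; part of the upstream file. They show one way the plain load/store helpers
;; defined above (ZMM_LOAD_BLOCKS_0_16 / ZMM_STORE_BLOCKS_0_16) could be
;; combined to copy 7 AES blocks (112 bytes). The register choices (zmm0-zmm3
;; as data registers, rsi/rdi as source/destination pointers, r12 as a byte
;; offset) are assumptions made for the example, kept entirely in comments so
;; the include file itself is unchanged.
;;
;;      xor     r12, r12                ; byte offset into both buffers = 0
;;      ZMM_LOAD_BLOCKS_0_16  7, rsi, r12, zmm0, zmm1, zmm2, zmm3
;;      ZMM_STORE_BLOCKS_0_16 7, rdi, r12, zmm0, zmm1, zmm2, zmm3
;;
;; With NUM_BLOCKS = 7 each macro emits one full 64-byte vmovdqu8 for blocks
;; 0-3 and then handles the 3 trailing blocks with a 32-byte access plus
;; vinserti64x2 (load) or vextracti32x4 (store), as described above.
;; The masked-store variant follows.
;; ----------------------------------------------------------------------------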
+%macro ZMM_STORE_MASKED_BLOCKS_0_16 8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OUTP %2 ; [in] output data pointer to write to +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%SRC0 %4 ; [in] ZMM register with data to store +%define %%SRC1 %5 ; [in] ZMM register with data to store +%define %%SRC2 %6 ; [in] ZMM register with data to store +%define %%SRC3 %7 ; [in] ZMM register with data to store +%define %%MASK %8 ; [in] mask register + +%assign dst_offset 0 +%assign src_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%if %%NUM_BLOCKS > 0 +%rep (((%%NUM_BLOCKS + 3) / 4) - 1) +%xdefine %%SRCREG %%SRC %+ src_idx + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG +%undef %%SRCREG +%assign dst_offset (dst_offset + 64) +%assign src_idx (src_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep +%endif ; %if %%NUM_BLOCKS > 0 + +%xdefine %%SRCREG %%SRC %+ src_idx + +%if blocks_left == 1 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, XWORD(%%SRCREG) +%elif blocks_left == 2 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, YWORD(%%SRCREG) +%elif (blocks_left == 3 || blocks_left == 4) + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, %%SRCREG +%endif + +%endmacro + +;;; =========================================================================== +;;; Handles AES encryption rounds +;;; It handles special cases: the last and first rounds +;;; Optionally, it performs XOR with data after the last AES round. +;;; Uses NROUNDS parameterto check what needs to be done for the current round. +;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks). +%macro ZMM_AESENC_ROUND_BLOCKS_0_16 12 +%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3 +%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7 +%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11 +%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15 +%define %%KEY %5 ; [in] zmm containing round key +%define %%ROUND %6 ; [in] round number +%define %%D0_3 %7 ; [in] zmm or no_data; plain/cipher text blocks 0-3 +%define %%D4_7 %8 ; [in] zmm or no_data; plain/cipher text blocks 4-7 +%define %%D8_11 %9 ; [in] zmm or no_data; plain/cipher text blocks 8-11 +%define %%D12_15 %10 ; [in] zmm or no_data; plain/cipher text blocks 12-15 +%define %%NUMBL %11 ; [in] number of blocks; numerical value +%define %%NROUNDS %12 ; [in] number of rounds; numerical value + +;;; === first AES round +%if (%%ROUND < 1) + ;; round 0 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; ROUND 0 + +;;; === middle AES rounds +%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS) + ;; rounds 1 to 9/11/13 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenc, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; rounds 1 to 9/11/13 + +;;; === last AES round +%if (%%ROUND > %%NROUNDS) + ;; the last round - mix enclast with text xor's + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenclast, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY + +;;; === XOR with data +%ifnidn %%D0_3, no_data +%ifnidn %%D4_7, no_data +%ifnidn %%D8_11, no_data +%ifnidn %%D12_15, no_data + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, 
%%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%D0_3, %%D4_7, %%D8_11, %%D12_15 +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data + +%endif ; The last round + +%endmacro + +;;; =========================================================================== +;;; Handles AES decryption rounds +;;; It handles special cases: the last and first rounds +;;; Optionally, it performs XOR with data after the last AES round. +;;; Uses NROUNDS parameter to check what needs to be done for the current round. +;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks). +%macro ZMM_AESDEC_ROUND_BLOCKS_0_16 12 +%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3 +%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7 +%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11 +%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15 +%define %%KEY %5 ; [in] zmm containing round key +%define %%ROUND %6 ; [in] round number +%define %%D0_3 %7 ; [in] zmm or no_data; cipher text blocks 0-3 +%define %%D4_7 %8 ; [in] zmm or no_data; cipher text blocks 4-7 +%define %%D8_11 %9 ; [in] zmm or no_data; cipher text blocks 8-11 +%define %%D12_15 %10 ; [in] zmm or no_data; cipher text blocks 12-15 +%define %%NUMBL %11 ; [in] number of blocks; numerical value +%define %%NROUNDS %12 ; [in] number of rounds; numerical value + +;;; === first AES round +%if (%%ROUND < 1) + ;; round 0 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; ROUND 0 + +;;; === middle AES rounds +%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS) + ;; rounds 1 to 9/11/13 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdec, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; rounds 1 to 9/11/13 + +;;; === last AES round +%if (%%ROUND > %%NROUNDS) + ;; the last round - mix enclast with text xor's + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdeclast, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY + +;;; === XOR with data +%ifnidn %%D0_3, no_data +%ifnidn %%D4_7, no_data +%ifnidn %%D8_11, no_data +%ifnidn %%D12_15, no_data + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%D0_3, %%D4_7, %%D8_11, %%D12_15 +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data + +%endif ; The last round + +%endmacro + +%endif ;; _AES_COMMON_ASM diff --git a/src/spdk/intel-ipsec-mb/include/aesni_emu.h b/src/spdk/intel-ipsec-mb/include/aesni_emu.h new file mode 100644 index 000000000..575fada22 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/aesni_emu.h @@ -0,0 +1,120 @@ +/******************************************************************************* + Copyright (c) 2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef _AESNI_EMU_H_ +#define _AESNI_EMU_H_ +#include + +/* Interface to AESNI emulation routines */ + +/* XMM type definitions and constants */ + +#define MAX_BYTES_PER_XMM 16 +#define MAX_WORDS_PER_XMM 8 +#define MAX_DWORDS_PER_XMM 4 +#define MAX_QWORDS_PER_XMM 2 + +union xmm_reg { + uint8_t byte[MAX_BYTES_PER_XMM]; + uint16_t word[MAX_WORDS_PER_XMM]; + uint32_t dword[MAX_DWORDS_PER_XMM]; + uint64_t qword[MAX_QWORDS_PER_XMM]; +}; + +/* AESNI emulation API */ + +/** + * @brief AESKEYGENASIST instruction emulation function + * + * Assist in AES round key generation using an 8 bits Round Constant + * (RCON) specified in \a imm8, operating on 128 bits of data + * + * @param dst pointer to 128 bit buffer to store generated key + * @param src pointer to 128 bit src key + * @param imm8 round constant used to generate key + */ +IMB_DLL_LOCAL void emulate_AESKEYGENASSIST(union xmm_reg *dst, + const union xmm_reg *src, + const uint32_t imm8); + +/** + * @brief AESENC instruction emulation function + * + * Perform one round of an AES encryption flow + * + * @param dst pointer to 128 bit data (state) to operate on + * @param src pointer to 128 bit round key + */ +IMB_DLL_LOCAL void emulate_AESENC(union xmm_reg *dst, + const union xmm_reg *src); + +/** + * @brief AESENCLAST instruction emulation function + * + * Perform last round of an AES encryption flow + * + * @param dst pointer to 128 bit data (state) to operate on + * @param src pointer to 128 bit round key + */ +IMB_DLL_LOCAL void emulate_AESENCLAST(union xmm_reg *dst, + const union xmm_reg *src); + +/** + * @brief AESDEC instruction emulation function + * + * Perform one round of an AES decryption flow + * + * @param dst pointer to 128 bit data (state) to operate on + * @param src pointer to 128 bit round key + */ +IMB_DLL_LOCAL void emulate_AESDEC(union xmm_reg *dst, + const union xmm_reg *src); + +/** + * @brief AESDECLAST instruction emulation function + * + * Perform last round of an AES decryption flow + * + * @param dst pointer to 128 bit data (state) to operate on + * @param src pointer to 128 bit round key + */ +IMB_DLL_LOCAL void emulate_AESDECLAST(union xmm_reg *dst, + const union xmm_reg *src); + +/** + * @brief AESIMC instruction emulation function + * + * Perform the InvMixColumn transformation on + * a 128 bit round key + * + * @param dst pointer to 128 bit buffer to store result + * @param src pointer to 128 bit round key + */ +IMB_DLL_LOCAL void emulate_AESIMC(union xmm_reg *dst, + const 
union xmm_reg *src); + +#endif /* _AESNI_EMU_H_ */ diff --git a/src/spdk/intel-ipsec-mb/include/aesni_emu.inc b/src/spdk/intel-ipsec-mb/include/aesni_emu.inc new file mode 100644 index 000000000..5a40180c8 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/aesni_emu.inc @@ -0,0 +1,247 @@ +;; +;; Copyright (c) 2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef _AESNI_EMU_INC_ +%define _AESNI_EMU_INC_ + +%include "include/reg_sizes.asm" + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Utility macros and defines to assist AESNI translation macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GP0 rax +%define GP1 rbx +%define GP2 rcx +%define GP3 rdx +%define GP4 rbp +%define GP5 rsi +%define GP6 rdi +%define GP7 r8 +%define GP8 r9 +%define GP9 r10 +%define GP10 r11 +%define GP11 r12 +%define GP12 r13 +%define GP13 r14 +%define GP14 r15 +%define NUM_GP_REGS 15 +%define NUM_XMM_REGS 16 + +%define GP_SZ 8 +%define XMM_SZ 16 +%define ARG_SZ 16 + +;; 8 extra bytes added to align to 16 bytes +%define XMM_OFFSET ((NUM_GP_REGS + 1) * GP_SZ) +;; ARG1 placed in the stack after all GP and XMM registers +%define ARG1_OFFSET (XMM_OFFSET + (NUM_XMM_REGS * XMM_SZ)) +;; ARG2 placed in the stack after all GP and XMM registers and ARG1 +%define ARG2_OFFSET (ARG1_OFFSET + ARG_SZ) + +%define GP(x) GP %+ x +%define XMM(x) xmm %+ x + +;; Reserve enough stack space to store all GP and XMM +;; registers and emulation function arguments +;; e.g. 
void emulate_AESXXX(xmm_reg *dst, xmm_reg *src); +%define RES_STACK_SZ (ARG2_OFFSET + ARG_SZ) + +;; Allocate stack space and save GP registers +%macro SAVE_GP_REGS 0 + push rax + mov rax, rsp + sub rsp, RES_STACK_SZ + and rsp, -16 +%assign gp_regs_i 0 +%rep NUM_GP_REGS + mov [rsp + 8*gp_regs_i], GP(gp_regs_i) +%assign gp_regs_i gp_regs_i+1 +%endrep +%endmacro + +;; Restore GP registers and stack pointer +%macro RESTORE_GP_REGS 0 +%assign gp_regs_i 0 +%rep NUM_GP_REGS + mov GP(gp_regs_i), [rsp + 8*gp_regs_i] +%assign gp_regs_i gp_regs_i+1 +%endrep + mov rsp, rax + pop rax +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Generic macro to translate AESNI instructions to AESNI emulation functions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro EMULATE_AESNI 4 +%define %%func %1 +%define %%src_dst %2 +%define %%key %3 +%define %%imm %4 + +%ifdef LINUX +%define %%arg1 rdi +%define %%arg2 rsi +%define %%arg3 rdx +%else +%define %%arg1 rcx +%define %%arg2 rdx +%define %%arg3 r8 +%endif + +;; Check if key is reg or ptr +%assign IS_REG 0 +%assign x 0 +%rep NUM_XMM_REGS +%ifidni %%key, XMM(x) + %assign IS_REG 1 + %exitrep +%endif +%assign x x+1 +%endrep + ;; save GP registers to stack + SAVE_GP_REGS + + ;; move function args onto stack before function call + movdqa [rsp + ARG1_OFFSET], %%src_dst +%if IS_REG + movdqa [rsp + ARG2_OFFSET], %%key +%else + movdqu %%src_dst, %%key + movdqa [rsp + ARG2_OFFSET], %%src_dst +%endif + lea %%arg1, [rsp + ARG1_OFFSET] + lea %%arg2, [rsp + ARG2_OFFSET] + + ;; move 8 bit imm rcon for aeskeygenassist +%ifnum %%imm + mov BYTE(%%arg3), %%imm +%endif + +;; save XMM registers to stack, as some compilers may use them in "func" +%assign reg_idx 0 +%rep NUM_XMM_REGS + movdqa [rsp + XMM_OFFSET + (reg_idx * XMM_SZ)], XMM(reg_idx) +%assign reg_idx reg_idx + 1 +%endrep + +;; reserve space on stack for up to 4 arguments on the stack (windows only) +%ifndef LINUX + sub rsp, 32 +%endif + ;; call emulation function + call %%func +%ifndef LINUX + add rsp, 32 +%endif + +;; restore XMM registers from stack +%assign reg_idx 0 +%rep NUM_XMM_REGS + movdqa XMM(reg_idx), [rsp + XMM_OFFSET + (reg_idx * XMM_SZ)] +%assign reg_idx reg_idx + 1 +%endrep + + ;; Destination XMM gets overwritten with result from func + movdqa %%src_dst, [rsp + ARG1_OFFSET] + + RESTORE_GP_REGS +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Macros to translate AESNI instructions to AESNI emulation functions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; AESENC translation macro +%macro EMULATE_AESENC 2 +%define %%src_dst %1 +%define %%key %2 + EMULATE_AESNI emulate_AESENC, %%src_dst, %%key, "" +%endmacro + +;; AESENCLAST translation macro +%macro EMULATE_AESENCLAST 2 +%define %%src_dst %1 +%define %%key %2 + EMULATE_AESNI emulate_AESENCLAST, %%src_dst, %%key, "" +%endmacro + +;; AESDEC translation macro +%macro EMULATE_AESDEC 2 +%define %%src_dst %1 +%define %%key %2 + EMULATE_AESNI emulate_AESDEC, %%src_dst, %%key, "" +%endmacro + +;; AESDECLAST translation macro +%macro EMULATE_AESDECLAST 2 +%define %%src_dst %1 +%define %%key %2 + EMULATE_AESNI emulate_AESDECLAST, %%src_dst, %%key, "" +%endmacro + +;; AESIMC translation macro +%macro EMULATE_AESIMC 2 +%define %%src_dst %1 +%define %%key %2 + EMULATE_AESNI emulate_AESIMC, %%src_dst, %%key, "" +%endmacro + +;; AESKEYGENASSIST translation macro +%macro EMULATE_AESKEYGENASSIST 3 
+%define %%src_dst %1 +%define %%key %2 +%define %%imm %3 + EMULATE_AESNI emulate_AESKEYGENASSIST, %%src_dst, %%key, %%imm +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; AESNI defines +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef NO_AESNI_RENAME +%define aesenc EMULATE_AESENC +%define aesenclast EMULATE_AESENCLAST +%define aesdec EMULATE_AESDEC +%define aesdeclast EMULATE_AESDECLAST +%define aesimc EMULATE_AESIMC +%define aeskeygenassist EMULATE_AESKEYGENASSIST +%endif + +extern emulate_AESENC +extern emulate_AESENCLAST +extern emulate_AESDEC +extern emulate_AESDECLAST +extern emulate_AESIMC +extern emulate_AESKEYGENASSIST + +%endif ; end ifndef _AESNI_EMU_INC_ diff --git a/src/spdk/intel-ipsec-mb/include/clear_regs.asm b/src/spdk/intel-ipsec-mb/include/clear_regs.asm new file mode 100644 index 000000000..6cb48c49e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/clear_regs.asm @@ -0,0 +1,196 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%ifndef _CLEAR_REGS_ASM_ +%define _CLEAR_REGS_ASM_ + +%include "include/os.asm" + +; +; This macro clears any GP registers passed +; +%macro clear_gps 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + xor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any XMM registers passed on SSE +; +%macro clear_xmms_sse 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + pxor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any XMM registers passed on AVX +; +%macro clear_xmms_avx 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + vpxor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any YMM registers passed +; +%macro clear_ymms 1-16 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + vpxor %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears any ZMM registers passed +; +%macro clear_zmms 1-32 +%define %%NUM_REGS %0 +%rep %%NUM_REGS + vpxorq %1, %1 +%rotate 1 +%endrep +%endmacro + +; +; This macro clears all scratch GP registers +; for Windows or Linux +; +%macro clear_scratch_gps_asm 0 + clear_gps rax, rcx, rdx, r8, r9, r10, r11 +%ifdef LINUX + clear_gps rdi, rsi +%endif +%endmacro + +; +; This macro clears all scratch XMM registers on SSE +; +%macro clear_scratch_xmms_sse_asm 0 +%ifdef LINUX +%assign i 0 +%rep 16 + pxor xmm %+ i, xmm %+ i +%assign i (i+1) +%endrep +; On Windows, XMM0-XMM5 registers are scratch registers +%else +%assign i 0 +%rep 6 + pxor xmm %+ i, xmm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +; +; This macro clears all scratch XMM registers on AVX +; +%macro clear_scratch_xmms_avx_asm 0 +%ifdef LINUX + vzeroall +; On Windows, XMM0-XMM5 registers are scratch registers +%else +%assign i 0 +%rep 6 + vpxor xmm %+ i, xmm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +; +; This macro clears all scratch YMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15) +; +%macro clear_scratch_ymms_asm 0 +; On Linux, all YMM registers are scratch registers +%ifdef LINUX + vzeroall +; On Windows, YMM0-YMM5 registers are scratch registers. +; YMM6-YMM15 upper 128 bits are scratch registers too, but +; the lower 128 bits are to be restored after calling these function +; which clears the upper bits too. +%else +%assign i 0 +%rep 6 + vpxor ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +; +; This macro clears all scratch ZMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15). YMM registers are used +; on purpose, since XOR'ing YMM registers is faster +; than XOR'ing ZMM registers, and the operation clears +; also the upper 256 bits +; +%macro clear_scratch_zmms_asm 0 +; On Linux, all ZMM registers are scratch registers +%ifdef LINUX + vzeroall + ;; vzeroall only clears the first 16 ZMM registers +%assign i 16 +%rep 16 + vpxorq ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep +; On Windows, ZMM0-ZMM5 and ZMM16-ZMM31 registers are scratch registers. +; ZMM6-ZMM15 upper 384 bits are scratch registers too, but +; the lower 128 bits are to be restored after calling these function +; which clears the upper bits too. 
+%else +%assign i 0 +%rep 6 + vpxorq ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep + +%assign i 16 +%rep 16 + vpxorq ymm %+ i, ymm %+ i +%assign i (i+1) +%endrep +%endif ; LINUX +%endmacro + +%endif ;; _CLEAR_REGS_ASM diff --git a/src/spdk/intel-ipsec-mb/include/clear_regs_mem.h b/src/spdk/intel-ipsec-mb/include/clear_regs_mem.h new file mode 100644 index 000000000..40f888ec4 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/clear_regs_mem.h @@ -0,0 +1,53 @@ +/******************************************************************************* + Copyright (c) 2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef CLEAR_REGS_H +#define CLEAR_REGS_H + +#define CLEAR_SCRATCH_GPS clear_scratch_gps + +void force_memset_zero(void *mem, const size_t size); + +static inline void +clear_mem(void *mem, const size_t size) +{ + force_memset_zero(mem, size); +} + +static inline void +clear_var(void *var, const size_t size) +{ + force_memset_zero(var, size); +} + +void clear_scratch_gps(void); +void clear_scratch_xmms_sse(void); +void clear_scratch_xmms_avx(void); +void clear_scratch_ymms(void); +void clear_scratch_zmms(void); + +#endif /* CLEAR_REGS_H */ diff --git a/src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm b/src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm new file mode 100644 index 000000000..4fd6f7edb --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/clear_regs_mem_fns.asm @@ -0,0 +1,124 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "include/clear_regs.asm" + +section .text +; +; This function clears all scratch GP registers +; +; void clear_scratch_gps(void) +MKGLOBAL(clear_scratch_gps,function,internal) +clear_scratch_gps: + + clear_scratch_gps_asm + + ret + +; +; This function clears all scratch XMM registers +; +; void clear_scratch_xmms_sse(void) +MKGLOBAL(clear_scratch_xmms_sse,function,internal) +clear_scratch_xmms_sse: + + clear_scratch_xmms_sse_asm + + ret + +; +; This function clears all scratch XMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15) +; +; void clear_scratch_xmms_avx(void) +MKGLOBAL(clear_scratch_xmms_avx,function,internal) +clear_scratch_xmms_avx: + + clear_scratch_xmms_avx_asm + + ret + +; +; This function clears all scratch YMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15) +; +; void clear_scratch_ymms(void) +MKGLOBAL(clear_scratch_ymms,function,internal) +clear_scratch_ymms: + + clear_scratch_ymms_asm + + ret + +; +; This function clears all scratch ZMM registers +; +; It should be called before restoring the XMM registers +; for Windows (XMM6-XMM15). YMM registers are used +; on purpose, since XOR'ing YMM registers is faster +; than XOR'ing ZMM registers, and the operation clears +; also the upper 256 bits +; +; void clear_scratch_zmms(void) +MKGLOBAL(clear_scratch_zmms,function,internal) +clear_scratch_zmms: + + clear_scratch_zmms_asm + + ret + +; +; This function clears all memory passed +; +; void force_memset_zero(void *mem, const size_t size) +MKGLOBAL(force_memset_zero,function,internal) +force_memset_zero: + +%ifdef LINUX + mov rcx, rsi +%else + push rdi + mov rdi, rcx + mov rcx, rdx +%endif + xor eax, eax + cld + rep stosb + +%ifndef LINUX + pop rdi +%endif + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/include/const.inc b/src/spdk/intel-ipsec-mb/include/const.inc new file mode 100644 index 000000000..e77e80d2e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/const.inc @@ -0,0 +1,163 @@ +;; +;; Copyright (c) 2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef _CONST_INC_ +%define _CONST_INC_ + +;;; Tables used to insert word into a SIMD register +extern len_shift_tab +extern len_mask_tab +extern shift_tab_16 + +;;; Table to do 0x80 byte shift for padding prefix +extern padding_0x80_tab16 + +;;; Size of len_shift_tab defined in const.asm module +%define len_tab_diff 128 + +; PINSRW_COMMON insert word into 128 bit SIMD register +%macro PINSRW_COMMON 7 + +%define %%type %1 ; instruction type - sse or avx +%define %%dest %2 ; dest XMM reg to insert word +%define %%tmp_simd %3 ; XMM reg to clobber +%define %%tmp_gp %4 ; GP reg to clobber +%define %%idx %5 ; word index to insert value into XMM +%define %%val %6 ; word value to insert into idx +%define %%scale_idx %7 ; flag to set if index is to be scaled x16 + +%ifidn %%scale_idx, scale_x16 + shl %%idx, 4 ; scale idx up x16 +%endif +%ifnum %%val + ;; immediate value passed on + mov DWORD(%%tmp_gp), %%val +%ifidn %%type, sse + movd %%tmp_simd, DWORD(%%tmp_gp) +%else + vmovd %%tmp_simd, DWORD(%%tmp_gp) +%endif +%else + ;; register name passed on +%ifidn %%type, sse + movd %%tmp_simd, DWORD(%%val) +%else + vmovd %%tmp_simd, DWORD(%%val) +%endif +%endif + lea %%tmp_gp, [rel len_shift_tab] + ;; check type - SSE or AVX +%ifidn %%type, sse + pshufb %%tmp_simd, [%%tmp_gp + %%idx] + pand %%dest, [%%tmp_gp + len_tab_diff + %%idx] + por %%dest, %%tmp_simd +%else + vpshufb %%tmp_simd, [%%tmp_gp + %%idx] + vpand %%dest, [%%tmp_gp + len_tab_diff + %%idx] + vpor %%dest, %%tmp_simd +%endif +%ifidn %%scale_idx, scale_x16 + shr %%idx, 4 ; reset idx +%endif +%endmacro + +;;; Call SSE macro +%define XPINSRW PINSRW_COMMON sse, + +;;; Call AVX macro +%define XVPINSRW PINSRW_COMMON avx, + + +;;; VPINSRW_M256 insert word into 32 byte memory range +%macro VPINSRW_M256 8 + +%define %%mem_addr %1 ; 16 byte aligned memory address to insert word +%define %%tmp_simd1 %2 ; XMM reg to clobber +%define %%tmp_simd2 %3 ; XMM reg to clobber +%define %%tmp_gp %4 ; GP reg to clobber +%define %%offset %5 ; GP reg used to store offset +%define %%idx %6 ; word index to insert value +%define %%val %7 ; word value to insert into idx +%define %%scale_idx %8 ; flag to set if index is to be scaled x16 + + mov %%offset, %%idx + and %%offset, 0x8 ; set offset 0 or 8 + and %%idx, 0x7 ; remove offset from idx + vmovdqa %%tmp_simd1, [%%mem_addr 
+ %%offset*2] + XVPINSRW %%tmp_simd1, %%tmp_simd2, %%tmp_gp, %%idx, %%val, %%scale_idx + vmovdqa [%%mem_addr + %%offset*2], %%tmp_simd1 + or %%idx, %%offset ; reset offset +%endmacro + +;;; PSLB_COMMON shift bytes 128 bit SIMD register +%macro PSLB_COMMON 6 + +%define %%type %1 ; [in] instruction type - sse or avx +%define %%dir %2 ; [in] shift direction - left or right +%define %%reg %3 ; [in/out] XMM reg to shift bytes +%define %%num %4 ; [in] GP reg containing number of bytes to shift +%define %%shuf_tab %5 ; [out] XMM reg to store shuffle table +%define %%tmp_gp %6 ; [clobbered] GP reg to clobber + + ;; load shift table into %%shuf_tab + lea %%tmp_gp, [rel shift_tab_16 + 16] +%ifidn %%dir, left + sub %%tmp_gp, %%num +%else + add %%tmp_gp, %%num +%endif + +%ifidn %%type, sse + movdqu %%shuf_tab, [%%tmp_gp] + pshufb %%reg, %%shuf_tab +%else + vmovdqu %%shuf_tab, [%%tmp_gp] + vpshufb %%reg, %%shuf_tab +%endif +%endmacro + +;;; Call SSE left shift macro +%macro XPSLLB 4 + PSLB_COMMON sse, left, %1,%2,%3,%4 +%endm + +;;; Call SSE right shift macro +%macro XPSRLB 4 + PSLB_COMMON sse, right, %1,%2,%3,%4 +%endm + +;;; Call AVX left shift macro +%macro XVPSLLB 4 + PSLB_COMMON avx, left, %1,%2,%3,%4 +%endm + +;;; Call AVX right shift macro +%macro XVPSRLB 4 + PSLB_COMMON avx, right, %1,%2,%3,%4 +%endm + +%endif ; end ifndef _CONST_INC_ diff --git a/src/spdk/intel-ipsec-mb/include/constant_lookup.asm b/src/spdk/intel-ipsec-mb/include/constant_lookup.asm new file mode 100644 index 000000000..a3c81dc75 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/constant_lookup.asm @@ -0,0 +1,561 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "include/os.asm" +%include "include/reg_sizes.asm" + +section .data +default rel + +align 16 +idx_tab8: + db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + db 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + +align 16 +add_16: + db 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + db 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10 + +align 16 +idx_tab16: + dw 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + +align 16 +add_8: + dw 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8 + +align 16 +idx_tab32: + dd 0x0, 0x1, 0x2, 0x3 + +align 16 +add_4: + dd 0x4, 0x4, 0x4, 0x4 + +align 16 +idx_tab64: + dq 0x0, 0x1 + +add_2: + dq 0x2, 0x2 + +align 16 +bcast_mask: + db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01 + +section .text + +%ifdef LINUX + %define arg1 rdi + %define arg2 rsi + %define arg3 rdx +%else + %define arg1 rcx + %define arg2 rdx + %define arg3 r8 +%endif + +%define bcast_idx xmm0 +%define xadd xmm1 +%define accum_val xmm2 +%define xindices xmm3 +%define xtmp xmm4 +%define xtmp2 xmm5 +%define tmp r9 +%define offset r10 + +%define table arg1 +%define idx arg2 +%define size arg3 + +; uint8_t lookup_8bit_sse(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up (multiple of 16 bytes) +MKGLOBAL(lookup_8bit_sse,function,internal) +lookup_8bit_sse: + + ;; Number of loop iters = matrix size / 4 (number of values in XMM) + shr size, 4 + je exit8_sse + + xor offset, offset + + ;; Broadcast idx to look up + movd bcast_idx, DWORD(idx) + pxor xtmp, xtmp + pxor accum_val, accum_val + pshufb bcast_idx, xtmp + + movdqa xadd, [rel add_16] + movdqa xindices, [rel idx_tab8] + +loop8_sse: + movdqa xtmp, xindices + + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + pcmpeqb xtmp, bcast_idx + + ;; Load next 16 values + movdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + pand xtmp2, xtmp + + por accum_val, xtmp2 + + ;; Get next 16 indices + paddb xindices, xadd + + add offset, 16 + dec size + + jne loop8_sse + + ;; Extract value from XMM register + movdqa xtmp, accum_val + pslldq xtmp, 8 ; shift left by 64 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + pslldq xtmp, 4 ; shift left by 32 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + pslldq xtmp, 2 ; shift left by 16 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + pslldq xtmp, 1 ; shift left by 8 bits + por accum_val, xtmp + + pextrb rax, accum_val, 15 + +exit8_sse: + ret + +; uint8_t lookup_8bit_avx(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up (multiple of 16 bytes) +MKGLOBAL(lookup_8bit_avx,function,internal) +lookup_8bit_avx: + ;; Number of loop iters = matrix size / 4 (number of values in XMM) + shr size, 4 + je exit8_avx + + xor offset, offset + + ;; Broadcast idx to look up + vmovd bcast_idx, DWORD(idx) + vpxor xtmp, xtmp + vpxor accum_val, accum_val + vpshufb bcast_idx, xtmp + + vmovdqa xadd, [rel add_16] + vmovdqa xindices, [rel idx_tab8] + +loop8_avx: + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + vpcmpeqb xtmp, xindices, bcast_idx + + ;; Load next 16 values + vmovdqa xtmp2, [table + offset] + + ;; This generates data with all 0s 
except the value we are looking for in the index to look up + vpand xtmp2, xtmp + + vpor accum_val, xtmp2 + + ;; Get next 16 indices + vpaddb xindices, xadd + + add offset, 16 + dec size + + jne loop8_avx + + ;; Extract value from XMM register + vpslldq xtmp, accum_val, 8 ; shift left by 64 bits + vpor accum_val, xtmp + + vpslldq xtmp, accum_val, 4 ; shift left by 32 bits + vpor accum_val, xtmp + + vpslldq xtmp, accum_val, 2 ; shift left by 16 bits + vpor accum_val, xtmp + + vpslldq xtmp, accum_val, 1 ; shift left by 8 bits + vpor accum_val, xtmp + + vpextrb rax, accum_val, 15 + +exit8_avx: + + ret + +; uint8_t lookup_16bit_sse(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_16bit_sse,function,internal) +lookup_16bit_sse: + + ;; Number of loop iters = matrix size / 8 (number of values in XMM) + shr size, 3 + je exit16_sse + + xor offset, offset + + ;; Broadcast idx to look up + movd bcast_idx, DWORD(idx) + movdqa xtmp, [rel bcast_mask] + pxor accum_val, accum_val + pshufb bcast_idx, xtmp + + movdqa xadd, [rel add_8] + movdqa xindices, [rel idx_tab16] + +loop16_sse: + + movdqa xtmp, xindices + + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + pcmpeqw xtmp, bcast_idx + + ;; Load next 8 values + movdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + pand xtmp2, xtmp + + por accum_val, xtmp2 + + ;; Get next 8 indices + paddw xindices, xadd + add offset, 16 + dec size + + jne loop16_sse + + ;; Extract value from XMM register + movdqa xtmp, accum_val + pslldq xtmp, 8 ; shift left by 64 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + pslldq xtmp, 4 ; shift left by 32 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + pslldq xtmp, 2 ; shift left by 16 bits + por accum_val, xtmp + + pextrw rax, accum_val, 7 + +exit16_sse: + ret + +; uint8_t lookup_16bit_avx(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_16bit_avx,function,internal) +lookup_16bit_avx: + + ;; Number of loop iters = matrix size / 8 (number of values in XMM) + shr size, 3 + je exit16_avx + + xor offset, offset + + ;; Broadcast idx to look up + vmovd bcast_idx, DWORD(idx) + vmovdqa xtmp, [rel bcast_mask] + vpxor accum_val, accum_val + vpshufb bcast_idx, xtmp + + vmovdqa xadd, [rel add_8] + vmovdqa xindices, [rel idx_tab16] + +loop16_avx: + + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + vpcmpeqw xtmp, xindices, bcast_idx + + ;; Load next 16 values + vmovdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + vpand xtmp2, xtmp + + vpor accum_val, xtmp2 + + ;; Get next 8 indices + vpaddw xindices, xadd + add offset, 16 + dec size + + jne loop16_avx + + ;; Extract value from XMM register + vpslldq xtmp, accum_val, 8 ; shift left by 64 bits + vpor accum_val, xtmp + + vpslldq xtmp, accum_val, 4 ; shift left by 32 bits + vpor accum_val, xtmp + + vpslldq xtmp, accum_val, 2 ; shift left by 16 bits + vpor accum_val, xtmp + + vpextrw rax, accum_val, 7 + +exit16_avx: + ret + +; uint32_t lookup_32bit_sse(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 
: pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_32bit_sse,function,internal) +lookup_32bit_sse: + + ;; Number of loop iters = matrix size / 4 (number of values in XMM) + shr size, 2 + je exit32_sse + + xor offset, offset + + ;; Broadcast idx to look up + movd bcast_idx, DWORD(idx) + pxor accum_val, accum_val + pshufd bcast_idx, bcast_idx, 0 + + movdqa xadd, [rel add_4] + movdqa xindices, [rel idx_tab32] + +loop32_sse: + movdqa xtmp, xindices + + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + pcmpeqd xtmp, bcast_idx + + ;; Load next 4 values + movdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + pand xtmp2, xtmp + + por accum_val, xtmp2 + + ;; Get next 4 indices + paddd xindices, xadd + add offset, 16 + dec size + + jne loop32_sse + + ;; Extract value from XMM register + movdqa xtmp, accum_val + psrldq xtmp, 8 ; shift right by 64 bits + por accum_val, xtmp + + movdqa xtmp, accum_val + psrldq xtmp, 4 ; shift right by 32 bits + por accum_val, xtmp + + movd eax, accum_val + +exit32_sse: + ret + + +; uint32_t lookup_32bit_avx(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_32bit_avx,function,internal) +lookup_32bit_avx: + ;; Number of loop iters = matrix size / 4 (number of values in XMM) + shr size, 2 + je exit32_avx + + xor offset, offset + + ;; Broadcast idx to look up + vmovd bcast_idx, DWORD(idx) + vpxor accum_val, accum_val + vpshufd bcast_idx, bcast_idx, 0 + + vmovdqa xadd, [rel add_4] + vmovdqa xindices, [rel idx_tab32] + +loop32_avx: + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + vpcmpeqd xtmp, xindices, bcast_idx + + ;; Load next 4 values + vmovdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + vpand xtmp2, xtmp + + vpor accum_val, xtmp2 + + ;; Get next 4 indices + vpaddd xindices, xadd + add offset, 16 + dec size + + jne loop32_avx + + ;; Extract value from XMM register + vpsrldq xtmp, accum_val, 8 ; shift right by 64 bits + vpor accum_val, xtmp + + vpsrldq xtmp, accum_val, 4 ; shift right by 32 bits + vpor accum_val, xtmp + + vmovd eax, accum_val + +exit32_avx: + ret + + +; uint64_t lookup_64bit_sse(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_64bit_sse,function,internal) +lookup_64bit_sse: + ;; Number of loop iters = matrix size / 2 (number of values in XMM) + shr size, 1 + je exit64_sse + + xor offset, offset + + ;; Broadcast idx to look up + movq bcast_idx, idx + pxor accum_val, accum_val + pinsrq bcast_idx, idx, 1 + + movdqa xadd, [rel add_2] + movdqa xindices, [rel idx_tab64] + +loop64_sse: + movdqa xtmp, xindices + + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + pcmpeqq xtmp, bcast_idx + + ;; Load next 2 values + movdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + pand xtmp2, xtmp + + por accum_val, xtmp2 + + ;; Get next 2 indices + paddq xindices, xadd + add offset, 16 + dec size + + 
jne loop64_sse + + ;; Extract value from XMM register + movdqa xtmp, accum_val + psrldq xtmp, 8 ; shift right by 64 bits + por accum_val, xtmp + + movq rax, accum_val + +exit64_sse: + ret + + +; uint64_t lookup_64bit_avx(const void *table, const uint32_t idx, const uint32_t size); +; arg 1 : pointer to table to look up +; arg 2 : index to look up +; arg 3 : size of table to look up +MKGLOBAL(lookup_64bit_avx,function,internal) +lookup_64bit_avx: + ;; Number of loop iters = matrix size / 2 (number of values in XMM) + shr size, 1 + je exit64_avx + + xor offset, offset + + vmovq bcast_idx, idx + vpxor accum_val, accum_val + vpinsrq bcast_idx, idx, 1 + + vmovdqa xadd, [rel add_2] + vmovdqa xindices, [rel idx_tab64] + +loop64_avx: + ;; Compare indices with idx + ;; This generates a mask with all 0s except for the position where idx matches (all 1s here) + vpcmpeqq xtmp, xindices, bcast_idx + + ;; Load next 2 values + vmovdqa xtmp2, [table + offset] + + ;; This generates data with all 0s except the value we are looking for in the index to look up + vpand xtmp2, xtmp + + vpor accum_val, xtmp2 + + ;; Get next 2 indices + vpaddq xindices, xadd + add offset, 16 + dec size + + jne loop64_avx + + ;; Extract value from XMM register + vpsrldq xtmp, accum_val, 8 ; shift right by 64 bits + vpor accum_val, xtmp + + vmovq rax, accum_val + +exit64_avx: + ret + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/include/constant_lookup.h b/src/spdk/intel-ipsec-mb/include/constant_lookup.h new file mode 100644 index 000000000..bd56a24d2 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/constant_lookup.h @@ -0,0 +1,173 @@ +/******************************************************************************* + Copyright (c) 2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#ifndef CONSTANT_LOOKUP_H +#define CONSTANT_LOOKUP_H + +#include "intel-ipsec-mb.h" + +#ifdef SAFE_LOOKUP +#define LOOKUP8_SSE(_table, _idx, _size) \ + lookup_8bit_sse(_table, _idx, _size) +#define LOOKUP8_AVX(_table, _idx, _size) \ + lookup_8bit_avx(_table, _idx, _size) +#define LOOKUP16_SSE(_table, _idx, _size) \ + lookup_16bit_sse(_table, _idx, _size) +#define LOOKUP16_AVX(_table, _idx, _size) \ + lookup_16bit_avx(_table, _idx, _size) +#define LOOKUP32_SSE(_table, _idx, _size) \ + lookup_32bit_sse(_table, _idx, _size) +#define LOOKUP32_AVX(_table, _idx, _size) \ + lookup_32bit_avx(_table, _idx, _size) +#define LOOKUP64_SSE(_table, _idx, _size) \ + lookup_64bit_sse(_table, _idx, _size) +#define LOOKUP64_AVX(_table, _idx, _size) \ + lookup_64bit_avx(_table, _idx, _size) +#else +#define LOOKUP8_SSE(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP8_AVX(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP16_SSE(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP16_AVX(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP32_SSE(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP32_AVX(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP64_SSE(_table, _idx, _size) \ + _table[_idx] +#define LOOKUP64_AVX(_table, _idx, _size) \ + _table[_idx] +#endif + +/* + * @brief Constant time SSE lookup function on variable size table + * with 8-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 8 bit elements in the table (multiple of 16) + * + * @return value to lookup + */ +uint8_t +lookup_8bit_sse(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time AVX lookup function on variable size table + * with 8-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 8 bit elements in the table (multiple of 16) + * + * @return value to lookup + */ +uint8_t +lookup_8bit_avx(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time SSE lookup function on variable size table + * with 16-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 16 bit elements in the table (multiple of 8) + * + * @return value to lookup + */ +uint16_t +lookup_16bit_sse(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time AVX lookup function on variable size table + * with 16-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 16 bit elements in the table (multiple of 8) + * + * @return value to lookup + */ +uint16_t +lookup_16bit_avx(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time SSE lookup function on + * variable size table with 32-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 32 bit elements in the table (multiple of 4) + * + * @return value to lookup + */ +uint32_t +lookup_32bit_sse(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time AVX lookup function on + * variable size table with 32-bit values + * + * @param[in] table Pointer to the table to look up 
(16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 32 bit elements in the table (multiple of 4) + * + * @return value to lookup + */ +uint32_t +lookup_32bit_avx(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time SSE lookup function on + * variable size table with 64-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 64 bit elements in the table (multiple of 2) + * + * @return value to lookup + */ +uint64_t +lookup_64bit_sse(const void *table, const uint32_t idx, const uint32_t size); + +/* + * @brief Constant time AVX lookup function on + * variable size table with 64-bit values + * + * @param[in] table Pointer to the table to look up (16-byte aligned) + * @param[in] idx Index to look up + * @param[in] size Number of 64 bit elements in the table (multiple of 2) + * + * @return value to lookup + */ +uint64_t +lookup_64bit_avx(const void *table, const uint32_t idx, const uint32_t size); + +#endif /* CONSTANT_LOOKUP_H */ diff --git a/src/spdk/intel-ipsec-mb/include/cpu_feature.h b/src/spdk/intel-ipsec-mb/include/cpu_feature.h new file mode 100644 index 000000000..1347094a7 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/cpu_feature.h @@ -0,0 +1,52 @@ +/******************************************************************************* + Copyright (c) 2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "intel-ipsec-mb.h" + +#ifndef CPU_FEATURE_H +#define CPU_FEATURE_H + +/** + * @brief Detects hardware features and returns their status + * + * @return Bitmask representing presence of CPU features/extensions, + * see intel-ipsec-mb.h IMB_FEATURE_xyz definitions for details. 
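 *
 * A minimal usage sketch, assuming IMB_FEATURE_AESNI (one of the
 * IMB_FEATURE_xyz bits mentioned above) and a hypothetical
 * use_aesni_paths() helper:
 *
 *     uint64_t feat = cpu_feature_detect();
 *
 *     if (feat & IMB_FEATURE_AESNI)
 *             use_aesni_paths(); // select AES-NI based code paths
 *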
+ */ +IMB_DLL_LOCAL uint64_t cpu_feature_detect(void); + +/** + * @brief Modifies CPU \a features mask based on requested \a flags + * + * @param flags bitmask describing CPU feature adjustments + * @param features bitmask describing present CPU features + * + * @return \a features with applied modifications on them via \a flags + */ +IMB_DLL_LOCAL uint64_t +cpu_feature_adjust(const uint64_t flags, uint64_t features); + +#endif /* CPU_FEATURE_H */ diff --git a/src/spdk/intel-ipsec-mb/include/datastruct.asm b/src/spdk/intel-ipsec-mb/include/datastruct.asm new file mode 100644 index 000000000..0ab1113ab --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/datastruct.asm @@ -0,0 +1,235 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; Macros for defining data structures + +; Usage example + +;START_FIELDS ; JOB_AES +;;; name size align +;FIELD _plaintext, 8, 8 ; pointer to plaintext +;FIELD _ciphertext, 8, 8 ; pointer to ciphertext +;FIELD _IV, 16, 8 ; IV +;FIELD _keys, 8, 8 ; pointer to keys +;FIELD _len, 4, 4 ; length in bytes +;FIELD _status, 4, 4 ; status enumeration +;FIELD _user_data, 8, 8 ; pointer to user data +;UNION _union, size1, align1, \ +; size2, align2, \ +; size3, align3, \ +; ... +;END_FIELDS +;%assign _JOB_AES_size _FIELD_OFFSET +;%assign _JOB_AES_align _STRUCT_ALIGN + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Alternate "struc-like" syntax: +; STRUCT job_aes2 +; RES_Q .plaintext, 1 +; RES_Q .ciphertext, 1 +; RES_DQ .IV, 1 +; RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN +; RES_U .union, size1, align1, \ +; size2, align2, \ +; ... +; ENDSTRUCT +; ; Following only needed if nesting +; %assign job_aes2_size _FIELD_OFFSET +; %assign job_aes2_align _STRUCT_ALIGN +; +; RES_* macros take a name, a count and an optional alignment. +; The count in in terms of the base size of the macro, and the +; default alignment is the base size. 
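; (For example, "RES_D .len, 4" reserves four dwords, i.e. 16 bytes, with the
; default 4-byte alignment.)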
+; The macros are: +; Macro Base size +; RES_B 1 +; RES_W 2 +; RES_D 4 +; RES_Q 8 +; RES_DQ 16 +; RES_Y 32 +; RES_Z 64 +; +; RES_U defines a union. It's arguments are a name and two or more +; pairs of "size, alignment" +; +; The two assigns are only needed if this structure is being nested +; within another. Even if the assigns are not done, one can still use +; STRUCT_NAME_size as the size of the structure. +; +; Note that for nesting, you still need to assign to STRUCT_NAME_size. +; +; The differences between this and using "struc" directly are that each +; type is implicitly aligned to its natural length (although this can be +; over-ridden with an explicit third parameter), and that the structure +; is padded at the end to its overall alignment. +; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _DATASTRUCT_ASM_ +%define _DATASTRUCT_ASM_ + +;; START_FIELDS +%macro START_FIELDS 0 +%assign _FIELD_OFFSET 0 +%assign _STRUCT_ALIGN 0 +%endm + +;; FIELD name size align +%macro FIELD 3 +%define %%name %1 +%define %%size %2 +%define %%align %3 + +%assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1)) +%%name equ _FIELD_OFFSET +%assign _FIELD_OFFSET _FIELD_OFFSET + (%%size) +%if (%%align > _STRUCT_ALIGN) +%assign _STRUCT_ALIGN %%align +%endif +%endm + +;; END_FIELDS +%macro END_FIELDS 0 +%assign _FIELD_OFFSET (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1)) +%endm + +%macro UNION 5-* +%if (0 == (%0 & 1)) + %error EVEN number of parameters to UNION Macro + %err +%endif +%rotate 1 + %assign _UNION_SIZE %1 + %assign _UNION_ALIGN %2 +%rep (%0 - 3)/2 + %rotate 2 + %if (%1 > _UNION_SIZE) + %assign _UNION_SIZE %1 + %endif + %if (%2 > _UNION_ALIGN) + %assign _UNION_ALIGN %2 + %endif +%endrep +%rotate 2 +FIELD %1, _UNION_SIZE, _UNION_ALIGN +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro STRUCT 1 +START_FIELDS +struc %1 +%endm + +%macro ENDSTRUCT 0 +%assign %%tmp _FIELD_OFFSET +END_FIELDS +%assign %%tmp (_FIELD_OFFSET - %%tmp) +%if (%%tmp > 0) + resb %%tmp +%endif +endstruc +%endm + +;; RES_int name size align +%macro RES_int 3 +%define %%name %1 +%define %%size %2 +%define %%align %3 + +%assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1)) +align %%align +%%name resb %%size +%assign _FIELD_OFFSET _FIELD_OFFSET + (%%size) +%if (%%align > _STRUCT_ALIGN) +%assign _STRUCT_ALIGN %%align +%endif +%endm + + + +; macro RES_B name, size [, align] +%macro RES_B 2-3 1 +RES_int %1, %2, %3 +%endm + +; macro RES_W name, size [, align] +%macro RES_W 2-3 2 +RES_int %1, 2*(%2), %3 +%endm + +; macro RES_D name, size [, align] +%macro RES_D 2-3 4 +RES_int %1, 4*(%2), %3 +%endm + +; macro RES_Q name, size [, align] +%macro RES_Q 2-3 8 +RES_int %1, 8*(%2), %3 +%endm + +; macro RES_DQ name, size [, align] +%macro RES_DQ 2-3 16 +RES_int %1, 16*(%2), %3 +%endm + +; macro RES_Y name, size [, align] +%macro RES_Y 2-3 32 +RES_int %1, 32*(%2), %3 +%endm + +; macro RES_Z name, size [, align] +%macro RES_Z 2-3 64 +RES_int %1, 64*(%2), %3 +%endm + + +%macro RES_U 5-* +%if (0 == (%0 & 1)) + %error EVEN number of parameters to RES_U Macro + %err +%endif +%rotate 1 + %assign _UNION_SIZE %1 + %assign _UNION_ALIGN %2 +%rep (%0 - 3)/2 + %rotate 2 + %if (%1 > _UNION_SIZE) + %assign _UNION_SIZE %1 + %endif + %if (%2 > _UNION_ALIGN) + %assign _UNION_ALIGN %2 + %endif +%endrep +%rotate 2 +RES_int %1, _UNION_SIZE, _UNION_ALIGN +%endm + +%endif ; end ifdef _DATASTRUCT_ASM_ diff --git 
a/src/spdk/intel-ipsec-mb/include/dbgprint.asm b/src/spdk/intel-ipsec-mb/include/dbgprint.asm new file mode 100644 index 000000000..d14eb0ebc --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/dbgprint.asm @@ -0,0 +1,413 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; Macros for "printing" for debug purposes from within asm code +; +; The basic macros are: +; DBGPRINT16, DBGPRINT32, DBGPRINT64, DBGPRINT_XMM, DBGPRINT_YMM, DBGPRINT_ZMM +; These are called with 1 or more arguments, all of which are of the +; size/type as specified in the name. E.g. +; DBGPRINT64 reg1, reg2, reg3, ... +; +; There is also a macro DEBUGPRINTL that takes one argument, a string. E.g. +; DBGPRINTL "hit this point in the code" +; +; There are also variations on these with the "DBGPRINT" suffixed with "L", e.g. +; DBGPRINTL64. These take two or more arguments, where the first is a string, +; and the rest are of the specified type, e.g. +; DBGPRINTL64 "Rindex", Rindex +; Essentially, this is the same as a DBGPRINTL followed by DBGPRINT64. +; +; If DO_DBGPRINT is defined, then the macros write the debug information into +; a buffer. If DO_DBGPRINT is *not* defined, then the macros expand to nothing. +; +; CAVEAT: The macros need a GPR. Currently, it uses R15. If the first register +; argument is R15, then it will use R14. This means that if you try +; DBGPRINTL64 "text", rax, r15 +; you will not get the proper value of r15. +; One way to avoid this issue is to not use multiple registers on the same line +; if the register types are GPR (i.e. this is not an issue for printing XMM +; registers). E.g the above could be done with: +; DBGPRINTL64 "test", rax +; DBGPRINT64 r15 +; +; Note also that the macros only check for r15. Thus is you tried something +; like (after token expansion): +; DBGPRINT32 r15d +; you won't get the right results. If you want to display r15d, you should +; print it as the 64-bit r15. +; +; To actually print the data, from your C code include the file +; "dbgprint.h". The default buffer size is 16kB. 
If you want to change +; that, #define DBG_BUFFER_SIZE before including "dbgprint.h". +; +; Then, (after your asm routine(s) have returned, call +; print_debug() or print_debug(file pointer) +; If you do not specify a file pointer, it defaults to stdout. +; +; Printing the debug data also resets the write pointer to the beginning, +; effectively "deleting" the previous messages. +; +%ifndef DBGPRINT_ASM_INCLUDED +%define DBGPRINT_ASM_INCLUDED + +;%define DO_DBGPRINT +%ifdef DO_DBGPRINT +extern pDebugBuffer +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; DBGPRINT_INT size, param, ... +%macro DBGPRINT_INT 2-* +%ifidni %2,r15 +%xdefine %%reg r14 +%else +%xdefine %%reg r15 +%endif +%xdefine %%size %1 +%rotate 1 + push %%reg + mov %%reg, [pDebugBuffer] +%rep %0 - 1 + mov byte [%%reg], %%size + %if (%%size == 2) + mov word [%%reg+1], %1 + %elif (%%size == 4) + mov dword [%%reg+1], %1 + %elif (%%size == 8) + mov qword [%%reg+1], %1 + %elif (%%size == 16) + movdqu oword [%%reg+1], %1 + %elif (%%size == 32) + vmovdqu [%%reg+1], %1 + %elif (%%size == 64) + vmovdqu32 [%%reg+1], %1 + %else + %error invalid size %%size + %endif + add %%reg, %%size+1 +%rotate 1 +%endrep + mov [pDebugBuffer], %%reg + pop %%reg +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; DBGPRINTL_INT size, label, param, ... +%macro DBGPRINTL_INT 3-* +%ifidni %3,r15 +%xdefine %%reg r14 +%else +%xdefine %%reg r15 +%endif +%xdefine %%size %1 +%rotate 1 + push %%reg + mov %%reg, [pDebugBuffer] + + mov byte [%%reg], 0x57 +section .data +%%lab: db %1, 0 +section .text + mov qword [%%reg+1], %%lab + add %%reg, 8+1 +%rotate 1 + +%rep %0 - 2 + mov byte [%%reg], %%size +%if (%%size == 2) + mov word [%%reg+1], %1 +%elif (%%size == 4) + mov dword [%%reg+1], %1 +%elif (%%size == 8) + mov qword [%%reg+1], %1 +%elif (%%size == 16) + movdqu oword [%%reg+1], %1 +%elif (%%size == 32) + vmovdqu [%%reg+1], %1 +%elif (%%size == 64) + vmovdqu32 [%%reg+1], %1 +%else +%error invalid size %%size +%endif + add %%reg, %%size+1 +%rotate 1 +%endrep + mov [pDebugBuffer], %%reg + pop %%reg +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; DBGPRINTL* data, ... +%macro DBGPRINT16 1+ + DBGPRINT_INT 2, %1 +%endmacro +%macro DBGPRINT32 1+ + DBGPRINT_INT 4, %1 +%endmacro +%macro DBGPRINT64 1+ + DBGPRINT_INT 8, %1 +%endmacro +%macro DBGPRINT_XMM 1+ + DBGPRINT_INT 16, %1 +%endmacro +%macro DBGPRINT_YMM 1+ + DBGPRINT_INT 32, %1 +%endmacro +%macro DBGPRINT_ZMM 1+ + DBGPRINT_INT 64, %1 +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; DBGPRINTL* label, data, ... 
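;; Each wrapper below forwards to the corresponding *_INT macro with the
;; element size in bytes, so a record in the debug buffer is a one-byte size
;; tag followed by the raw value; label records use the 0x57 tag followed by
;; a pointer to the string.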
+%macro DBGPRINTL16 2+ + DBGPRINTL_INT 2, %1, %2 +%endmacro +%macro DBGPRINTL32 2+ + DBGPRINTL_INT 4, %1, %2 +%endmacro +%macro DBGPRINTL64 2+ + DBGPRINTL_INT 8, %1, %2 +%endmacro +%macro DBGPRINTL_XMM 2+ + DBGPRINTL_INT 16, %1, %2 +%endmacro +%macro DBGPRINTL_YMM 2+ + DBGPRINTL_INT 32, %1, %2 +%endmacro +%macro DBGPRINTL_ZMM 2+ + DBGPRINTL_INT 64, %1, %2 +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINTL 1 + push r15 + mov r15, [pDebugBuffer] + + mov byte [r15], 0x57 +section .data +%%lab: db %1, 0 +section .text + mov qword [r15+1], %%lab + add r15, 8+1 + + mov [pDebugBuffer], r15 + pop r15 +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%else +%macro DBGPRINT16 1+ +%endmacro +%macro DBGPRINT32 1+ +%endmacro +%macro DBGPRINT64 1+ +%endmacro +%macro DBGPRINT_XMM 1+ +%endmacro +%macro DBGPRINT_YMM 1+ +%endmacro +%macro DBGPRINT_ZMM 1+ +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINTL16 2+ +%endmacro +%macro DBGPRINTL32 2+ +%endmacro +%macro DBGPRINTL64 2+ +%endmacro +%macro DBGPRINTL_XMM 2+ +%endmacro +%macro DBGPRINTL_YMM 2+ +%endmacro +%macro DBGPRINTL_ZMM 2+ +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINTL 1 +%endmacro +%endif + + + +%if 0 ; OLD +%macro DBGPRINTL_ZMM 2-* + push rax + mov rax, [pDebugBuffer] + + mov byte [rax], 0x57 +section .data +%%lab: db %1, 0 +section .text + mov qword [rax+1], %%lab + add rax, 8+1 +%rotate 1 + +%rep %0 - 1 + mov byte [rax], 64 + vmovdqu32 [rax+1], %1 +%rotate 1 + add rax, 64+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT_ZMM 1-* + push rax + mov rax, [pDebugBuffer] +%rep %0 + mov byte [rax], 64 + vmovdqu32 [rax+1], %1 +%rotate 1 + add rax, 64+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT_YMM 1-* + push rax + mov rax, [pDebugBuffer] +%rep %0 + mov byte [rax], 32 + vmovdqu [rax+1], %1 +%rotate 1 + add rax, 32+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT_XMM 1-* + push rax + mov rax, [pDebugBuffer] +%rep %0 + mov byte [rax], 16 + vmovdqu oword [rax+1], %1 +%rotate 1 + add rax, 16+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINTL64 2-* + push rax + mov rax, [pDebugBuffer] + + mov byte [rax], 0x57 +section .data +%%lab: db %1, 0 +section .text + mov qword [rax+1], %%lab + add rax, 8+1 +%rotate 1 + +%rep %0 - 1 + mov byte [rax], 8 + mov qword [rax+1], %1 +%rotate 1 + add rax, 8+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT64 1-* + push rax + mov rax, [pDebugBuffer] +%rep %0 + mov byte [rax], 8 + mov qword [rax+1], %1 +%rotate 1 + add rax, 8+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT32 1-* + push rax + mov rax, [pDebugBuffer] +%rep %0 + mov byte [rax], 4 + mov dword [rax+1], %1 +%rotate 1 + add rax, 4+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT16 1-* + push rax + mov rax, 
[pDebugBuffer] +%rep %0 + mov byte [rax], 2 + mov word [rax+1], %1 +%rotate 1 + add rax, 2+1 +%endrep + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGPRINT_LAB 1 + push rax + mov rax, [pDebugBuffer] + + mov byte [rax], 0x57 +section .data +%%lab: db %1, 0 +section .text + mov qword [rax+1], %%lab + add rax, 8+1 + + mov [pDebugBuffer], rax + pop rax +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro DBGHIST 2 + inc dword [%1 + 4 * %2] +%endmacro +%macro DBGPRINT_ZMM 1-* +%endmacro +%macro DBGPRINT_YMM 1-* +%endmacro +%macro DBGPRINT_XMM 1-* +%endmacro +%macro DBGPRINT64 1-* +%endmacro +%macro DBGPRINT32 1-* +%endmacro +%macro DBGPRINT16 1-* +%endmacro +%macro DBGHIST 2 +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endif ; ifdef 0 ; OLD + +%endif ; DBGPRINT_ASM_INCLUDED diff --git a/src/spdk/intel-ipsec-mb/include/des_utils.h b/src/spdk/intel-ipsec-mb/include/des_utils.h new file mode 100644 index 000000000..4358132d0 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/des_utils.h @@ -0,0 +1,134 @@ +/******************************************************************************* + Copyright (c) 2017-2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +/* DES utility functions and macros */ + +#ifndef DES_UTILS_H +#define DES_UTILS_H + +#include +#include "intel-ipsec-mb.h" + +/** + * @brief Gets selected bit value out of a 64-bit word + * + * @param val 64-bit word + * @param n bit number (0 to 63) to get value of + * + * @return n-th bit value (0 or 1 value only) + */ +__forceinline +uint64_t bit_get64b(const uint64_t val, const unsigned n) +{ + IMB_ASSERT(n < 64); + return (val >> n) & UINT64_C(1); +} + +/** + * @brief Sets selected bit in a 64-bit word + * + * @param val 64-bit word + * @param n bit number (0 to 63) to get value of + * @param b bit value (0 or 1) + * + * @return val with n-th bit set to value b + */ +__forceinline +uint64_t bit_set64b(const uint64_t val, const unsigned n, const uint64_t b) +{ + const uint64_t m = UINT64_C(1) << n; + + IMB_ASSERT(n < 64); + return (val & (~m)) | (b << n); +} + +/** + * @brief Permutes bits in a 64-bit word as described by pattern + * + * The function goes through pattern array from index 0 to 'size' (max 63). + * It sets output bit number 'index' to value of + * bit number 'pattern[index] - 1' from 'in'. + * + * @param in 64-bit word to be permuted + * @param pattern pointer to array defining the permutation + * @param size is size of the permutation pattern + * + * @return permuted in word as described by the pattern + */ +__forceinline +uint64_t permute_64b(const uint64_t in, const uint8_t *pattern, const int size) +{ + uint64_t out = 0; + int n = 0; + + IMB_ASSERT(size <= 64); + + for (n = 0; n < size; n++) { + /* '-1' is required as bit numbers in FIPS start with 1 not 0 */ + const int m = ((int) pattern[n]) - 1; + const uint64_t bit_val = bit_get64b(in, m); + + out = bit_set64b(out, n, bit_val); + } + + return out; +} + +static const uint8_t reflect_tab[16] = { + /* [ 0] 0000 => 0000 */ 0, /* [ 1] 0001 => 1000 */ 8, + /* [ 2] 0010 => 0100 */ 4, /* [ 3] 0011 => 1100 */ 12, + /* [ 4] 0100 => 0010 */ 2, /* [ 5] 0101 => 1010 */ 10, + /* [ 6] 0110 => 0110 */ 6, /* [ 7] 0111 => 1110 */ 14, + /* [ 8] 1000 => 0001 */ 1, /* [ 9] 1001 => 1001 */ 9, + /* [10] 1010 => 0101 */ 5, /* [11] 1011 => 1101 */ 13, + /* [12] 1100 => 0011 */ 3, /* [13] 1101 => 1011 */ 11, + /* [14] 1110 => 0111 */ 7, /* [15] 1111 => 1111 */ 15 +}; + +__forceinline +uint8_t reflect_8b(const uint8_t pb) +{ + return reflect_tab[pb >> 4] | (reflect_tab[pb & 15] << 4); +} + +__forceinline +uint64_t load64_reflect(const void *key) +{ + const uint8_t *kb = (const uint8_t *) key; + + return ((uint64_t) reflect_8b(kb[0])) | + ((uint64_t) reflect_8b(kb[1])) << 8 | + ((uint64_t) reflect_8b(kb[2])) << 16 | + ((uint64_t) reflect_8b(kb[3])) << 24 | + ((uint64_t) reflect_8b(kb[4])) << 32 | + ((uint64_t) reflect_8b(kb[5])) << 40 | + ((uint64_t) reflect_8b(kb[6])) << 48 | + ((uint64_t) reflect_8b(kb[7])) << 56; +} + + +#endif /* DES_UTILS_H */ diff --git a/src/spdk/intel-ipsec-mb/include/gcm.h b/src/spdk/intel-ipsec-mb/include/gcm.h new file mode 100644 index 000000000..bcc13cb3a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/gcm.h @@ -0,0 +1,428 @@ +/******************************************************************************* + Copyright (c) 2018-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following 
disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "intel-ipsec-mb.h" + +#ifndef NO_GCM + +#ifndef _GCM_H_ +#define _GCM_H_ + +/* + * AVX512+VAES+VPCLMULQDQ GCM API + * - intentionally this is not exposed in intel-ipsec-mb.h + * - available through IMB_GCM_xxx() macros from intel-ipsec-mb.h + */ +IMB_DLL_EXPORT void +aes_gcm_enc_128_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); + +IMB_DLL_EXPORT void +aes_gcm_init_128_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_192_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + 
const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_256_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_update_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_finalize_vaes_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_precomp_128_vaes_avx512(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_192_vaes_avx512(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_256_vaes_avx512(struct gcm_key_data *key_data); + +IMB_DLL_EXPORT void +aes_gcm_pre_128_vaes_avx512(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_192_vaes_avx512(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_256_vaes_avx512(const void *key, struct gcm_key_data *key_data); + +/* + * AVX512 GCM API + * - intentionally this is not exposed in intel-ipsec-mb.h + * - available through IMB_GCM_xxx() macros from intel-ipsec-mb.h + */ +IMB_DLL_EXPORT void +aes_gcm_enc_128_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_avx512(const struct gcm_key_data *key_data, + struct 
gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); + +IMB_DLL_EXPORT void +aes_gcm_init_128_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_192_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_256_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_update_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_finalize_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_finalize_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_finalize_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_finalize_avx512(const struct 
gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_finalize_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_finalize_avx512(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_precomp_128_avx512(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_192_avx512(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_256_avx512(struct gcm_key_data *key_data); + +IMB_DLL_EXPORT void +aes_gcm_pre_128_avx512(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_192_avx512(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_256_avx512(const void *key, struct gcm_key_data *key_data); + +/* + * AESNI emulation GCM API (based on SSE acrhitecture) + * - intentionally this is not exposed in intel-ipsec-mb.h + * - available through IMB_GCM_xxx() macros from intel-ipsec-mb.h + */ +IMB_DLL_EXPORT void +aes_gcm_enc_128_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len, uint8_t *auth_tag, + uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len, uint8_t *auth_tag, + uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, + uint8_t const *aad, uint64_t aad_len, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len, uint8_t *auth_tag, + uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len, uint8_t *auth_tag, + uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, uint8_t const *in, uint64_t len, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len, uint8_t *auth_tag, + uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_init_128_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_192_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_init_256_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + const uint8_t *iv, uint8_t const *aad, + uint64_t aad_len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_update_sse_no_aesni(const struct gcm_key_data 
*key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_update_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_update_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_update_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_update_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_update_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *out, const uint8_t *in, + uint64_t len); +IMB_DLL_EXPORT void +aes_gcm_enc_128_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_192_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_enc_256_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_128_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_192_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_dec_256_finalize_sse_no_aesni(const struct gcm_key_data *key_data, + struct gcm_context_data *context_data, + uint8_t *auth_tag, uint64_t auth_tag_len); +IMB_DLL_EXPORT void +aes_gcm_precomp_128_sse_no_aesni(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_192_sse_no_aesni(struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_precomp_256_sse_no_aesni(struct gcm_key_data *key_data); + +IMB_DLL_EXPORT void +aes_gcm_pre_128_sse_no_aesni(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_192_sse_no_aesni(const void *key, struct gcm_key_data *key_data); +IMB_DLL_EXPORT void +aes_gcm_pre_256_sse_no_aesni(const void *key, struct gcm_key_data *key_data); + +#endif /* _GCM_H_ */ +#endif /* NO_GCM */ diff --git a/src/spdk/intel-ipsec-mb/include/gcm_defines.asm b/src/spdk/intel-ipsec-mb/include/gcm_defines.asm new file mode 100644 index 000000000..31a961729 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/gcm_defines.asm @@ -0,0 +1,272 @@ +;; +;; Copyright (c) 2012-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef GCM_DEFINES_ASM_INCLUDED +%define GCM_DEFINES_ASM_INCLUDED + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford + +section .data +default rel + +align 16 +POLY: dq 0x0000000000000001, 0xC200000000000000 + +align 64 +POLY2: + dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 + dq 0x00000001C2000000, 0xC200000000000000 + +align 16 +TWOONE: dq 0x0000000000000001, 0x0000000100000000 + +;;; @note Order of these constants should not change. +;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F +align 64 +SHUF_MASK: + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + +align 16 +SHIFT_MASK: + dq 0x0706050403020100, 0x0f0e0d0c0b0a0908 + +ALL_F: + dq 0xffffffffffffffff, 0xffffffffffffffff + +ZERO: + dq 0x0000000000000000, 0x0000000000000000 + +align 16 +ONE: + dq 0x0000000000000001, 0x0000000000000000 + +align 16 +TWO: + dq 0x0000000000000002, 0x0000000000000000 + +align 16 +ONEf: + dq 0x0000000000000000, 0x0100000000000000 + +align 16 +TWOf: + dq 0x0000000000000000, 0x0200000000000000 + +align 64 +ddq_add_1234: + dq 0x0000000000000001, 0x0000000000000000 + dq 0x0000000000000002, 0x0000000000000000 + dq 0x0000000000000003, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + +align 64 +ddq_add_5678: + dq 0x0000000000000005, 0x0000000000000000 + dq 0x0000000000000006, 0x0000000000000000 + dq 0x0000000000000007, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + +align 64 +ddq_add_4444: + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + +align 64 +ddq_add_8888: + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + +align 64 +ddq_addbe_1234: + dq 0x0000000000000000, 0x0100000000000000 + dq 0x0000000000000000, 0x0200000000000000 + dq 0x0000000000000000, 0x0300000000000000 + dq 0x0000000000000000, 0x0400000000000000 + +align 64 +ddq_addbe_5678: + dq 0x0000000000000000, 
0x0500000000000000 + dq 0x0000000000000000, 0x0600000000000000 + dq 0x0000000000000000, 0x0700000000000000 + dq 0x0000000000000000, 0x0800000000000000 + +align 64 +ddq_addbe_4444: + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + dq 0x0000000000000000, 0x0400000000000000 + +align 64 +ddq_addbe_8888: + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + dq 0x0000000000000000, 0x0800000000000000 + +align 64 +byte_len_to_mask_table: + dw 0x0000, 0x0001, 0x0003, 0x0007, + dw 0x000f, 0x001f, 0x003f, 0x007f, + dw 0x00ff, 0x01ff, 0x03ff, 0x07ff, + dw 0x0fff, 0x1fff, 0x3fff, 0x7fff, + dw 0xffff + +align 64 +byte64_len_to_mask_table: + dq 0x0000000000000000, 0x0000000000000001 + dq 0x0000000000000003, 0x0000000000000007 + dq 0x000000000000000f, 0x000000000000001f + dq 0x000000000000003f, 0x000000000000007f + dq 0x00000000000000ff, 0x00000000000001ff + dq 0x00000000000003ff, 0x00000000000007ff + dq 0x0000000000000fff, 0x0000000000001fff + dq 0x0000000000003fff, 0x0000000000007fff + dq 0x000000000000ffff, 0x000000000001ffff + dq 0x000000000003ffff, 0x000000000007ffff + dq 0x00000000000fffff, 0x00000000001fffff + dq 0x00000000003fffff, 0x00000000007fffff + dq 0x0000000000ffffff, 0x0000000001ffffff + dq 0x0000000003ffffff, 0x0000000007ffffff + dq 0x000000000fffffff, 0x000000001fffffff + dq 0x000000003fffffff, 0x000000007fffffff + dq 0x00000000ffffffff, 0x00000001ffffffff + dq 0x00000003ffffffff, 0x00000007ffffffff + dq 0x0000000fffffffff, 0x0000001fffffffff + dq 0x0000003fffffffff, 0x0000007fffffffff + dq 0x000000ffffffffff, 0x000001ffffffffff + dq 0x000003ffffffffff, 0x000007ffffffffff + dq 0x00000fffffffffff, 0x00001fffffffffff + dq 0x00003fffffffffff, 0x00007fffffffffff + dq 0x0000ffffffffffff, 0x0001ffffffffffff + dq 0x0003ffffffffffff, 0x0007ffffffffffff + dq 0x000fffffffffffff, 0x001fffffffffffff + dq 0x003fffffffffffff, 0x007fffffffffffff + dq 0x00ffffffffffffff, 0x01ffffffffffffff + dq 0x03ffffffffffffff, 0x07ffffffffffffff + dq 0x0fffffffffffffff, 0x1fffffffffffffff + dq 0x3fffffffffffffff, 0x7fffffffffffffff + dq 0xffffffffffffffff + +align 64 +mask_out_top_block: + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0xffffffffffffffff, 0xffffffffffffffff + dq 0x0000000000000000, 0x0000000000000000 + +section .text + +;;define the fields of gcm_context_data struct +;; struct gcm_context_data { +;; // init, update and finalize context data +;; uint8_t aad_hash[GCM_BLOCK_LEN]; +;; uint64_t aad_length; +;; uint64_t in_length; +;; uint8_t partial_block_enc_key[GCM_BLOCK_LEN]; +;; uint8_t orig_IV[GCM_BLOCK_LEN]; +;; uint8_t current_counter[GCM_BLOCK_LEN]; +;; uint64_t partial_block_length; +;; }; + +%define AadHash (16*0) ; store current Hash of data which has been input +%define AadLen (16*1) ; store length of input data which will not be encrypted or decrypted +%define InLen ((16*1)+8); store length of input data which will be encrypted or decrypted +%define PBlockEncKey (16*2) ; encryption key for the partial block at the end of the previous update +%define OrigIV (16*3) ; input IV +%define CurCount (16*4) ; Current counter for generation of encryption key +%define PBlockLen (16*5) ; length of partial block at the end of the previous update + +%define reg(q) xmm %+ q +%define regy(q) ymm %+ q +%define regz(q) zmm %+ q + +%ifdef WIN_ABI + %xdefine arg1 rcx + %xdefine arg2 rdx + %xdefine arg3 r8 + 
%xdefine arg4 r9 + %xdefine arg5 qword [r14 + STACK_OFFSET + 8*5] + %xdefine arg6 qword [r14 + STACK_OFFSET + 8*6] + %xdefine arg7 qword [r14 + STACK_OFFSET + 8*7] + %xdefine arg8 qword [r14 + STACK_OFFSET + 8*8] + %xdefine arg9 qword [r14 + STACK_OFFSET + 8*9] + %xdefine arg10 qword [r14 + STACK_OFFSET + 8*10] +%else + %xdefine arg1 rdi + %xdefine arg2 rsi + %xdefine arg3 rdx + %xdefine arg4 rcx + %xdefine arg5 r8 + %xdefine arg6 r9 + %xdefine arg7 qword [r14 + STACK_OFFSET + 8*1] + %xdefine arg8 qword [r14 + STACK_OFFSET + 8*2] + %xdefine arg9 qword [r14 + STACK_OFFSET + 8*3] + %xdefine arg10 qword [r14 + STACK_OFFSET + 8*4] +%endif + +%ifdef NT_LDST + %define NT_LD + %define NT_ST +%endif + +;;; Use Non-temporal load/stor +%ifdef NT_LD + %define XLDR movntdqa + %define VXLDR vmovntdqa + %define VX512LDR vmovntdqa +%else + %define XLDR movdqu + %define VXLDR vmovdqu + %define VX512LDR vmovdqu8 +%endif + +;;; Use Non-temporal load/stor +%ifdef NT_ST + %define XSTR movntdq + %define VXSTR vmovntdq + %define VX512STR vmovntdq +%else + %define XSTR movdqu + %define VXSTR vmovdqu + %define VX512STR vmovdqu8 +%endif + +%endif ; GCM_DEFINES_ASM_INCLUDED diff --git a/src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm b/src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm new file mode 100644 index 000000000..d812e53bd --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/gcm_keys_avx2_avx512.asm @@ -0,0 +1,52 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
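+;;
+;; Illustrative sketch, not from the original files: the gcm.h entry points
+;; declared earlier in this patch are chained as precompute -> init ->
+;; update (repeatable) -> finalize. A minimal C example for AES-GCM-128;
+;; the aes_gcm_init_128_sse_no_aesni() prototype and the IV/AAD arguments
+;; are assumed from the rest of gcm.h and are not shown in this hunk:
+;;
+;;     struct gcm_key_data key_data;
+;;     struct gcm_context_data ctx;
+;;     uint8_t tag[16];
+;;
+;;     aes_gcm_pre_128_sse_no_aesni(aes_key, &key_data);       /* expand key, derive hash keys */
+;;     aes_gcm_init_128_sse_no_aesni(&key_data, &ctx, iv, aad, aad_len);  /* assumed prototype */
+;;     aes_gcm_enc_128_update_sse_no_aesni(&key_data, &ctx, out, in, len);
+;;     aes_gcm_enc_128_finalize_sse_no_aesni(&key_data, &ctx, tag, sizeof(tag));
+;;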
+;; + +%ifndef GCM_KEYS_AVX2_AVX512_INCLUDED +%define GCM_KEYS_AVX2_AVX512_INCLUDED + +;; Define the fields of gcm_key_data struct: +;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; +;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly +;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly +;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly +;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly +;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly +;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly +;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly +;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly + +%define HashKey_8 (16*15) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*16) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*17) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*18) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*19) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*20) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*21) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*22) ; HashKey <<1 mod poly +%define HashKey (16*22) ; HashKey <<1 mod poly + +%endif ; GCM_KEYS_AVX2_AVX512_INCLUDED diff --git a/src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm b/src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm new file mode 100644 index 000000000..f7531e5a7 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/gcm_keys_sse_avx.asm @@ -0,0 +1,73 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
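+;;
+;; Illustrative note on the layout defined in this header and the previous one
+;; (a sketch, not upstream code): the GHASH keys occupy fixed 16-byte slots of
+;; gcm_key_data immediately after the expanded AES round keys (16*15 bytes), so
+;; for the 8-key layouts the byte offsets follow a simple pattern. Hypothetical
+;; C helpers expressing it:
+;;
+;;     #define HASHKEY_OFFSET(n)   (16 * (23 - (n)))  /* HashKey_8 -> 16*15 ... HashKey_1 -> 16*22 */
+;;     #define HASHKEY_K_OFFSET(n) (16 * (22 + (n)))  /* Karatsuba keys below: HashKey_1_k -> 16*23 */
+;;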
+;; + +%ifndef GCM_KEYS_SSE_AVX_INCLUDED +%define GCM_KEYS_SSE_AVX_INCLUDED + +;; Define the fields of gcm_key_data struct: +;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; +;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly +;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly +;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly +;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly +;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly +;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly +;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly +;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly +;; uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^2 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^3 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^4 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^5 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^6 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^7 <<1 mod poly (Karatsuba) +;; uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN]; // XOR of High and Low 64 bits of HashKey^8 <<1 mod poly (Karatsuba) + +;; +;; Key structure holds up to 8 ghash keys +;; +%define HashKey_8 (16*15) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*16) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*17) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*18) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*19) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*20) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*21) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*22) ; HashKey <<1 mod poly +%define HashKey (16*22) ; HashKey <<1 mod poly +;; ghash keys for Karatsuba multiply +%define HashKey_k (16*23) ; XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly +%define HashKey_1_k (16*23) ; XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly +%define HashKey_2_k (16*24) ; XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly +%define HashKey_3_k (16*25) ; XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly +%define HashKey_4_k (16*26) ; XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly +%define HashKey_5_k (16*27) ; XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly +%define HashKey_6_k (16*28) ; XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly +%define HashKey_7_k (16*29) ; XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly +%define HashKey_8_k (16*30) ; XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly + +%endif ; GCM_KEYS_SSE_AVX_INCLUDED diff --git a/src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm new file mode 100644 index 000000000..4aea2f5c9 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/gcm_keys_vaes_avx512.asm @@ -0,0 +1,231 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following 
conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef GCM_KEYS_VAES_AVX512_INCLUDED +%define GCM_KEYS_VAES_AVX512_INCLUDED + +;; Define the fields of gcm_key_data struct: +;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; +;; uint8_t shifted_hkey_9_128[GCM_ENC_KEY_LEN * (128 - 8)]; +;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly +;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly +;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly +;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly +;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly +;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly +;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly +;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly + +%ifdef GCM_BIG_DATA +;; +;; Key structure holds up to 128 ghash keys +;; +%define HashKey_128 (16*15) ; HashKey^128 <<1 mod poly +%define HashKey_127 (16*16) ; HashKey^127 <<1 mod poly +%define HashKey_126 (16*17) ; HashKey^126 <<1 mod poly +%define HashKey_125 (16*18) ; HashKey^125 <<1 mod poly +%define HashKey_124 (16*19) ; HashKey^124 <<1 mod poly +%define HashKey_123 (16*20) ; HashKey^123 <<1 mod poly +%define HashKey_122 (16*21) ; HashKey^122 <<1 mod poly +%define HashKey_121 (16*22) ; HashKey^121 <<1 mod poly +%define HashKey_120 (16*23) ; HashKey^120 <<1 mod poly +%define HashKey_119 (16*24) ; HashKey^119 <<1 mod poly +%define HashKey_118 (16*25) ; HashKey^118 <<1 mod poly +%define HashKey_117 (16*26) ; HashKey^117 <<1 mod poly +%define HashKey_116 (16*27) ; HashKey^116 <<1 mod poly +%define HashKey_115 (16*28) ; HashKey^115 <<1 mod poly +%define HashKey_114 (16*29) ; HashKey^114 <<1 mod poly +%define HashKey_113 (16*30) ; HashKey^113 <<1 mod poly +%define HashKey_112 (16*31) ; HashKey^112 <<1 mod poly +%define HashKey_111 (16*32) ; HashKey^111 <<1 mod poly +%define HashKey_110 (16*33) ; HashKey^110 <<1 mod poly +%define HashKey_109 (16*34) ; HashKey^109 <<1 mod poly +%define HashKey_108 (16*35) ; HashKey^108 <<1 mod poly +%define HashKey_107 (16*36) ; HashKey^107 <<1 mod poly +%define HashKey_106 (16*37) ; HashKey^106 <<1 mod 
poly +%define HashKey_105 (16*38) ; HashKey^105 <<1 mod poly +%define HashKey_104 (16*39) ; HashKey^104 <<1 mod poly +%define HashKey_103 (16*40) ; HashKey^103 <<1 mod poly +%define HashKey_102 (16*41) ; HashKey^102 <<1 mod poly +%define HashKey_101 (16*42) ; HashKey^101 <<1 mod poly +%define HashKey_100 (16*43) ; HashKey^100 <<1 mod poly +%define HashKey_99 (16*44) ; HashKey^99 <<1 mod poly +%define HashKey_98 (16*45) ; HashKey^98 <<1 mod poly +%define HashKey_97 (16*46) ; HashKey^97 <<1 mod poly +%define HashKey_96 (16*47) ; HashKey^96 <<1 mod poly +%define HashKey_95 (16*48) ; HashKey^95 <<1 mod poly +%define HashKey_94 (16*49) ; HashKey^94 <<1 mod poly +%define HashKey_93 (16*50) ; HashKey^93 <<1 mod poly +%define HashKey_92 (16*51) ; HashKey^92 <<1 mod poly +%define HashKey_91 (16*52) ; HashKey^91 <<1 mod poly +%define HashKey_90 (16*53) ; HashKey^90 <<1 mod poly +%define HashKey_89 (16*54) ; HashKey^89 <<1 mod poly +%define HashKey_88 (16*55) ; HashKey^88 <<1 mod poly +%define HashKey_87 (16*56) ; HashKey^87 <<1 mod poly +%define HashKey_86 (16*57) ; HashKey^86 <<1 mod poly +%define HashKey_85 (16*58) ; HashKey^85 <<1 mod poly +%define HashKey_84 (16*59) ; HashKey^84 <<1 mod poly +%define HashKey_83 (16*60) ; HashKey^83 <<1 mod poly +%define HashKey_82 (16*61) ; HashKey^82 <<1 mod poly +%define HashKey_81 (16*62) ; HashKey^81 <<1 mod poly +%define HashKey_80 (16*63) ; HashKey^80 <<1 mod poly +%define HashKey_79 (16*64) ; HashKey^79 <<1 mod poly +%define HashKey_78 (16*65) ; HashKey^78 <<1 mod poly +%define HashKey_77 (16*66) ; HashKey^77 <<1 mod poly +%define HashKey_76 (16*67) ; HashKey^76 <<1 mod poly +%define HashKey_75 (16*68) ; HashKey^75 <<1 mod poly +%define HashKey_74 (16*69) ; HashKey^74 <<1 mod poly +%define HashKey_73 (16*70) ; HashKey^73 <<1 mod poly +%define HashKey_72 (16*71) ; HashKey^72 <<1 mod poly +%define HashKey_71 (16*72) ; HashKey^71 <<1 mod poly +%define HashKey_70 (16*73) ; HashKey^70 <<1 mod poly +%define HashKey_69 (16*74) ; HashKey^69 <<1 mod poly +%define HashKey_68 (16*75) ; HashKey^68 <<1 mod poly +%define HashKey_67 (16*76) ; HashKey^67 <<1 mod poly +%define HashKey_66 (16*77) ; HashKey^66 <<1 mod poly +%define HashKey_65 (16*78) ; HashKey^65 <<1 mod poly +%define HashKey_64 (16*79) ; HashKey^64 <<1 mod poly +%define HashKey_63 (16*80) ; HashKey^63 <<1 mod poly +%define HashKey_62 (16*81) ; HashKey^62 <<1 mod poly +%define HashKey_61 (16*82) ; HashKey^61 <<1 mod poly +%define HashKey_60 (16*83) ; HashKey^60 <<1 mod poly +%define HashKey_59 (16*84) ; HashKey^59 <<1 mod poly +%define HashKey_58 (16*85) ; HashKey^58 <<1 mod poly +%define HashKey_57 (16*86) ; HashKey^57 <<1 mod poly +%define HashKey_56 (16*87) ; HashKey^56 <<1 mod poly +%define HashKey_55 (16*88) ; HashKey^55 <<1 mod poly +%define HashKey_54 (16*89) ; HashKey^54 <<1 mod poly +%define HashKey_53 (16*90) ; HashKey^53 <<1 mod poly +%define HashKey_52 (16*91) ; HashKey^52 <<1 mod poly +%define HashKey_51 (16*92) ; HashKey^51 <<1 mod poly +%define HashKey_50 (16*93) ; HashKey^50 <<1 mod poly +%define HashKey_49 (16*94) ; HashKey^49 <<1 mod poly +%define HashKey_48 (16*95) ; HashKey^48 <<1 mod poly +%define HashKey_47 (16*96) ; HashKey^47 <<1 mod poly +%define HashKey_46 (16*97) ; HashKey^46 <<1 mod poly +%define HashKey_45 (16*98) ; HashKey^45 <<1 mod poly +%define HashKey_44 (16*99) ; HashKey^44 <<1 mod poly +%define HashKey_43 (16*100) ; HashKey^43 <<1 mod poly +%define HashKey_42 (16*101) ; HashKey^42 <<1 mod poly +%define HashKey_41 (16*102) ; HashKey^41 <<1 mod poly +%define HashKey_40 
(16*103) ; HashKey^40 <<1 mod poly +%define HashKey_39 (16*104) ; HashKey^39 <<1 mod poly +%define HashKey_38 (16*105) ; HashKey^38 <<1 mod poly +%define HashKey_37 (16*106) ; HashKey^37 <<1 mod poly +%define HashKey_36 (16*107) ; HashKey^36 <<1 mod poly +%define HashKey_35 (16*108) ; HashKey^35 <<1 mod poly +%define HashKey_34 (16*109) ; HashKey^34 <<1 mod poly +%define HashKey_33 (16*110) ; HashKey^33 <<1 mod poly +%define HashKey_32 (16*111) ; HashKey^32 <<1 mod poly +%define HashKey_31 (16*112) ; HashKey^31 <<1 mod poly +%define HashKey_30 (16*113) ; HashKey^30 <<1 mod poly +%define HashKey_29 (16*114) ; HashKey^29 <<1 mod poly +%define HashKey_28 (16*115) ; HashKey^28 <<1 mod poly +%define HashKey_27 (16*116) ; HashKey^27 <<1 mod poly +%define HashKey_26 (16*117) ; HashKey^26 <<1 mod poly +%define HashKey_25 (16*118) ; HashKey^25 <<1 mod poly +%define HashKey_24 (16*119) ; HashKey^24 <<1 mod poly +%define HashKey_23 (16*120) ; HashKey^23 <<1 mod poly +%define HashKey_22 (16*121) ; HashKey^22 <<1 mod poly +%define HashKey_21 (16*122) ; HashKey^21 <<1 mod poly +%define HashKey_20 (16*123) ; HashKey^20 <<1 mod poly +%define HashKey_19 (16*124) ; HashKey^19 <<1 mod poly +%define HashKey_18 (16*125) ; HashKey^18 <<1 mod poly +%define HashKey_17 (16*126) ; HashKey^17 <<1 mod poly +%define HashKey_16 (16*127) ; HashKey^16 <<1 mod poly +%define HashKey_15 (16*128) ; HashKey^15 <<1 mod poly +%define HashKey_14 (16*129) ; HashKey^14 <<1 mod poly +%define HashKey_13 (16*130) ; HashKey^13 <<1 mod poly +%define HashKey_12 (16*131) ; HashKey^12 <<1 mod poly +%define HashKey_11 (16*132) ; HashKey^11 <<1 mod poly +%define HashKey_10 (16*133) ; HashKey^10 <<1 mod poly +%define HashKey_9 (16*134) ; HashKey^9 <<1 mod poly +%define HashKey_8 (16*135) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*136) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*137) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*138) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*139) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*140) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*141) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*142) ; HashKey <<1 mod poly +%define HashKey (16*142) ; HashKey <<1 mod poly +%else +;; +;; Key structure holds up to 48 ghash keys +;; +%define HashKey_48 (16*15) ; HashKey^48 <<1 mod poly +%define HashKey_47 (16*16) ; HashKey^47 <<1 mod poly +%define HashKey_46 (16*17) ; HashKey^46 <<1 mod poly +%define HashKey_45 (16*18) ; HashKey^45 <<1 mod poly +%define HashKey_44 (16*19) ; HashKey^44 <<1 mod poly +%define HashKey_43 (16*20) ; HashKey^43 <<1 mod poly +%define HashKey_42 (16*21) ; HashKey^42 <<1 mod poly +%define HashKey_41 (16*22) ; HashKey^41 <<1 mod poly +%define HashKey_40 (16*23) ; HashKey^40 <<1 mod poly +%define HashKey_39 (16*24) ; HashKey^39 <<1 mod poly +%define HashKey_38 (16*25) ; HashKey^38 <<1 mod poly +%define HashKey_37 (16*26) ; HashKey^37 <<1 mod poly +%define HashKey_36 (16*27) ; HashKey^36 <<1 mod poly +%define HashKey_35 (16*28) ; HashKey^35 <<1 mod poly +%define HashKey_34 (16*29) ; HashKey^34 <<1 mod poly +%define HashKey_33 (16*30) ; HashKey^33 <<1 mod poly +%define HashKey_32 (16*31) ; HashKey^32 <<1 mod poly +%define HashKey_31 (16*32) ; HashKey^31 <<1 mod poly +%define HashKey_30 (16*33) ; HashKey^30 <<1 mod poly +%define HashKey_29 (16*34) ; HashKey^29 <<1 mod poly +%define HashKey_28 (16*35) ; HashKey^28 <<1 mod poly +%define HashKey_27 (16*36) ; HashKey^27 <<1 mod poly +%define HashKey_26 (16*37) ; HashKey^26 <<1 mod poly +%define HashKey_25 (16*38) ; HashKey^25 <<1 
mod poly +%define HashKey_24 (16*39) ; HashKey^24 <<1 mod poly +%define HashKey_23 (16*40) ; HashKey^23 <<1 mod poly +%define HashKey_22 (16*41) ; HashKey^22 <<1 mod poly +%define HashKey_21 (16*42) ; HashKey^21 <<1 mod poly +%define HashKey_20 (16*43) ; HashKey^20 <<1 mod poly +%define HashKey_19 (16*44) ; HashKey^19 <<1 mod poly +%define HashKey_18 (16*45) ; HashKey^18 <<1 mod poly +%define HashKey_17 (16*46) ; HashKey^17 <<1 mod poly +%define HashKey_16 (16*47) ; HashKey^16 <<1 mod poly +%define HashKey_15 (16*48) ; HashKey^15 <<1 mod poly +%define HashKey_14 (16*49) ; HashKey^14 <<1 mod poly +%define HashKey_13 (16*50) ; HashKey^13 <<1 mod poly +%define HashKey_12 (16*51) ; HashKey^12 <<1 mod poly +%define HashKey_11 (16*52) ; HashKey^11 <<1 mod poly +%define HashKey_10 (16*53) ; HashKey^10 <<1 mod poly +%define HashKey_9 (16*54) ; HashKey^9 <<1 mod poly +%define HashKey_8 (16*55) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*56) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*57) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*58) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*59) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*60) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*61) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*62) ; HashKey <<1 mod poly +%define HashKey (16*62) ; HashKey <<1 mod poly +%endif ; !GCM_BIG_DATA + +%endif ; GCM_KEYS_VAES_AVX512_INCLUDED diff --git a/src/spdk/intel-ipsec-mb/include/kasumi_internal.h b/src/spdk/intel-ipsec-mb/include/kasumi_internal.h new file mode 100755 index 000000000..87b114d88 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/kasumi_internal.h @@ -0,0 +1,1853 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + + +/*--------------------------------------------------------- +* Kasumi_internal.h +*---------------------------------------------------------*/ + +#ifndef _KASUMI_INTERNAL_H_ +#define _KASUMI_INTERNAL_H_ + +#include +#include +#include +#include + +#include "intel-ipsec-mb.h" +#include "wireless_common.h" +#include "include/clear_regs_mem.h" +#include "include/constant_lookup.h" + +/*--------------------------------------------------------------------- +* Kasumi Inner S-Boxes +*---------------------------------------------------------------------*/ + +/* Table version based on a small table, no cache trash */ +static const uint16_t sso_kasumi_S7e[] = { + 0x6c00, 0x6601, 0x7802, 0x7603, 0x2404, 0x4e05, 0xb006, 0xce07, + 0x5c08, 0x1e09, 0x6a0a, 0xac0b, 0x1c0c, 0x3e0d, 0xea0e, 0x5c0f, + 0x4e10, 0xc011, 0x6a12, 0xc213, 0x0214, 0xac15, 0xae16, 0x3617, + 0x6e18, 0xa019, 0x681a, 0x001b, 0x0a1c, 0xe41d, 0xc41e, 0x9c1f, + 0x2a20, 0x5021, 0xb622, 0xd823, 0x2024, 0x3225, 0x3826, 0x2e27, + 0x9a28, 0xac29, 0x042a, 0xa62b, 0x882c, 0xd62d, 0xd22e, 0x082f, + 0x4830, 0x9631, 0xf432, 0x1c33, 0x4634, 0xb035, 0x7636, 0xa637, + 0xea38, 0x7039, 0x543a, 0x783b, 0xdc3c, 0x6e3d, 0xae3e, 0xba3f, + 0x6a40, 0x6a41, 0x1c42, 0x9043, 0x3a44, 0x5e45, 0x8c46, 0x7447, + 0x7c48, 0x5449, 0x384a, 0x1c4b, 0xa44c, 0xe84d, 0x604e, 0x304f, + 0x4050, 0xc451, 0x8652, 0xac53, 0x1654, 0xb655, 0x1856, 0x0657, + 0x0658, 0xa259, 0xf25a, 0x785b, 0xf85c, 0x785d, 0x845e, 0x3a5f, + 0x0c60, 0xfc61, 0xf062, 0x9c63, 0x5e64, 0xc265, 0x6666, 0x7667, + 0x9a68, 0x4669, 0x746a, 0xb46b, 0x506c, 0xe06d, 0x3a6e, 0x866f, + 0x6070, 0x3471, 0x3c72, 0xd673, 0x3474, 0x4c75, 0xa476, 0x7277, + 0xa478, 0xd479, 0xea7a, 0xa47b, 0x487c, 0x147d, 0x8a7e, 0xf87f, + 0x6c00, 0x6601, 0x7802, 0x7603, 0x2404, 0x4e05, 0xb006, 0xce07, + 0x5c08, 0x1e09, 0x6a0a, 0xac0b, 0x1c0c, 0x3e0d, 0xea0e, 0x5c0f, + 0x4e10, 0xc011, 0x6a12, 0xc213, 0x0214, 0xac15, 0xae16, 0x3617, + 0x6e18, 0xa019, 0x681a, 0x001b, 0x0a1c, 0xe41d, 0xc41e, 0x9c1f, + 0x2a20, 0x5021, 0xb622, 0xd823, 0x2024, 0x3225, 0x3826, 0x2e27, + 0x9a28, 0xac29, 0x042a, 0xa62b, 0x882c, 0xd62d, 0xd22e, 0x082f, + 0x4830, 0x9631, 0xf432, 0x1c33, 0x4634, 0xb035, 0x7636, 0xa637, + 0xea38, 0x7039, 0x543a, 0x783b, 0xdc3c, 0x6e3d, 0xae3e, 0xba3f, + 0x6a40, 0x6a41, 0x1c42, 0x9043, 0x3a44, 0x5e45, 0x8c46, 0x7447, + 0x7c48, 0x5449, 0x384a, 0x1c4b, 0xa44c, 0xe84d, 0x604e, 0x304f, + 0x4050, 0xc451, 0x8652, 0xac53, 0x1654, 0xb655, 0x1856, 0x0657, + 0x0658, 0xa259, 0xf25a, 0x785b, 0xf85c, 0x785d, 0x845e, 0x3a5f, + 0x0c60, 0xfc61, 0xf062, 0x9c63, 0x5e64, 0xc265, 0x6666, 0x7667, + 0x9a68, 0x4669, 0x746a, 0xb46b, 0x506c, 0xe06d, 0x3a6e, 0x866f, + 0x6070, 0x3471, 0x3c72, 0xd673, 0x3474, 0x4c75, 0xa476, 0x7277, + 0xa478, 0xd479, 0xea7a, 0xa47b, 0x487c, 0x147d, 0x8a7e, 0xf87f +}; + +static const uint16_t sso_kasumi_S9e[] = { + 0x4ea7, 0xdeef, 0x42a1, 0xf77b, 0x0f87, 0x9d4e, 0x1209, 0xa552, + 0x4c26, 0xc4e2, 0x6030, 0xcd66, 0x89c4, 0x0381, 0xb45a, 0x1b8d, + 0x6eb7, 0xfafd, 0x2693, 0x974b, 0x3f9f, 0xa954, 0x6633, 0xd56a, + 0x6532, 0xe9f4, 0x0d06, 0xa452, 0xb0d8, 0x3e9f, 0xc964, 0x62b1, + 0x5eaf, 0xe2f1, 0xd3e9, 0x4a25, 0x9cce, 0x2211, 0x0000, 0x9b4d, + 0x582c, 0xfcfe, 0xf57a, 0x743a, 0x1e8f, 0xb8dc, 0xa251, 0x2190, + 0xbe5f, 0x0603, 0x773b, 0xeaf5, 0x6c36, 0xd6eb, 0xb4da, 0x2b95, + 0xb1d8, 0x1108, 0x58ac, 0xddee, 0xe773, 0x4522, 0x1f8f, 0x984c, + 0x4aa5, 0x8ac5, 0x178b, 0xf279, 0x0301, 0xc1e0, 0x4fa7, 0xa8d4, + 0xe0f0, 0x381c, 0x9dce, 0x60b0, 0x2d96, 0xf7fb, 
0x4120, 0xbedf, + 0xebf5, 0x2f97, 0xf2f9, 0x1309, 0xb259, 0x74ba, 0xbadd, 0x59ac, + 0x48a4, 0x944a, 0x71b8, 0x88c4, 0x95ca, 0x4ba5, 0xbd5e, 0x46a3, + 0xd0e8, 0x3c9e, 0x0c86, 0xc562, 0x1a0d, 0xf4fa, 0xd7eb, 0x1c8e, + 0x7ebf, 0x8a45, 0x82c1, 0x53a9, 0x3098, 0xc6e3, 0xdd6e, 0x0e87, + 0xb158, 0x592c, 0x2914, 0xe4f2, 0x6bb5, 0x8140, 0xe271, 0x2d16, + 0x160b, 0xe6f3, 0xae57, 0x7b3d, 0x4824, 0xba5d, 0xe1f0, 0x361b, + 0xcfe7, 0x7dbe, 0xc5e2, 0x5229, 0x8844, 0x389c, 0x93c9, 0x0683, + 0x8d46, 0x2793, 0xa753, 0x2814, 0x4e27, 0xe673, 0x75ba, 0xf87c, + 0xb7db, 0x0180, 0xf9fc, 0x6a35, 0xe070, 0x54aa, 0xbfdf, 0x2e97, + 0xfc7e, 0x52a9, 0x9249, 0x190c, 0x2f17, 0x8341, 0x50a8, 0xd96c, + 0xd76b, 0x4924, 0x5c2e, 0xe7f3, 0x1389, 0x8f47, 0x8944, 0x3018, + 0x91c8, 0x170b, 0x3a9d, 0x99cc, 0xd1e8, 0x55aa, 0x6b35, 0xcae5, + 0x6fb7, 0xf5fa, 0xa0d0, 0x1f0f, 0xbb5d, 0x2391, 0x65b2, 0xd8ec, + 0x2010, 0xa2d1, 0xcf67, 0x6834, 0x7038, 0xf078, 0x8ec7, 0x2b15, + 0xa3d1, 0x41a0, 0xf8fc, 0x3f1f, 0xecf6, 0x0c06, 0xa653, 0x6331, + 0x49a4, 0xb359, 0x3299, 0xedf6, 0x8241, 0x7a3d, 0xe8f4, 0x351a, + 0x5aad, 0xbcde, 0x45a2, 0x8643, 0x0582, 0xe170, 0x0b05, 0xca65, + 0xb9dc, 0x4723, 0x86c3, 0x5dae, 0x6231, 0x9e4f, 0x4ca6, 0x954a, + 0x3118, 0xff7f, 0xeb75, 0x0080, 0xfd7e, 0x3198, 0x369b, 0xdfef, + 0xdf6f, 0x0984, 0x2512, 0xd66b, 0x97cb, 0x43a1, 0x7c3e, 0x8dc6, + 0x0884, 0xc2e1, 0x96cb, 0x793c, 0xd4ea, 0x1c0e, 0x5b2d, 0xb65b, + 0xeff7, 0x3d1e, 0x51a8, 0xa6d3, 0xb75b, 0x6733, 0x188c, 0xed76, + 0x4623, 0xce67, 0xfa7d, 0x57ab, 0x2613, 0xacd6, 0x8bc5, 0x2492, + 0xe5f2, 0x753a, 0x79bc, 0xcce6, 0x0100, 0x9349, 0x8cc6, 0x3b1d, + 0x6432, 0xe874, 0x9c4e, 0x359a, 0x140a, 0x9acd, 0xfdfe, 0x56ab, + 0xcee7, 0x5a2d, 0x168b, 0xa7d3, 0x3a1d, 0xac56, 0xf3f9, 0x4020, + 0x9048, 0x341a, 0xad56, 0x2c96, 0x7339, 0xd5ea, 0x5faf, 0xdcee, + 0x379b, 0x8b45, 0x2a95, 0xb3d9, 0x5028, 0xee77, 0x5cae, 0xc763, + 0x72b9, 0xd2e9, 0x0b85, 0x8e47, 0x81c0, 0x2311, 0xe974, 0x6e37, + 0xdc6e, 0x64b2, 0x8542, 0x180c, 0xabd5, 0x1188, 0xe371, 0x7cbe, + 0x0201, 0xda6d, 0xef77, 0x1289, 0x6ab5, 0xb058, 0x964b, 0x6934, + 0x0904, 0xc9e4, 0xc462, 0x2110, 0xe572, 0x2713, 0x399c, 0xde6f, + 0xa150, 0x7d3e, 0x0804, 0xf1f8, 0xd9ec, 0x0703, 0x6130, 0x9a4d, + 0xa351, 0x67b3, 0x2a15, 0xcb65, 0x5f2f, 0x994c, 0xc7e3, 0x2412, + 0x5e2f, 0xaa55, 0x3219, 0xe3f1, 0xb5da, 0x4321, 0xc864, 0x1b0d, + 0x5128, 0xbdde, 0x1d0e, 0xd46a, 0x3e1f, 0xd068, 0x63b1, 0xa854, + 0x3d9e, 0xcde6, 0x158a, 0xc060, 0xc663, 0x349a, 0xffff, 0x2894, + 0x3b9d, 0xd369, 0x3399, 0xfeff, 0x44a2, 0xaed7, 0x5d2e, 0x92c9, + 0x150a, 0xbf5f, 0xaf57, 0x2090, 0x73b9, 0xdb6d, 0xd86c, 0x552a, + 0xf6fb, 0x4422, 0x6cb6, 0xfbfd, 0x148a, 0xa4d2, 0x9f4f, 0x0a85, + 0x6f37, 0xc160, 0x9148, 0x1a8d, 0x198c, 0xb55a, 0xf67b, 0x7f3f, + 0x85c2, 0x3319, 0x5bad, 0xc8e4, 0x77bb, 0xc3e1, 0xb85c, 0x2994, + 0xcbe5, 0x4da6, 0xf0f8, 0x5329, 0x2e17, 0xaad5, 0x0482, 0xa5d2, + 0x2c16, 0xb2d9, 0x371b, 0x8c46, 0x4d26, 0xd168, 0x47a3, 0xfe7f, + 0x7138, 0xf379, 0x0e07, 0xa9d4, 0x84c2, 0x0402, 0xea75, 0x4f27, + 0x9fcf, 0x0502, 0xc0e0, 0x7fbf, 0xeef7, 0x76bb, 0xa050, 0x1d8e, + 0x391c, 0xc361, 0xd269, 0x0d86, 0x572b, 0xafd7, 0xadd6, 0x70b8, + 0x7239, 0x90c8, 0xb95c, 0x7e3f, 0x98cc, 0x78bc, 0x4221, 0x87c3, + 0xc261, 0x3c1e, 0x6d36, 0xb6db, 0xbc5e, 0x40a0, 0x0281, 0xdbed, + 0x8040, 0x66b3, 0x0f07, 0xcc66, 0x7abd, 0x9ecf, 0xe472, 0x2592, + 0x6db6, 0xbbdd, 0x0783, 0xf47a, 0x80c0, 0x542a, 0xfb7d, 0x0a05, + 0x2291, 0xec76, 0x68b4, 0x83c1, 0x4b25, 0x8743, 0x1088, 0xf97c, + 0x562b, 0x8442, 0x783c, 0x8fc7, 0xab55, 0x7bbd, 0x94ca, 0x61b0, + 0x1008, 0xdaed, 0x1e0f, 0xf178, 
0x69b4, 0xa1d0, 0x763b, 0x9bcd +}; + +/* Range of input data for KASUMI is from 1 to 20000 bits */ +#define KASUMI_MIN_LEN 1 +#define KASUMI_MAX_LEN 20000 + +/* KASUMI cipher definitions */ +#define NUM_KASUMI_ROUNDS (8) /* 8 rounds in the kasumi spec */ +#define QWORDSIZEINBITS (64) +#define QWORDSIZEINBYTES (8) +#define LAST_PADDING_BIT (1) + +#define BYTESIZE (8) +#define BITSIZE(x) ((int)(sizeof(x)*BYTESIZE)) + +/*--------- 16 bit rotate left ------------------------------------------*/ +#define ROL16(a,b) (uint16_t)((a<<b)|(a>>(16-b))) + +/*----- a 64-bit structure to help with kasumi endian issues -----*/ +typedef union _ku64 { + uint64_t b64[1]; + uint32_t b32[2]; + uint16_t b16[4]; + uint8_t b8[8]; +} kasumi_union_t; + +typedef union SafeBuffer { + uint64_t b64; + uint32_t b32[2]; + uint8_t b8[KASUMI_BLOCK_SIZE]; +} SafeBuf; + +/*--------------------------------------------------------------------- +* Inline 16-bit left rotation +*---------------------------------------------------------------------*/ + +#define ROL16(a,b) (uint16_t)((a<<b)|(a>>(16-b))) + +#define FIp1(data, key1, key2, key3) \ + do { \ + uint16_t datal, datah; \ + \ + (data) ^= (key1); \ + datal = LOOKUP16_SSE(sso_kasumi_S7e, (uint8_t)(data), 256); \ + datah = LOOKUP16_SSE(sso_kasumi_S9e, (data) >> 7, 512); \ + (data) = datal ^ datah; \ + (data) ^= (key2); \ + datal = LOOKUP16_SSE(sso_kasumi_S7e, (data) >> 9, 256); \ + datah = LOOKUP16_SSE(sso_kasumi_S9e, (data) & 0x1FF, 512); \ + (data) = datal ^ datah; \ + (data) ^= (key3); \ + } while (0) + +#define FIp2(data1, data2, key1, key2, key3, key4) \ + do { \ + FIp1(data1, key1, key2, key3); \ + FIp1(data2, key1, key2, key4); \ + } while (0) + +#define FLpi(key1, key2, res_h, res_l) \ + do { \ + uint16_t l, r; \ + r = (res_l) & (key1); \ + r = (res_h) ^ ROL16(r, 1); \ + l = r | (key2); \ + (res_h) = (res_l) ^ ROL16(l, 1); \ + (res_l) = r; \ + } while (0) + +#define FLp1(index, h, l) \ + do { \ + uint16_t ka = *(index + 0); \ + uint16_t kb = *(index + 1); \ + FLpi(ka, kb, h, l); \ + } while (0) + +#define FLp2(index, h1, l1, h2, l2) \ + do { \ + uint16_t ka = *(index + 0); \ + uint16_t kb = *(index + 1); \ + FLpi(ka, kb, h1, l1); \ + FLpi(ka, kb, h2, l2); \ + } while (0) + +#define FLp3(index, h1, l1, h2, l2, h3, l3) \ + do { \ + uint16_t ka = *(index + 0); \ + uint16_t kb = *(index + 1); \ + FLpi(ka, kb, h1, l1); \ + FLpi(ka, kb, h2, l2); \ + FLpi(ka, kb, h3, l3); \ + } while (0) + +#define FLp4(index, h1, l1, h2, l2, h3, l3, h4, l4) \ + do { \ + FLp2(index, h1, l1, h2, l2); \ + FLp2(index, h3, l3, h4, l4); \ + } while (0) + +#define FOp1(index, h, l) \ + do { \ + FIp1(h, *(index + 2), *(index + 3), l); \ + FIp1(l, *(index + 4), *(index + 5), h); \ + FIp1(h, *(index + 6), *(index + 7), l); \ + } while (0) + +#define FOp2(index, h1, l1, h2, l2) \ + do { \ + uint16_t ka = *(index + 2); \ + uint16_t kb = *(index + 3); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + ka = *(index + 4); \ + kb = *(index + 5); \ + FIp2(l1, l2, ka, kb, h1, h2); \ + ka = *(index + 6); \ + kb = *(index + 7); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + } while (0) + +#define FOp3(index, h1, l1, h2, l2, h3, l3) \ + do { \ + uint16_t ka = *(index + 2); \ + uint16_t kb = *(index + 3); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + FIp1(h3, ka, kb, l3); \ + ka = *(index + 4); \ + kb = *(index + 5); \ + FIp2(l1, l2, ka, kb, h1, h2); \ + FIp1(l3, ka, kb, h3); \ + ka = *(index + 6); \ + kb = *(index + 7); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + FIp1(h3, ka, kb, l3); \ + } while (0) + +#define FOp4(index, h1, l1, h2, l2, h3, l3, h4, l4) \ + do
{ \ + uint16_t ka = *(index + 2); \ + uint16_t kb = *(index + 3); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + FIp2(h3, h4, ka, kb, l3, l4); \ + ka = *(index + 4); \ + kb = *(index + 5); \ + FIp2(l1, l2, ka, kb, h1, h2); \ + FIp2(l3, l4, ka, kb, h3, h4); \ + ka = *(index + 6); \ + kb = *(index + 7); \ + FIp2(h1, h2, ka, kb, l1, l2); \ + FIp2(h3, h4, ka, kb, l3, l4); \ + } while (0) + +/** + ******************************************************************************* + * @description + * This function performs the Kasumi operation on the given block using the key + * that is already scheduled in the context + * + * @param[in] pContext Context where the scheduled keys are stored + * @param[in/out] pData Block to be enc/dec + * + ******************************************************************************/ +static void kasumi_1_block(const uint16_t *context, uint16_t *data) +{ + const uint16_t *end = context + KASUMI_KEY_SCHEDULE_SIZE; + uint16_t temp_l, temp_h; + + /* 4 iterations odd/even */ + do { + temp_l = data[3]; + temp_h = data[2]; + FLp1(context, temp_h, temp_l); + FOp1(context, temp_h, temp_l); + context += 8; + data[1] ^= temp_l; + data[0] ^= temp_h; + + temp_h = data[1]; + temp_l = data[0]; + FOp1(context, temp_h, temp_l); + FLp1(context, temp_h, temp_l); + context += 8; + data[3] ^= temp_h; + data[2] ^= temp_l; + } while (context < end); +} + +/** + ****************************************************************************** + * @description + * This function performs the Kasumi operation on the given blocks using the key + * that is already scheduled in the context + * + * @param[in] pContext Context where the scheduled keys are stored + * @param[in/out] pData1 First block to be enc/dec + * @param[in/out] pData2 Second block to be enc/dec + * + ******************************************************************************/ +static void +kasumi_2_blocks(const uint16_t *context, uint16_t *data1, uint16_t *data2) +{ + const uint16_t *end = context + KASUMI_KEY_SCHEDULE_SIZE; + uint16_t temp1_l, temp1_h; + uint16_t temp2_l, temp2_h; + + /* 4 iterations odd/even , with fine grain interleave */ + do { + /* even */ + temp1_l = data1[3]; + temp1_h = data1[2]; + temp2_l = data2[3]; + temp2_h = data2[2]; + FLp2(context, temp1_h, temp1_l, temp2_h, temp2_l); + FOp2(context, temp1_h, temp1_l, temp2_h, temp2_l); + context += 8; + data1[1] ^= temp1_l; + data1[0] ^= temp1_h; + data2[1] ^= temp2_l; + data2[0] ^= temp2_h; + + /* odd */ + temp1_h = data1[1]; + temp1_l = data1[0]; + temp2_h = data2[1]; + temp2_l = data2[0]; + FOp2(context, temp1_h, temp1_l, temp2_h, temp2_l); + FLp2(context, temp1_h, temp1_l, temp2_h, temp2_l); + context += 8; + data1[3] ^= temp1_h; + data1[2] ^= temp1_l; + data2[3] ^= temp2_h; + data2[2] ^= temp2_l; + } while (context < end); +} + + +/** + ******************************************************************************* + * @description + * This function performs the Kasumi operation on the given blocks using the key + * that is already scheduled in the context + * + * @param[in] pContext Context where the scheduled keys are stored + * @param[in/out] pData1 First block to be enc/dec + * @param[in/out] pData2 Second block to be enc/dec + * @param[in/out] pData3 Third block to be enc/dec + * + ******************************************************************************/ +static void +kasumi_3_blocks(const uint16_t *context, uint16_t *data1, + uint16_t *data2, uint16_t *data3) +{ + /* Case when the conmpiler is able to interleave efficiently */ + const uint16_t *end 
= context + KASUMI_KEY_SCHEDULE_SIZE; + uint16_t temp1_l, temp1_h; + uint16_t temp2_l, temp2_h; + uint16_t temp3_l, temp3_h; + + /* 4 iterations odd/even , with fine grain interleave */ + do { + temp1_l = data1[3]; + temp1_h = data1[2]; + temp2_l = data2[3]; + temp2_h = data2[2]; + temp3_l = data3[3]; + temp3_h = data3[2]; + FLp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h, + temp3_l); + FOp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h, + temp3_l); + context += 8; + data1[1] ^= temp1_l; + data1[0] ^= temp1_h; + data2[1] ^= temp2_l; + data2[0] ^= temp2_h; + data3[1] ^= temp3_l; + data3[0] ^= temp3_h; + + temp1_h = data1[1]; + temp1_l = data1[0]; + temp2_h = data2[1]; + temp2_l = data2[0]; + temp3_h = data3[1]; + temp3_l = data3[0]; + FOp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h, + temp3_l); + FLp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h, + temp3_l); + context += 8; + data1[3] ^= temp1_h; + data1[2] ^= temp1_l; + data2[3] ^= temp2_h; + data2[2] ^= temp2_l; + data3[3] ^= temp3_h; + data3[2] ^= temp3_l; + } while (context < end); +} + +/** + ******************************************************************************* + * @description + * This function performs the Kasumi operation on the given blocks using the key + * that is already scheduled in the context + * + * @param[in] pContext Context where the scheduled keys are stored + * @param[in] ppData Pointer to an array of addresses of blocks + * + ******************************************************************************/ +static void +kasumi_4_blocks(const uint16_t *context, uint16_t **ppData) +{ + /* Case when the conmpiler is unable to interleave efficiently */ + kasumi_2_blocks (context, ppData[0], ppData[1]); + kasumi_2_blocks (context, ppData[2], ppData[3]); +} + +/** + ****************************************************************************** + * @description + * This function performs the Kasumi operation on the given blocks using the key + * that is already scheduled in the context + * + * @param[in] pContext Context where the scheduled keys are stored + * @param[in] ppData Pointer to an array of addresses of blocks + * + ******************************************************************************/ +static void +kasumi_8_blocks(const uint16_t *context, uint16_t **ppData) +{ + kasumi_4_blocks (context, &ppData[0]); + kasumi_4_blocks (context, &ppData[4]); +} + +/****************************************************************************** +* @description +* Multiple wrappers for the Kasumi rounds on up to 16 blocks of 64 bits at a +*time. +* +* Depending on the variable packet lengths, different wrappers get called. +* It has been measured that 1 packet is faster than 2, 2 packets is faster +*than 3 +* 3 packets is faster than 4, and so on ... +* It has also been measured that 6 = 4+2 packets is faster than 8 +* It has also been measured that 7 packets are processed faster as 8 packets, +* +* If the assumptions are not verified, it is easy to implmement +* the right function and reference it in wrapperArray. 
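+*
+* Illustrative dispatch sketch (not upstream code): callers index the
+* kasumiWrapperArray[] table defined below directly by the number of 64-bit
+* blocks to process in one pass (1..16, entry 0 is NULL), e.g.:
+*
+*   uint16_t *blocks[16];  /* pointers to up to 16 eight-byte blocks */
+*   unsigned n = 5;        /* this pass processes 5 blocks           */
+*   kasumiWrapperArray[n](pCtx->sk16, blocks);  /* n == 5 runs 4 + 1 blocks */
+*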
+* +*******************************************************************************/ +static void +kasumi_f8_1_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_1_block(context, data[0]); +} + +static void +kasumi_f8_2_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_2_blocks(context, data[0], data[1]); +} + +static void +kasumi_f8_3_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_3_blocks(context, data[0], data[1], data[2]); +} + +static void +kasumi_f8_5_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_4_blocks(context, &data[0]); + kasumi_1_block(context, data[4]); +} + +static void +kasumi_f8_6_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + /* It is also assumed 6 = 4+2 packets is faster than 8 */ + kasumi_4_blocks(context, &data[0]); + kasumi_2_blocks(context, data[4], data[5]); +} + +static void +kasumi_f8_7_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_4_blocks(context, &data[0]); + kasumi_3_blocks(context, data[4], data[5], data[6]); +} + +static void +kasumi_f8_9_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + + kasumi_8_blocks(context, &data[0]); + kasumi_1_block(context, data[8]); +} + +static void +kasumi_f8_10_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_2_blocks(context, data[8], data[9]); +} + +static void +kasumi_f8_11_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_3_blocks(context, data[8], data[9], data[10]); +} + +static void +kasumi_f8_12_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_4_blocks(context, &data[8]); +} + +static void +kasumi_f8_13_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + + kasumi_8_blocks(context, &data[0]); + kasumi_4_blocks(context, &data[8]); + kasumi_1_block(context, data[12]); +} + +static void +kasumi_f8_14_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_4_blocks(context, &data[8]); + kasumi_2_blocks(context, data[12], data[13]); +} + +static void +kasumi_f8_15_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_4_blocks(context, &data[8]); + kasumi_3_blocks(context, data[12], data[13], data[14]); +} + +static void +kasumi_f8_16_buffer_wrapper(const uint16_t *context, uint16_t **data) +{ + kasumi_8_blocks(context, &data[0]); + kasumi_8_blocks(context, &data[8]); +} + +typedef void (*kasumi_wrapper_t)(const uint16_t *, uint16_t **); + +static kasumi_wrapper_t kasumiWrapperArray[] = { + NULL, + kasumi_f8_1_buffer_wrapper, + kasumi_f8_2_buffer_wrapper, + kasumi_f8_3_buffer_wrapper, + kasumi_4_blocks, + kasumi_f8_5_buffer_wrapper, + kasumi_f8_6_buffer_wrapper, + kasumi_f8_7_buffer_wrapper, + kasumi_8_blocks, + kasumi_f8_9_buffer_wrapper, + kasumi_f8_10_buffer_wrapper, + kasumi_f8_11_buffer_wrapper, + kasumi_f8_12_buffer_wrapper, + kasumi_f8_13_buffer_wrapper, + kasumi_f8_14_buffer_wrapper, + kasumi_f8_15_buffer_wrapper, + kasumi_f8_16_buffer_wrapper}; + +/*--------------------------------------------------------------------- +* kasumi_key_schedule_sk() +* Build the key schedule. Most "key" operations use 16-bit +* +* Context is a flat array of 64 uint16. The context is built in the same order +* it will be used. 
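+*
+* Layout sketch (derived from the FLp*/FOp* macros above, illustrative only):
+* for round n the eight uint16_t slots context[8*n + 0..7] are consumed as
+*
+*   context[8n+0..8n+1] -> FL key pair (FLp1 reads index+0 and index+1)
+*   context[8n+2..8n+7] -> the three FI key pairs of FO (FOp1 reads index+2..+7)
+*
+* giving 8 rounds * 8 = 64 uint16_t values, which is what kasumi_1_block()
+* walks through via KASUMI_KEY_SCHEDULE_SIZE.
+*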
+*---------------------------------------------------------------------*/ +static inline void +kasumi_key_schedule_sk(uint16_t *context, const void *pKey) +{ + + /* Kasumi constants*/ + static const uint16_t C[] = {0x0123, 0x4567, 0x89AB, 0xCDEF, + 0xFEDC, 0xBA98, 0x7654, 0x3210}; + + uint16_t k[8], kprime[8], n; + const uint8_t *pk = (const uint8_t *) pKey; + + /* Build K[] and K'[] keys */ + for (n = 0; n < 8; n++, pk += 2) { + k[n] = (pk[0] << 8) + pk[1]; + kprime[n] = k[n] ^ C[n]; + } + + /* + * Finally construct the various sub keys [Kli1, KlO ...) in the right + * order for easy usage at run-time + */ + for (n = 0; n < 8; n++) { + context[0] = ROL16(k[n], 1); + context[1] = kprime[(n + 2) & 0x7]; + context[2] = ROL16(k[(n + 1) & 0x7], 5); + context[3] = kprime[(n + 4) & 0x7]; + context[4] = ROL16(k[(n + 5) & 0x7], 8); + context[5] = kprime[(n + 3) & 0x7]; + context[6] = ROL16(k[(n + 6) & 0x7], 13); + context[7] = kprime[(n + 7) & 0x7]; + context += 8; + } +#ifdef SAFE_DATA + clear_mem(k, sizeof(k)); + clear_mem(kprime, sizeof(kprime)); +#endif +} + +/*--------------------------------------------------------------------- +* kasumi_compute_sched() +* Generic ksaumi key sched init function. +* +*---------------------------------------------------------------------*/ +static inline int +kasumi_compute_sched(const uint8_t modifier, + const void *const pKey, void *pCtx) +{ +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pKey == NULL || pCtx == NULL) + return -1; +#endif + uint32_t i = 0; + const uint8_t *const key = (const uint8_t * const)pKey; + uint8_t ModKey[KASUMI_KEY_SIZE] = {0}; /* Modified key */ + kasumi_key_sched_t *pLocalCtx = (kasumi_key_sched_t *)pCtx; + + /* Construct the modified key*/ + for (i = 0; i < KASUMI_KEY_SIZE; i++) + ModKey[i] = (uint8_t)key[i] ^ modifier; + + kasumi_key_schedule_sk(pLocalCtx->sk16, pKey); + kasumi_key_schedule_sk(pLocalCtx->msk16, ModKey); + +#ifdef SAFE_DATA + clear_mem(ModKey, sizeof(ModKey)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif + return 0; +} + +/*--------------------------------------------------------------------- +* kasumi_key_sched_size() +* Get the size of a kasumi key sched context. +* +*---------------------------------------------------------------------*/ +static inline size_t +kasumi_key_sched_size(void) +{ + /* + * There are two keys that need to be scheduled: the original one and + * the modified one (xored with the relevant modifier) + */ + return sizeof(kasumi_key_sched_t); +} + +/*--------------------------------------------------------------------- +* kasumi_init_f8_key_sched() +* Compute the kasumi f8 key schedule. +* +*---------------------------------------------------------------------*/ + +static inline int +kasumi_init_f8_key_sched(const void *const pKey, + kasumi_key_sched_t *pCtx) +{ + return kasumi_compute_sched(0x55, pKey, pCtx); +} + +/*--------------------------------------------------------------------- +* kasumi_init_f9_key_sched() +* Compute the kasumi f9 key schedule. 
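+*
+* Usage sketch (illustrative only): each kasumi_key_sched_t holds two
+* schedules, one built from the key as given (sk16) and one from the key
+* XORed with a modifier (msk16) - 0x55 for f8 above, 0xAA for f9 here.
+* A typical caller, using the buffer routines defined below:
+*
+*   kasumi_key_sched_t sched;
+*   if (kasumi_init_f8_key_sched(key, &sched) == 0)
+*       kasumi_f8_1_buffer(&sched, iv, in, out, len_in_bytes);
+*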
+* +*---------------------------------------------------------------------*/ + +static inline int +kasumi_init_f9_key_sched(const void *const pKey, + kasumi_key_sched_t *pCtx) +{ + return kasumi_compute_sched(0xAA, pKey, pCtx); +} + +size_t +kasumi_key_sched_size_sse(void); + +int +kasumi_init_f8_key_sched_sse(const void *pKey, kasumi_key_sched_t *pCtx); + +int +kasumi_init_f9_key_sched_sse(const void *pKey, kasumi_key_sched_t *pCtx); + +size_t +kasumi_key_sched_size_avx(void); + +int +kasumi_init_f8_key_sched_avx(const void *pKey, kasumi_key_sched_t *pCtx); + +int +kasumi_init_f9_key_sched_avx(const void *pKey, kasumi_key_sched_t *pCtx); + + +static inline void +kasumi_f8_1_buffer(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pIn, void *pOut, + const uint32_t length) +{ + uint32_t blkcnt; + kasumi_union_t a, b; /* the modifier */ + SafeBuf safeInBuf; + const uint8_t *pBufferIn = (const uint8_t *) pIn; + uint8_t *pBufferOut = (uint8_t *) pOut; + uint32_t lengthInBytes = length; + + /* IV Endianity */ + a.b64[0] = BSWAP64(IV); + + /* First encryption to create modifier */ + kasumi_1_block(pCtx->msk16, a.b16 ); + + /* Final initialisation steps */ + blkcnt = 0; + b.b64[0] = a.b64[0]; + + /* Now run the block cipher */ + while (lengthInBytes) { + /* KASUMI it to produce the next block of keystream */ + kasumi_1_block(pCtx->sk16, b.b16 ); + + if (lengthInBytes > KASUMI_BLOCK_SIZE) { + pBufferIn = xor_keystrm_rev(pBufferOut, pBufferIn, + b.b64[0]); + pBufferOut += KASUMI_BLOCK_SIZE; + /* loop variant */ + /* done another 64 bits */ + lengthInBytes -= KASUMI_BLOCK_SIZE; + + /* apply the modifier and update the block count */ + b.b64[0] ^= a.b64[0]; + b.b16[0] ^= (uint16_t)++blkcnt; + } else if (lengthInBytes < KASUMI_BLOCK_SIZE) { + /* end of the loop, handle the last bytes */ + memcpy_keystrm(safeInBuf.b8, pBufferIn, + lengthInBytes); + xor_keystrm_rev(b.b8, safeInBuf.b8, b.b64[0]); + memcpy_keystrm(pBufferOut, b.b8, lengthInBytes); + lengthInBytes = 0; + /* lengthInBytes == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut, pBufferIn, b.b64[0]); + lengthInBytes = 0; + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a, sizeof(a)); + clear_mem(&b, sizeof(b)); + clear_mem(&safeInBuf, sizeof(safeInBuf)); +#endif +} + +static inline void +preserve_bits(kasumi_union_t *c, + const uint8_t *pcBufferOut, const uint8_t *pcBufferIn, + SafeBuf *safeOutBuf, SafeBuf *safeInBuf, + const uint8_t bit_len, const uint8_t byte_len) +{ + const uint64_t mask = UINT64_MAX << (KASUMI_BLOCK_SIZE * 8 - bit_len); + + /* Clear the last bits of the keystream and the input + * (input only in out-of-place case) */ + c->b64[0] &= mask; + if (pcBufferIn != pcBufferOut) { + const uint64_t swapMask = BSWAP64(mask); + + safeInBuf->b64 &= swapMask; + + /* + * Merge the last bits from the output, to be preserved, + * in the keystream, to be XOR'd with the input + * (which last bits are 0, maintaining the output bits) + */ + memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len); + c->b64[0] |= BSWAP64(safeOutBuf->b64 & ~swapMask); + } +} + +static inline void +kasumi_f8_1_buffer_bit(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pIn, void *pOut, + const uint32_t lengthInBits, + const uint32_t offsetInBits) +{ + const uint8_t *pBufferIn = (const uint8_t *) pIn; + uint8_t *pBufferOut = (uint8_t *) pOut; + uint32_t cipherLengthInBits = lengthInBits; + uint32_t blkcnt; + uint64_t shiftrem = 0; + kasumi_union_t a, b, c; /* the modifier */ + const uint8_t 
*pcBufferIn = pBufferIn + (offsetInBits / 8); + uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8); + /* Offset into the first byte (0 - 7 bits) */ + uint32_t remainOffset = offsetInBits % 8; + uint32_t byteLength = (cipherLengthInBits + 7) / 8; + SafeBuf safeOutBuf; + SafeBuf safeInBuf; + + /* IV Endianity */ + a.b64[0] = BSWAP64(IV); + + /* First encryption to create modifier */ + kasumi_1_block(pCtx->msk16, a.b16); + + /* Final initialisation steps */ + blkcnt = 0; + b.b64[0] = a.b64[0]; + /* Now run the block cipher */ + + /* Start with potential partial block (due to offset and length) */ + kasumi_1_block(pCtx->sk16, b.b16); + c.b64[0] = b.b64[0] >> remainOffset; + /* Only one block to encrypt */ + if (cipherLengthInBits < (64 - remainOffset)) { + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength); + /* + * If operation is Out-of-place and there is offset + * to be applied, "remainOffset" bits from the output buffer + * need to be preserved (only applicable to first byte, + * since remainOffset is up to 7 bits) + */ + if ((pIn != pOut) && remainOffset) { + const uint8_t mask8 = + (const uint8_t)(1 << (8 - remainOffset)) - 1; + + safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) | + (pcBufferOut[0] & ~mask8); + } + + /* If last byte is a partial byte, the last bits of the output + * need to be preserved */ + const uint8_t bitlen_with_off = remainOffset + + cipherLengthInBits; + + if ((bitlen_with_off & 0x7) != 0) { + preserve_bits(&c, pcBufferOut, pcBufferIn, &safeOutBuf, + &safeInBuf, bitlen_with_off, byteLength); + } + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, c.b64[0]); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + return; + } + + /* + * If operation is Out-of-place and there is offset + * to be applied, "remainOffset" bits from the output buffer + * need to be preserved (only applicable to first byte, + * since remainOffset is up to 7 bits) + */ + if ((pIn != pOut) && remainOffset) { + const uint8_t mask8 = + (const uint8_t)(1 << (8 - remainOffset)) - 1; + + memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8); + safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) | + (pcBufferOut[0] & ~mask8); + xor_keystrm_rev(pcBufferOut, safeInBuf.b8, c.b64[0]); + pcBufferIn += KASUMI_BLOCK_SIZE; + } else { + /* At least 64 bits to produce (including offset) */ + pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, c.b64[0]); + } + + if (remainOffset != 0) + shiftrem = b.b64[0] << (64 - remainOffset); + cipherLengthInBits -= KASUMI_BLOCK_SIZE * 8 - remainOffset; + pcBufferOut += KASUMI_BLOCK_SIZE; + /* apply the modifier and update the block count */ + b.b64[0] ^= a.b64[0]; + b.b16[0] ^= (uint16_t)++blkcnt; + + while (cipherLengthInBits) { + /* KASUMI it to produce the next block of keystream */ + kasumi_1_block(pCtx->sk16, b.b16); + c.b64[0] = (b.b64[0] >> remainOffset) | shiftrem; + if (remainOffset != 0) + shiftrem = b.b64[0] << (64 - remainOffset); + if (cipherLengthInBits >= KASUMI_BLOCK_SIZE * 8) { + pcBufferIn = xor_keystrm_rev(pcBufferOut, + pcBufferIn, c.b64[0]); + cipherLengthInBits -= KASUMI_BLOCK_SIZE * 8; + pcBufferOut += KASUMI_BLOCK_SIZE; + /* loop variant */ + + /* apply the modifier and update the block count */ + b.b64[0] ^= a.b64[0]; + b.b16[0] ^= (uint16_t)++blkcnt; + } else { + /* end of the loop, handle the last bytes */ + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, + byteLength); + + /* If last byte is a partial byte, the last bits + * of the output need to be preserved */ + if 
((cipherLengthInBits & 0x7) != 0) + preserve_bits(&c, pcBufferOut, pcBufferIn, + &safeOutBuf, &safeInBuf, + cipherLengthInBits, byteLength); + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, c.b64[0]); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + cipherLengthInBits = 0; + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a, sizeof(a)); + clear_mem(&b, sizeof(b)); + clear_mem(&c, sizeof(c)); + clear_mem(&safeInBuf, sizeof(safeInBuf)); + clear_mem(&safeOutBuf, sizeof(safeOutBuf)); +#endif +} + +static inline void +kasumi_f8_2_buffer(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const void *pIn1, void *pOut1, + const uint32_t length1, + const void *pIn2, void *pOut2, + const uint32_t length2) +{ + const uint8_t *pBufferIn1 = (const uint8_t *) pIn1; + uint8_t *pBufferOut1 = (uint8_t *) pOut1; + uint32_t lengthInBytes1 = length1; + const uint8_t *pBufferIn2 = (const uint8_t *) pIn2; + uint8_t *pBufferOut2 = (uint8_t *) pOut2; + uint32_t lengthInBytes2 = length2; + uint32_t blkcnt, length; + kasumi_union_t a1, b1; /* the modifier */ + kasumi_union_t a2, b2; /* the modifier */ + SafeBuf safeInBuf; + + kasumi_union_t temp; + + /* IV Endianity */ + a1.b64[0] = BSWAP64(IV1); + a2.b64[0] = BSWAP64(IV2); + + kasumi_2_blocks(pCtx->msk16, a1.b16, a2.b16); + + /* Final initialisation steps */ + blkcnt = 0; + b1.b64[0] = a1.b64[0]; + b2.b64[0] = a2.b64[0]; + + /* check which packet is longer and save "common" shortest length */ + if (lengthInBytes1 > lengthInBytes2) + length = lengthInBytes2; + else + length = lengthInBytes1; + + /* Round down to to a whole number of qwords. (QWORDLENGTHINBYTES-1 */ + length &= ~7; + lengthInBytes1 -= length; + lengthInBytes2 -= length; + + /* Now run the block cipher for common packet length, a whole number of + * blocks */ + while (length) { + /* KASUMI it to produce the next block of keystream for both + * packets */ + kasumi_2_blocks(pCtx->sk16, b1.b16, b2.b16); + + /* xor and write keystream */ + pBufferIn1 = + xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]); + pBufferOut1 += KASUMI_BLOCK_SIZE; + pBufferIn2 = + xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]); + pBufferOut2 += KASUMI_BLOCK_SIZE; + /* loop variant */ + length -= KASUMI_BLOCK_SIZE; /* done another 64 bits */ + + /* apply the modifier and update the block count */ + b1.b64[0] ^= a1.b64[0]; + b1.b16[0] ^= (uint16_t)++blkcnt; + b2.b64[0] ^= a2.b64[0]; + b2.b16[0] ^= (uint16_t)blkcnt; + } + + /* + * Process common part at end of first packet and second packet. + * One of the packets has a length less than 8 bytes. 
+ */ + if (lengthInBytes1 > 0 && lengthInBytes2 > 0) { + /* final round for 1 of the packets */ + kasumi_2_blocks(pCtx->sk16, b1.b16, b2.b16); + if (lengthInBytes1 > KASUMI_BLOCK_SIZE) { + pBufferIn1 = xor_keystrm_rev(pBufferOut1, + pBufferIn1, b1.b64[0]); + pBufferOut1 += KASUMI_BLOCK_SIZE; + b1.b64[0] ^= a1.b64[0]; + b1.b16[0] ^= (uint16_t)++blkcnt; + lengthInBytes1 -= KASUMI_BLOCK_SIZE; + } else if (lengthInBytes1 < KASUMI_BLOCK_SIZE) { + memcpy_keystrm(safeInBuf.b8, pBufferIn1, + lengthInBytes1); + xor_keystrm_rev(temp.b8, safeInBuf.b8, b1.b64[0]); + memcpy_keystrm(pBufferOut1, temp.b8, + lengthInBytes1); + lengthInBytes1 = 0; + /* lengthInBytes1 == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]); + lengthInBytes1 = 0; + } + if (lengthInBytes2 > KASUMI_BLOCK_SIZE) { + pBufferIn2 = xor_keystrm_rev(pBufferOut2, + pBufferIn2, b2.b64[0]); + pBufferOut2 += KASUMI_BLOCK_SIZE; + b2.b64[0] ^= a2.b64[0]; + b2.b16[0] ^= (uint16_t)++blkcnt; + lengthInBytes2 -= KASUMI_BLOCK_SIZE; + } else if (lengthInBytes2 < KASUMI_BLOCK_SIZE) { + memcpy_keystrm(safeInBuf.b8, pBufferIn2, + lengthInBytes2); + xor_keystrm_rev(temp.b8, safeInBuf.b8, b2.b64[0]); + memcpy_keystrm(pBufferOut2, temp.b8, + lengthInBytes2); + lengthInBytes2 = 0; + /* lengthInBytes2 == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]); + lengthInBytes2 = 0; + } + } + + if (lengthInBytes1 < lengthInBytes2) { + /* packet 2 is not completed since lengthInBytes2 > 0 + * packet 1 has less than 8 bytes. + */ + if (lengthInBytes1) { + kasumi_1_block(pCtx->sk16, b1.b16); + xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]); + } + /* move pointers to right variables for packet 1 */ + lengthInBytes1 = lengthInBytes2; + b1.b64[0] = b2.b64[0]; + a1.b64[0] = a2.b64[0]; + pBufferIn1 = pBufferIn2; + pBufferOut1 = pBufferOut2; + } else { /* lengthInBytes1 >= lengthInBytes2 */ + if (!lengthInBytes1) + /* both packets are completed */ + return; + /* process the remaining of packet 2 */ + if (lengthInBytes2) { + kasumi_1_block(pCtx->sk16, b2.b16); + xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]); + } + /* packet 1 is not completed */ + } + + /* process the length difference from ipkt1 and pkt2 */ + while (lengthInBytes1) { + /* KASUMI it to produce the next block of keystream */ + kasumi_1_block(pCtx->sk16, b1.b16); + + if (lengthInBytes1 > KASUMI_BLOCK_SIZE) { + pBufferIn1 = xor_keystrm_rev(pBufferOut1, + pBufferIn1, b1.b64[0]); + pBufferOut1 += KASUMI_BLOCK_SIZE; + /* loop variant */ + lengthInBytes1 -= KASUMI_BLOCK_SIZE; + + /* apply the modifier and update the block count */ + b1.b64[0] ^= a1.b64[0]; + b1.b16[0] ^= (uint16_t)++blkcnt; + } else if (lengthInBytes1 < KASUMI_BLOCK_SIZE) { + /* end of the loop, handle the last bytes */ + memcpy_keystrm(safeInBuf.b8, pBufferIn1, + lengthInBytes1); + xor_keystrm_rev(temp.b8, safeInBuf.b8, b1.b64[0]); + memcpy_keystrm(pBufferOut1, temp.b8, + lengthInBytes1); + lengthInBytes1 = 0; + /* lengthInBytes1 == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]); + lengthInBytes1 = 0; + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a1, sizeof(a1)); + clear_mem(&b1, sizeof(b1)); + clear_mem(&a2, sizeof(a2)); + clear_mem(&b2, sizeof(b2)); + clear_mem(&temp, sizeof(temp)); + clear_mem(&safeInBuf, sizeof(safeInBuf)); +#endif +} + +static inline void +kasumi_f8_3_buffer(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, const uint64_t IV3, + const void *pIn1, 
void *pOut1, + const void *pIn2, void *pOut2, + const void *pIn3, void *pOut3, + const uint32_t length) +{ + const uint8_t *pBufferIn1 = (const uint8_t *) pIn1; + uint8_t *pBufferOut1 = (uint8_t *) pOut1; + const uint8_t *pBufferIn2 = (const uint8_t *) pIn2; + uint8_t *pBufferOut2 = (uint8_t *) pOut2; + const uint8_t *pBufferIn3 = (const uint8_t *) pIn3; + uint8_t *pBufferOut3 = (uint8_t *) pOut3; + uint32_t lengthInBytes = length; + uint32_t blkcnt; + kasumi_union_t a1, b1; /* the modifier */ + kasumi_union_t a2, b2; /* the modifier */ + kasumi_union_t a3, b3; /* the modifier */ + SafeBuf safeInBuf1, safeInBuf2, safeInBuf3; + + /* IV Endianity */ + a1.b64[0] = BSWAP64(IV1); + a2.b64[0] = BSWAP64(IV2); + a3.b64[0] = BSWAP64(IV3); + + kasumi_3_blocks(pCtx->msk16, a1.b16, a2.b16, a3.b16); + + /* Final initialisation steps */ + blkcnt = 0; + b1.b64[0] = a1.b64[0]; + b2.b64[0] = a2.b64[0]; + b3.b64[0] = a3.b64[0]; + + /* Now run the block cipher for common packet lengthInBytes, a whole + * number of blocks */ + while (lengthInBytes) { + /* KASUMI it to produce the next block of keystream for all the + * packets */ + kasumi_3_blocks(pCtx->sk16, b1.b16, b2.b16, b3.b16); + + if (lengthInBytes > KASUMI_BLOCK_SIZE) { + /* xor and write keystream */ + pBufferIn1 = xor_keystrm_rev(pBufferOut1, + pBufferIn1, b1.b64[0]); + pBufferOut1 += KASUMI_BLOCK_SIZE; + pBufferIn2 = xor_keystrm_rev(pBufferOut2, + pBufferIn2, b2.b64[0]); + pBufferOut2 += KASUMI_BLOCK_SIZE; + pBufferIn3 = xor_keystrm_rev(pBufferOut3, + pBufferIn3, b3.b64[0]); + pBufferOut3 += KASUMI_BLOCK_SIZE; + /* loop variant */ + lengthInBytes -= KASUMI_BLOCK_SIZE; + + /* apply the modifier and update the block count */ + b1.b64[0] ^= a1.b64[0]; + b1.b16[0] ^= (uint16_t)++blkcnt; + b2.b64[0] ^= a2.b64[0]; + b2.b16[0] ^= (uint16_t)blkcnt; + b3.b64[0] ^= a3.b64[0]; + b3.b16[0] ^= (uint16_t)blkcnt; + } else if (lengthInBytes < KASUMI_BLOCK_SIZE) { + /* end of the loop, handle the last bytes */ + memcpy_keystrm(safeInBuf1.b8, pBufferIn1, + lengthInBytes); + xor_keystrm_rev(b1.b8, safeInBuf1.b8, b1.b64[0]); + memcpy_keystrm(pBufferOut1, b1.b8, lengthInBytes); + + memcpy_keystrm(safeInBuf2.b8, pBufferIn2, + lengthInBytes); + xor_keystrm_rev(b2.b8, safeInBuf2.b8, b2.b64[0]); + memcpy_keystrm(pBufferOut2, b2.b8, lengthInBytes); + + memcpy_keystrm(safeInBuf3.b8, pBufferIn3, + lengthInBytes); + xor_keystrm_rev(b3.b8, safeInBuf3.b8, b3.b64[0]); + memcpy_keystrm(pBufferOut3, b3.b8, lengthInBytes); + lengthInBytes = 0; + /* lengthInBytes == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]); + xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]); + xor_keystrm_rev(pBufferOut3, pBufferIn3, b3.b64[0]); + lengthInBytes = 0; + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a1, sizeof(a1)); + clear_mem(&b1, sizeof(b1)); + clear_mem(&a2, sizeof(a2)); + clear_mem(&b2, sizeof(b2)); + clear_mem(&a3, sizeof(a3)); + clear_mem(&b3, sizeof(b3)); + clear_mem(&safeInBuf1, sizeof(safeInBuf1)); + clear_mem(&safeInBuf2, sizeof(safeInBuf2)); + clear_mem(&safeInBuf3, sizeof(safeInBuf3)); +#endif +} + +/*--------------------------------------------------------- +* @description +* Kasumi F8 4 packet: +* Four packets enc/dec with the same key schedule. 
+* The 4 Ivs are independent and are passed as an array of values +* The packets are separate, the datalength is common +*---------------------------------------------------------*/ + +static inline void +kasumi_f8_4_buffer(const kasumi_key_sched_t *pCtx, const uint64_t IV1, + const uint64_t IV2, const uint64_t IV3, const uint64_t IV4, + const void *pIn1, void *pOut1, + const void *pIn2, void *pOut2, + const void *pIn3, void *pOut3, + const void *pIn4, void *pOut4, + const uint32_t length) +{ + const uint8_t *pBufferIn1 = (const uint8_t *) pIn1; + uint8_t *pBufferOut1 = (uint8_t *) pOut1; + const uint8_t *pBufferIn2 = (const uint8_t *) pIn2; + uint8_t *pBufferOut2 = (uint8_t *) pOut2; + const uint8_t *pBufferIn3 = (const uint8_t *) pIn3; + uint8_t *pBufferOut3 = (uint8_t *) pOut3; + const uint8_t *pBufferIn4 = (const uint8_t *) pIn4; + uint8_t *pBufferOut4 = (uint8_t *) pOut4; + uint32_t lengthInBytes = length; + uint32_t blkcnt; + kasumi_union_t a1, b1; /* the modifier */ + kasumi_union_t a2, b2; /* the modifier */ + kasumi_union_t a3, b3; /* the modifier */ + kasumi_union_t a4, b4; /* the modifier */ + uint16_t *pTemp[4] = {b1.b16, b2.b16, b3.b16, b4.b16}; + SafeBuf safeInBuf1, safeInBuf2, safeInBuf3, safeInBuf4; + + /* IV Endianity */ + b1.b64[0] = BSWAP64(IV1); + b2.b64[0] = BSWAP64(IV2); + b3.b64[0] = BSWAP64(IV3); + b4.b64[0] = BSWAP64(IV4); + + kasumi_4_blocks(pCtx->msk16, pTemp); + + /* Final initialisation steps */ + blkcnt = 0; + a1.b64[0] = b1.b64[0]; + a2.b64[0] = b2.b64[0]; + a3.b64[0] = b3.b64[0]; + a4.b64[0] = b4.b64[0]; + + /* Now run the block cipher for common packet lengthInBytes, a whole + * number of blocks */ + while (lengthInBytes) { + /* KASUMI it to produce the next block of keystream for all the + * packets */ + kasumi_4_blocks(pCtx->sk16, pTemp); + + if (lengthInBytes > KASUMI_BLOCK_SIZE) { + /* xor and write keystream */ + pBufferIn1 = xor_keystrm_rev(pBufferOut1, + pBufferIn1, b1.b64[0]); + pBufferOut1 += KASUMI_BLOCK_SIZE; + pBufferIn2 = xor_keystrm_rev(pBufferOut2, + pBufferIn2, b2.b64[0]); + pBufferOut2 += KASUMI_BLOCK_SIZE; + pBufferIn3 = xor_keystrm_rev(pBufferOut3, + pBufferIn3, b3.b64[0]); + pBufferOut3 += KASUMI_BLOCK_SIZE; + pBufferIn4 = xor_keystrm_rev(pBufferOut4, + pBufferIn4, b4.b64[0]); + pBufferOut4 += KASUMI_BLOCK_SIZE; + /* loop variant */ + lengthInBytes -= KASUMI_BLOCK_SIZE; + + /* apply the modifier and update the block count */ + b1.b64[0] ^= a1.b64[0]; + b1.b16[0] ^= (uint16_t)++blkcnt; + b2.b64[0] ^= a2.b64[0]; + b2.b16[0] ^= (uint16_t)blkcnt; + b3.b64[0] ^= a3.b64[0]; + b3.b16[0] ^= (uint16_t)blkcnt; + b4.b64[0] ^= a4.b64[0]; + b4.b16[0] ^= (uint16_t)blkcnt; + } else if (lengthInBytes < KASUMI_BLOCK_SIZE) { + /* end of the loop, handle the last bytes */ + memcpy_keystrm(safeInBuf1.b8, pBufferIn1, + lengthInBytes); + xor_keystrm_rev(b1.b8, safeInBuf1.b8, b1.b64[0]); + memcpy_keystrm(pBufferOut1, b1.b8, lengthInBytes); + + memcpy_keystrm(safeInBuf2.b8, pBufferIn2, + lengthInBytes); + xor_keystrm_rev(b2.b8, safeInBuf2.b8, b2.b64[0]); + memcpy_keystrm(pBufferOut2, b2.b8, lengthInBytes); + + memcpy_keystrm(safeInBuf3.b8, pBufferIn3, + lengthInBytes); + xor_keystrm_rev(b3.b8, safeInBuf3.b8, b3.b64[0]); + memcpy_keystrm(pBufferOut3, b3.b8, lengthInBytes); + + memcpy_keystrm(safeInBuf4.b8, pBufferIn4, + lengthInBytes); + xor_keystrm_rev(b4.b8, safeInBuf4.b8, b4.b64[0]); + memcpy_keystrm(pBufferOut4, b4.b8, lengthInBytes); + lengthInBytes = 0; + /* lengthInBytes == KASUMI_BLOCK_SIZE */ + } else { + xor_keystrm_rev(pBufferOut1, pBufferIn1, 
b1.b64[0]); + xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]); + xor_keystrm_rev(pBufferOut3, pBufferIn3, b3.b64[0]); + xor_keystrm_rev(pBufferOut4, pBufferIn4, b4.b64[0]); + lengthInBytes = 0; + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a1, sizeof(a1)); + clear_mem(&b1, sizeof(b1)); + clear_mem(&a2, sizeof(a2)); + clear_mem(&b2, sizeof(b2)); + clear_mem(&a3, sizeof(a3)); + clear_mem(&b3, sizeof(b3)); + clear_mem(&a4, sizeof(a4)); + clear_mem(&b4, sizeof(b4)); + clear_mem(&safeInBuf1, sizeof(safeInBuf1)); + clear_mem(&safeInBuf2, sizeof(safeInBuf2)); + clear_mem(&safeInBuf3, sizeof(safeInBuf3)); + clear_mem(&safeInBuf4, sizeof(safeInBuf4)); +#endif +} + +/*--------------------------------------------------------- +* @description +* Kasumi F8 2 packet: +* Two packets enc/dec with the same key schedule. +* The 2 Ivs are independent and are passed as an array of values. +* The packets are separate, the datalength is common +*---------------------------------------------------------*/ +/****************************************************************************** +* @description +* Kasumi F8 n packet: +* Performs F8 enc/dec on [n] packets. The operation is performed in-place. +* The input IV's are passed in Big Endian format. +* The KeySchedule is in Little Endian format. +*******************************************************************************/ + +static inline void +kasumi_f8_n_buffer(const kasumi_key_sched_t *pKeySchedule, const uint64_t IV[], + const void * const pIn[], void *pOut[], + const uint32_t lengths[], const uint32_t bufCount) +{ + if (bufCount > 16) { + pOut[0] = NULL; + printf("dataCount too high (%d)\n", bufCount); + return; + } + + uint32_t dataCount = bufCount; + kasumi_union_t A[NUM_PACKETS_16], temp[NUM_PACKETS_16], tempSort; + uint16_t *data[NUM_PACKETS_16]; + uint32_t dataLen[NUM_PACKETS_16]; + uint8_t *pDataOut[NUM_PACKETS_16] = {NULL}; + const uint8_t *pDataIn[NUM_PACKETS_16] = {NULL}; + const uint8_t *srctempbuff; + uint8_t *dsttempbuff; + uint32_t blkcnt = 0; + uint32_t len = 0; + uint32_t packet_idx, inner_idx, same_size_blocks; + int sortNeeded = 0, tempLen = 0; + SafeBuf safeInBuf; + + memcpy((void *)dataLen, lengths, dataCount * sizeof(uint32_t)); + memcpy((void *)pDataIn, pIn, dataCount * sizeof(void *)); + memcpy((void *)pDataOut, pOut, dataCount * sizeof(void *)); + + /* save the IV to A for each packet */ + packet_idx = dataCount; + while (packet_idx--) { + /*copy IV in reverse endian order as input IV is BE */ + temp[packet_idx].b64[0] = BSWAP64(IV[packet_idx]); + + /* set LE IV pointers */ + data[packet_idx] = temp[packet_idx].b16; + + /* check if all packets are sorted by decreasing length */ + if (packet_idx > 0 && + dataLen[packet_idx - 1] < dataLen[packet_idx]) + /* this packet array is not correctly sorted */ + sortNeeded = 1; + } + + /* do 1st kasumi block on A with modified key, this overwrites A */ + kasumiWrapperArray[dataCount](pKeySchedule->msk16, data); + + if (sortNeeded) { + /* sort packets in decreasing buffer size from [0] to [n]th + packet, + ** where buffer[0] will contain longest buffer and + buffer[n] will + contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths + */ + packet_idx = dataCount; + while (packet_idx--) { + inner_idx = packet_idx; + while (inner_idx--) { + if (dataLen[packet_idx] > dataLen[inner_idx]) { + + /* swap buffers to arrange in descending + * order from [0]. 
*/ + srctempbuff = pDataIn[packet_idx]; + dsttempbuff = pDataOut[packet_idx]; + tempSort = temp[packet_idx]; + tempLen = dataLen[packet_idx]; + + pDataIn[packet_idx] = + pDataIn[inner_idx]; + pDataOut[packet_idx] = + pDataOut[inner_idx]; + temp[packet_idx] = temp[inner_idx]; + dataLen[packet_idx] = + dataLen[inner_idx]; + + pDataIn[inner_idx] = srctempbuff; + pDataOut[inner_idx] = dsttempbuff; + temp[inner_idx] = tempSort; + dataLen[inner_idx] = tempLen; + } + } /* for inner packet idx (inner bubble-sort) */ + } /* for outer packet idx (outer bubble-sort) */ + } /* if sortNeeded */ + + packet_idx = dataCount; + while (packet_idx--) + /* copy the schedule */ + A[packet_idx].b64[0] = temp[packet_idx].b64[0]; + + while (dataCount > 0) { + /* max num of blocks left depends on roundUp(smallest packet), + * The shortest stream to process is always stored at location + * [dataCount - 1] + */ + same_size_blocks = + ((dataLen[dataCount - 1] + KASUMI_BLOCK_SIZE - 1) / + KASUMI_BLOCK_SIZE) - + blkcnt; + + /* process streams of complete blocks */ + while (same_size_blocks-- > 1) { + /* do kasumi block encryption */ + kasumiWrapperArray[dataCount](pKeySchedule->sk16, + data); + + packet_idx = dataCount; + while (packet_idx--) + xor_keystrm_rev(pDataOut[packet_idx] + len, + pDataIn[packet_idx] + len, + temp[packet_idx].b64[0]); + + /* length already done since the start of the packets */ + len += KASUMI_BLOCK_SIZE; + + /* block idx is incremented and rewritten in the + * keystream */ + blkcnt += 1; + packet_idx = dataCount; + while (packet_idx--) { + temp[packet_idx].b64[0] ^= A[packet_idx].b64[0]; + temp[packet_idx].b16[0] ^= (uint16_t)blkcnt; + } /* for packet_idx */ + + } /* while same_size_blocks (iteration on multiple blocks) */ + + /* keystream for last block of all packets */ + kasumiWrapperArray[dataCount](pKeySchedule->sk16, data); + + /* process incomplete blocks without overwriting past the buffer + * end */ + while ((dataCount > 0) && + (dataLen[dataCount - 1] < (len + KASUMI_BLOCK_SIZE))) { + + dataCount--; + /* incomplete block is copied into a temp buffer */ + memcpy_keystrm(safeInBuf.b8, pDataIn[dataCount] + len, + dataLen[dataCount] - len); + xor_keystrm_rev(temp[dataCount].b8, + safeInBuf.b8, + temp[dataCount].b64[0]); + + memcpy_keystrm(pDataOut[dataCount] + len, + temp[dataCount].b8, + dataLen[dataCount] - len); + } /* while dataCount */ + + /* process last blocks: it can be the last complete block of the + packets or, if + KASUMI_SAFE_BUFFER is defined, the last block (complete or not) + of the packets*/ + while ((dataCount > 0) && + (dataLen[dataCount - 1] <= (len + KASUMI_BLOCK_SIZE))) { + + dataCount--; + xor_keystrm_rev(pDataOut[dataCount] + len, + pDataIn[dataCount] + len, + temp[dataCount].b64[0]); + } /* while dataCount */ + /* block idx is incremented and rewritten in the keystream */ + blkcnt += 1; + + /* for the following packets, this block is not the last one: + dataCount is not decremented */ + packet_idx = dataCount; + while (packet_idx--) { + + xor_keystrm_rev(pDataOut[packet_idx] + len, + pDataIn[packet_idx] + len, + temp[packet_idx].b64[0]); + temp[packet_idx].b64[0] ^= A[packet_idx].b64[0]; + temp[packet_idx].b16[0] ^= (uint16_t)blkcnt; + } /* while packet_idx */ + + /* length already done since the start of the packets */ + len += KASUMI_BLOCK_SIZE; + + /* the remaining packets, if any, have now at least one valid + block, which might be complete or not */ + + } /* while (dataCount) */ +#ifdef SAFE_DATA + uint32_t i; + + /* Clear sensitive data in stack */ + for 
(i = 0; i < dataCount; i++) { + clear_mem(&A[i], sizeof(A[i])); + clear_mem(&temp[i], sizeof(temp[i])); + } + clear_mem(&tempSort, sizeof(tempSort)); + clear_mem(&safeInBuf, sizeof(safeInBuf)); +#endif +} + +static inline void +kasumi_f9_1_buffer(const kasumi_key_sched_t *pCtx, const void *dataIn, + const uint32_t length, void *pDigest) +{ + kasumi_union_t a, b, mask; + const uint64_t *pIn = (const uint64_t *)dataIn; + uint32_t lengthInBytes = length; + SafeBuf safeBuf; + + /* Init */ + a.b64[0] = 0; + b.b64[0] = 0; + mask.b64[0] = -1; + + /* Now run kasumi for all 8 byte blocks */ + while (lengthInBytes >= 8) { + + a.b64[0] ^= BSWAP64(*(pIn++)); + + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + + /* loop variant */ + lengthInBytes -= 8; /* done another 64 bits */ + + /* update */ + b.b64[0] ^= a.b64[0]; + } + + if (lengthInBytes) { + /* Not a whole 8 byte block remaining */ + mask.b64[0] = ~(mask.b64[0] >> (BYTESIZE * lengthInBytes)); + memcpy(&safeBuf.b64, pIn, lengthInBytes); + mask.b64[0] &= BSWAP64(safeBuf.b64); + a.b64[0] ^= mask.b64[0]; + + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + + /* update */ + b.b64[0] ^= a.b64[0]; + } + + /* Kasumi b */ + kasumi_1_block(pCtx->msk16, b.b16); + + /* swap result */ + *(uint32_t *)pDigest = bswap4(b.b32[1]); +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a, sizeof(a)); + clear_mem(&b, sizeof(b)); + clear_mem(&mask, sizeof(mask)); + clear_mem(&safeBuf, sizeof(safeBuf)); +#endif +} + +/*--------------------------------------------------------- +* @description +* Kasumi F9 1 packet with user config: +* Single packet digest with user defined IV, and precomputed key schedule. +* +* IV = swap32(count) << 32 | swap32(fresh) +* +*---------------------------------------------------------*/ + +static inline void +kasumi_f9_1_buffer_user(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pDataIn, const uint32_t length, + void *pDigest, const uint32_t direction) +{ + kasumi_union_t a, b, mask, message, temp; + uint32_t lengthInBits = length; + const uint64_t *pIn = (const uint64_t *)pDataIn; + kasumi_union_t safebuff; + + a.b64[0] = 0; + b.b64[0] = 0; + + /* Use the count and fresh for first round */ + a.b64[0] = BSWAP64(IV); + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + /* update */ + b.b64[0] = a.b64[0]; + + /* Now run kasumi for all 8 byte blocks */ + while (lengthInBits >= QWORDSIZEINBITS) { + a.b64[0] ^= BSWAP64(*(pIn++)); + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + /* loop variant */ + lengthInBits -= 64; /* done another 64 bits */ + /* update */ + b.b64[0] ^= a.b64[0]; + } + + /* Is there any non 8 byte blocks remaining ? 
*/ + if (lengthInBits == 0) { + /* last block is : direct + 1 + 62 0's */ + a.b64[0] ^= ((uint64_t)direction + direction + LAST_PADDING_BIT) + << (QWORDSIZEINBITS - 2); + kasumi_1_block(pCtx->sk16, a.b16); + /* update */ + b.b64[0] ^= a.b64[0]; + } else if (lengthInBits <= (QWORDSIZEINBITS - 2)) { + /* last block is : message + direction + LAST_PADDING_BITS(1) + + * less than 62 0's */ + mask.b64[0] = -1; + temp.b64[0] = 0; + message.b64[0] = 0; + mask.b64[0] = ~(mask.b64[0] >> lengthInBits); + /*round up and copy last lengthInBits */ + memcpy(&safebuff.b64[0], pIn, (lengthInBits + 7) / 8); + message.b64[0] = BSWAP64(safebuff.b64[0]); + temp.b64[0] = mask.b64[0] & message.b64[0]; + temp.b64[0] |= + ((uint64_t)direction + direction + LAST_PADDING_BIT) + << ((QWORDSIZEINBITS - 2) - lengthInBits); + a.b64[0] ^= temp.b64[0]; + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + + /* update */ + b.b64[0] ^= a.b64[0]; + } else if (lengthInBits == (QWORDSIZEINBITS - 1)) { + /* next block is : message + direct */ + /* last block is : 1 + 63 0's */ + a.b64[0] ^= direction | (~1 & BSWAP64(*(pIn++))); + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + /* update */ + b.b64[0] ^= a.b64[0]; + a.b8[QWORDSIZEINBYTES - 1] ^= (LAST_PADDING_BIT) + << (QWORDSIZEINBYTES - 1); + /* KASUMI it */ + kasumi_1_block(pCtx->sk16, a.b16); + /* update */ + b.b64[0] ^= a.b64[0]; + } + /* Kasumi b */ + kasumi_1_block(pCtx->msk16, b.b16); + + /* swap result */ + *(uint32_t *)pDigest = bswap4(b.b32[1]); +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(&a, sizeof(a)); + clear_mem(&b, sizeof(b)); + clear_mem(&mask, sizeof(mask)); + clear_mem(&message, sizeof(message)); + clear_mem(&temp, sizeof(temp)); + clear_mem(&safebuff, sizeof(safebuff)); +#endif +} + +void kasumi_f8_1_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pBufferIn, void *pBufferOut, + const uint32_t cipherLengthInBytes); + +void kasumi_f8_1_buffer_bit_sse(const kasumi_key_sched_t *pCtx, + const uint64_t IV, + const void *pBufferIn, void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void kasumi_f8_2_buffer_sse(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2); + +void kasumi_f8_3_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV1, + const uint64_t IV2, const uint64_t IV3, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes); + +void kasumi_f8_4_buffer_sse(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const uint64_t IV3, const uint64_t IV4, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes); + +void kasumi_f8_n_buffer_sse(const kasumi_key_sched_t *pKeySchedule, + const uint64_t IV[], + const void * const pDataIn[], void *pDataOut[], + const uint32_t dataLen[], const uint32_t dataCount); + +void kasumi_f9_1_buffer_sse(const kasumi_key_sched_t *pCtx, + const void *pBufferIn, + const uint32_t lengthInBytes, void *pDigest); + +void kasumi_f9_1_buffer_user_sse(const kasumi_key_sched_t *pCtx, + const uint64_t IV, const void *pBufferIn, + const uint32_t lengthInBits, + void *pDigest, 
const uint32_t direction); + + +void kasumi_f8_1_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pBufferIn, void *pBufferOut, + const uint32_t cipherLengthInBytes); +void kasumi_f8_1_buffer_bit_avx(const kasumi_key_sched_t *pCtx, + const uint64_t IV, + const void *pBufferIn, void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); +void kasumi_f8_2_buffer_avx(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2); +void kasumi_f8_3_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV1, + const uint64_t IV2, const uint64_t IV3, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes); +void kasumi_f8_4_buffer_avx(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const uint64_t IV3, const uint64_t IV4, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes); +void kasumi_f8_n_buffer_avx(const kasumi_key_sched_t *pKeySchedule, + const uint64_t IV[], + const void * const pDataIn[], void *pDataOut[], + const uint32_t dataLen[], const uint32_t dataCount); + +void kasumi_f9_1_buffer_avx(const kasumi_key_sched_t *pCtx, + const void *pBufferIn, + const uint32_t lengthInBytes, void *pDigest); + +void kasumi_f9_1_buffer_user_avx(const kasumi_key_sched_t *pCtx, + const uint64_t IV, const void *pBufferIn, + const uint32_t lengthInBits, + void *pDigest, const uint32_t direction); +#endif /*_KASUMI_INTERNAL_H_*/ + diff --git a/src/spdk/intel-ipsec-mb/include/memcpy.asm b/src/spdk/intel-ipsec-mb/include/memcpy.asm new file mode 100644 index 000000000..82e4f2cb2 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/memcpy.asm @@ -0,0 +1,613 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%ifndef __MEMCPY_ASM__ +%define __MEMCPY_ASM__ + +%include "include/reg_sizes.asm" + + +; This section defines a series of macros to copy small to medium amounts +; of data from memory to memory, where the size is variable but limited. +; +; The macros are all called as: +; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3 +; with the parameters defined as: +; DST : register: pointer to dst (not modified) +; SRC : register: pointer to src (not modified) +; SIZE : register: length in bytes (not modified) +; TMP0 : 64-bit temp GPR (clobbered) +; TMP1 : 64-bit temp GPR (clobbered) +; XTMP0 : temp XMM (clobbered) +; XTMP1 : temp XMM (clobbered) +; XTMP2 : temp XMM (clobbered) +; XTMP3 : temp XMM (clobbered) +; +; The name indicates the options. The name is of the form: +; memcpy__ +; where: +; is either "sse" or "avx" or "avx2" +; is either "64" or "128" and defines largest value of SIZE +; is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0) +; is blank or "_ret". If blank, the code falls through. If "ret" +; it does a "ret" at the end +; +; For the avx2 versions, the temp XMM registers need to be YMM registers +; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as: +; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1 +; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3 +; +; For example: +; memcpy_sse_64 : SSE, 0 <= size < 64, falls through +; memcpy_avx_64_1 : AVX1, 1 <= size < 64, falls through +; memcpy_sse_128_ret : SSE, 0 <= size < 128, ends with ret +; mempcy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret +; + +%macro memcpy_sse_64 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0 +%endm + +%macro memcpy_sse_64_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0 +%endm + +%macro memcpy_sse_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0 +%endm + +%macro memcpy_sse_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0 +%endm + +%macro memcpy_sse_64_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0 +%endm + +%macro memcpy_sse_64_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0 +%endm + +%macro memcpy_sse_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0 +%endm + +%macro memcpy_sse_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0 +%endm + + +%macro memcpy_sse_16 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0 +%endm + +%macro memcpy_sse_16_1 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0 +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro memcpy_avx_64 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1 +%endm + +%macro memcpy_avx_64_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1 +%endm + +%macro memcpy_avx_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1 +%endm + +%macro memcpy_avx_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1 +%endm + +%macro memcpy_avx_64_ret 9 + __memcpy_int 
%1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1 +%endm + +%macro memcpy_avx_64_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1 +%endm + +%macro memcpy_avx_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1 +%endm + +%macro memcpy_avx_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1 +%endm + + +%macro memcpy_avx_16 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1 +%endm + +%macro memcpy_avx_16_1 5 + __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1 +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro memcpy_avx2_64 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2 +%endm + +%macro memcpy_avx2_64_1 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2 +%endm + +%macro memcpy_avx2_128 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2 +%endm + +%macro memcpy_avx2_128_1 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2 +%endm + +%macro memcpy_avx2_64_ret 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2 +%endm + +%macro memcpy_avx2_64_1_ret 7 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2 +%endm + +%macro memcpy_avx2_128_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2 +%endm + +%macro memcpy_avx2_128_1_ret 9 + __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2 +%endm + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +%macro __memcpy_int 13 +%define %%DST %1 ; register: pointer to dst (not modified) +%define %%SRC %2 ; register: pointer to src (not modified) +%define %%SIZE %3 ; register: length in bytes (not modified) +%define %%TMP0 %4 ; 64-bit temp GPR (clobbered) +%define %%TMP1 %5 ; 64-bit temp GPR (clobbered) +%define %%XTMP0 %6 ; temp XMM (clobbered) +%define %%XTMP1 %7 ; temp XMM (clobbered) +%define %%XTMP2 %8 ; temp XMM (clobbered) +%define %%XTMP3 %9 ; temp XMM (clobbered) +%define %%NOT0 %10 ; if not 0, then assume size cannot be zero +%define %%MAXSIZE %11 ; 128, 64, etc +%define %%USERET %12 ; if not 0, use "ret" at end +%define %%USEAVX %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2 + +%if (%%USERET != 0) + %define %%DONE ret +%else + %define %%DONE jmp %%end +%endif + +%if (%%USEAVX != 0) + %define %%MOVDQU vmovdqu +%else + %define %%MOVDQU movdqu +%endif + +%if (%%MAXSIZE >= 128) + test %%SIZE, 64 + jz %%lt64 + %if (%%USEAVX >= 2) + %%MOVDQU %%XTMP0, [%%SRC + 0*32] + %%MOVDQU %%XTMP1, [%%SRC + 1*32] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32] + + %%MOVDQU [%%DST + 0*32], %%XTMP0 + %%MOVDQU [%%DST + 1*32], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3 + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + 1*16] + %%MOVDQU %%XTMP2, [%%SRC + 2*16] + %%MOVDQU %%XTMP3, [%%SRC + 3*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + 1*16], %%XTMP1 + %%MOVDQU [%%DST + 2*16], %%XTMP2 + %%MOVDQU [%%DST + 3*16], %%XTMP3 + + %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3 + %endif + %%DONE +%endif + +%if 
(%%MAXSIZE >= 64) +%%lt64: + test %%SIZE, 32 + jz %%lt32 + %if (%%USEAVX >= 2) + %%MOVDQU %%XTMP0, [%%SRC + 0*32] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32] + %%MOVDQU [%%DST + 0*32], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1 + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + 1*16] + %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16] + %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + 1*16], %%XTMP1 + %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3 + %endif + %%DONE +%endif + +%if (%%MAXSIZE >= 32) +%%lt32: + test %%SIZE, 16 + jz %%lt16 + %if (%%USEAVX >= 2) + %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16] + %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0) + %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1) + %else + %%MOVDQU %%XTMP0, [%%SRC + 0*16] + %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16] + %%MOVDQU [%%DST + 0*16], %%XTMP0 + %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1 + %endif + %%DONE +%endif + +%if (%%MAXSIZE >= 16) +%%lt16: + test %%SIZE, 8 + jz %%lt8 + mov %%TMP0, [%%SRC] + mov %%TMP1, [%%SRC + %%SIZE - 8] + mov [%%DST], %%TMP0 + mov [%%DST + %%SIZE - 8], %%TMP1 + %%DONE +%endif + +%if (%%MAXSIZE >= 8) +%%lt8: + test %%SIZE, 4 + jz %%lt4 + mov DWORD(%%TMP0), [%%SRC] + mov DWORD(%%TMP1), [%%SRC + %%SIZE - 4] + mov [%%DST], DWORD(%%TMP0) + mov [%%DST + %%SIZE - 4], DWORD(%%TMP1) + %%DONE +%endif + +%if (%%MAXSIZE >= 4) +%%lt4: + test %%SIZE, 2 + jz %%lt2 + movzx DWORD(%%TMP0), word [%%SRC] + movzx DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1] + mov [%%DST], WORD(%%TMP0) + mov [%%DST + %%SIZE - 1], BYTE(%%TMP1) + %%DONE +%endif + +%%lt2: +%if (%%NOT0 == 0) + test %%SIZE, 1 + jz %%end +%endif + movzx DWORD(%%TMP0), byte [%%SRC] + mov [%%DST], BYTE(%%TMP0) +%%end: +%if (%%USERET != 0) + ret +%endif +%endm + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Utility macro to assist with SIMD shifting +%macro _PSRLDQ 3 +%define %%VEC %1 +%define %%REG %2 +%define %%IMM %3 + +%ifidn %%VEC, SSE + psrldq %%REG, %%IMM +%else + vpsrldq %%REG, %%REG, %%IMM +%endif +%endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; This section defines a series of macros to store small to medium amounts +; of data from SIMD registers to memory, where the size is variable but limited. +; +; The macros are all called as: +; memcpy DST, SRC, SIZE, TMP, IDX +; with the parameters defined as: +; DST : register: pointer to dst (not modified) +; SRC : register: src data (clobbered) +; SIZE : register: length in bytes (not modified) +; TMP : 64-bit temp GPR (clobbered) +; IDX : 64-bit GPR to store dst index/offset (clobbered) +; +; The name indicates the options. The name is of the form: +; simd_store_ +; where is the SIMD instruction type e.g. 
"sse" or "avx" + + +%macro simd_store_sse 5 + __simd_store %1,%2,%3,%4,%5,SSE +%endm + +%macro simd_store_avx 5 + __simd_store %1,%2,%3,%4,%5,AVX +%endm + +%macro simd_store_sse_15 5 + __simd_store %1,%2,%3,%4,%5,SSE,15 +%endm + +%macro simd_store_avx_15 5 + __simd_store %1,%2,%3,%4,%5,AVX,15 +%endm + +%macro __simd_store 6-7 +%define %%DST %1 ; register: pointer to dst (not modified) +%define %%SRC %2 ; register: src data (clobbered) +%define %%SIZE %3 ; register: length in bytes (not modified) +%define %%TMP %4 ; 64-bit temp GPR (clobbered) +%define %%IDX %5 ; 64-bit temp GPR to store dst idx (clobbered) +%define %%SIMDTYPE %6 ; "SSE" or "AVX" +%define %%MAX_LEN %7 ; [optional] maximum length to be stored, default 16 + +%define %%PSRLDQ _PSRLDQ %%SIMDTYPE, + +%ifidn %%SIMDTYPE, SSE + %define %%MOVDQU movdqu + %define %%MOVQ movq +%else + %define %%MOVDQU vmovdqu + %define %%MOVQ vmovq +%endif + +;; determine max byte size for store operation +%if %0 > 6 +%assign max_length_to_store %%MAX_LEN +%else +%assign max_length_to_store 16 +%endif + +%if max_length_to_store > 16 +%error "__simd_store macro invoked with MAX_LEN bigger than 16!" +%endif + + xor %%IDX, %%IDX ; zero idx + +%if max_length_to_store == 16 + test %%SIZE, 16 + jz %%lt16 + %%MOVDQU [%%DST], %%SRC + jmp %%end +%%lt16: +%endif + +%if max_length_to_store >= 8 + test %%SIZE, 8 + jz %%lt8 + %%MOVQ [%%DST + %%IDX], %%SRC + %%PSRLDQ %%SRC, 8 + add %%IDX, 8 +%%lt8: +%endif + + %%MOVQ %%TMP, %%SRC ; use GPR from now on + +%if max_length_to_store >= 4 + test %%SIZE, 4 + jz %%lt4 + mov [%%DST + %%IDX], DWORD(%%TMP) + shr %%TMP, 32 + add %%IDX, 4 +%%lt4: +%endif + + test %%SIZE, 2 + jz %%lt2 + mov [%%DST + %%IDX], WORD(%%TMP) + shr %%TMP, 16 + add %%IDX, 2 +%%lt2: + test %%SIZE, 1 + jz %%end + mov [%%DST + %%IDX], BYTE(%%TMP) +%%end: +%endm + +; This section defines a series of macros to load small to medium amounts +; (from 0 to 16 bytes) of data from memory to SIMD registers, +; where the size is variable but limited. +; +; The macros are all called as: +; simd_load DST, SRC, SIZE +; with the parameters defined as: +; DST : register: destination XMM register +; SRC : register: pointer to src data (not modified) +; SIZE : register: length in bytes (not modified) +; +; The name indicates the options. The name is of the form: +; simd_load__ +; where: +; is either "sse" or "avx" +; is either "15" or "16" and defines largest value of SIZE +; is blank or "_1". 
If "_1" then the min SIZE is 1 (otherwise 0) +; +; For example: +; simd_load_sse_16 : SSE, 0 <= size <= 16 +; simd_load_avx_15_1 : AVX, 1 <= size <= 15 + +%macro simd_load_sse_15_1 3 + __simd_load %1,%2,%3,0,0,SSE +%endm +%macro simd_load_sse_15 3 + __simd_load %1,%2,%3,1,0,SSE +%endm +%macro simd_load_sse_16_1 3 + __simd_load %1,%2,%3,0,1,SSE +%endm +%macro simd_load_sse_16 3 + __simd_load %1,%2,%3,1,1,SSE +%endm + +%macro simd_load_avx_15_1 3 + __simd_load %1,%2,%3,0,0,AVX +%endm +%macro simd_load_avx_15 3 + __simd_load %1,%2,%3,1,0,AVX +%endm +%macro simd_load_avx_16_1 3 + __simd_load %1,%2,%3,0,1,AVX +%endm +%macro simd_load_avx_16 3 + __simd_load %1,%2,%3,1,1,AVX +%endm + +%macro __simd_load 6 +%define %%DST %1 ; [out] destination XMM register +%define %%SRC %2 ; [in] pointer to src data +%define %%SIZE %3 ; [in] length in bytes (0-16 bytes) +%define %%ACCEPT_0 %4 ; 0 = min length = 1, 1 = min length = 0 +%define %%ACCEPT_16 %5 ; 0 = max length = 15 , 1 = max length = 16 +%define %%SIMDTYPE %6 ; "SSE" or "AVX" + +%ifidn %%SIMDTYPE, SSE + %define %%MOVDQU movdqu + %define %%PINSRB pinsrb + %define %%PINSRQ pinsrq + %define %%PXOR pxor +%else + %define %%MOVDQU vmovdqu + %define %%PINSRB vpinsrb + %define %%PINSRQ vpinsrq + %define %%PXOR vpxor +%endif + +%if (%%ACCEPT_16 != 0) + test %%SIZE, 16 + jz %%_skip_16 + %%MOVDQU %%DST, [%%SRC] + jmp %%end_load + +%%_skip_16: +%endif + %%PXOR %%DST, %%DST ; clear XMM register +%if (%%ACCEPT_0 != 0) + or %%SIZE, %%SIZE + je %%end_load +%endif + cmp %%SIZE, 1 + je %%_size_1 + cmp %%SIZE, 2 + je %%_size_2 + cmp %%SIZE, 3 + je %%_size_3 + cmp %%SIZE, 4 + je %%_size_4 + cmp %%SIZE, 5 + je %%_size_5 + cmp %%SIZE, 6 + je %%_size_6 + cmp %%SIZE, 7 + je %%_size_7 + cmp %%SIZE, 8 + je %%_size_8 + cmp %%SIZE, 9 + je %%_size_9 + cmp %%SIZE, 10 + je %%_size_10 + cmp %%SIZE, 11 + je %%_size_11 + cmp %%SIZE, 12 + je %%_size_12 + cmp %%SIZE, 13 + je %%_size_13 + cmp %%SIZE, 14 + je %%_size_14 + +%%_size_15: + %%PINSRB %%DST, [%%SRC + 14], 14 +%%_size_14: + %%PINSRB %%DST, [%%SRC + 13], 13 +%%_size_13: + %%PINSRB %%DST, [%%SRC + 12], 12 +%%_size_12: + %%PINSRB %%DST, [%%SRC + 11], 11 +%%_size_11: + %%PINSRB %%DST, [%%SRC + 10], 10 +%%_size_10: + %%PINSRB %%DST, [%%SRC + 9], 9 +%%_size_9: + %%PINSRB %%DST, [%%SRC + 8], 8 +%%_size_8: + %%PINSRQ %%DST, [%%SRC], 0 + jmp %%end_load +%%_size_7: + %%PINSRB %%DST, [%%SRC + 6], 6 +%%_size_6: + %%PINSRB %%DST, [%%SRC + 5], 5 +%%_size_5: + %%PINSRB %%DST, [%%SRC + 4], 4 +%%_size_4: + %%PINSRB %%DST, [%%SRC + 3], 3 +%%_size_3: + %%PINSRB %%DST, [%%SRC + 2], 2 +%%_size_2: + %%PINSRB %%DST, [%%SRC + 1], 1 +%%_size_1: + %%PINSRB %%DST, [%%SRC + 0], 0 +%%end_load: +%endm +%endif ; ifndef __MEMCPY_ASM__ diff --git a/src/spdk/intel-ipsec-mb/include/noaesni.h b/src/spdk/intel-ipsec-mb/include/noaesni.h new file mode 100644 index 000000000..30d970edf --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/noaesni.h @@ -0,0 +1,65 @@ +/******************************************************************************* + Copyright (c) 2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "intel-ipsec-mb.h" + +#ifndef NOAESNI_H +#define NOAESNI_H + +IMB_DLL_EXPORT void init_mb_mgr_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_nocheck_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT JOB_AES_HMAC *flush_job_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT uint32_t queue_size_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT JOB_AES_HMAC *get_completed_job_sse_no_aesni(MB_MGR *state); +IMB_DLL_EXPORT JOB_AES_HMAC *get_next_job_sse_no_aesni(MB_MGR *state); + +IMB_DLL_EXPORT void +aes_keyexp_128_sse_no_aesni(const void *key, void *enc_exp_keys, + void *dec_exp_keys); +IMB_DLL_EXPORT void +aes_keyexp_192_sse_no_aesni(const void *key, void *enc_exp_keys, + void *dec_exp_keys); +IMB_DLL_EXPORT void +aes_keyexp_256_sse_no_aesni(const void *key, void *enc_exp_keys, + void *dec_exp_keys); +IMB_DLL_EXPORT void +aes_xcbc_expand_key_sse_no_aesni(const void *key, void *k1_exp, void *k2, + void *k3); +IMB_DLL_EXPORT void +aes_keyexp_128_enc_sse_no_aesni(const void *key, void *enc_exp_keys); +IMB_DLL_EXPORT void +aes_keyexp_192_enc_sse_no_aesni(const void *key, void *enc_exp_keys); +IMB_DLL_EXPORT void +aes_keyexp_256_enc_sse_no_aesni(const void *key, void *enc_exp_keys); +IMB_DLL_EXPORT void +aes_cmac_subkey_gen_sse_no_aesni(const void *key_exp, void *key1, void *key2); +IMB_DLL_EXPORT void +aes_cfb_128_one_sse_no_aesni(void *out, const void *in, const void *iv, + const void *keys, uint64_t len); + +#endif /* NOAESNI_H */ diff --git a/src/spdk/intel-ipsec-mb/include/os.asm b/src/spdk/intel-ipsec-mb/include/os.asm new file mode 100644 index 000000000..f54043ed2 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/os.asm @@ -0,0 +1,58 @@ +;; +;; Copyright (c) 2017-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; +%ifndef OS_ASM_FILE +%define OS_ASM_FILE + +%ifndef WIN_ABI +%ifidn __OUTPUT_FORMAT__, win64 +%define WIN_ABI +%endif +%endif + +%ifndef LINUX +%ifidn __OUTPUT_FORMAT__, elf64 +%define LINUX +%endif +%endif + +%ifdef LINUX +;;; macro to declare global symbols +;;; - name : symbol name +;;; - type : funtion or data +;;; - scope : internal, private, default +%define MKGLOBAL(name,type,scope) global name %+ : %+ type scope +%endif ; LINUX + +%ifdef WIN_ABI +;;; macro to declare global symbols +;;; - name : symbol name +;;; - type : funtion or data +;;; - scope : internal, private, default (ignored in win64 coff format) +%define MKGLOBAL(name,type,scope) global name +%endif ; WIN_ABI + +%endif ; OS_ASM_FILE diff --git a/src/spdk/intel-ipsec-mb/include/reg_sizes.asm b/src/spdk/intel-ipsec-mb/include/reg_sizes.asm new file mode 100644 index 000000000..c9f9f8cd2 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/reg_sizes.asm @@ -0,0 +1,300 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +; define d and w variants for registers + +%ifndef _REG_SIZES_ASM_ +%define _REG_SIZES_ASM_ + +%define raxd eax +%define raxw ax +%define raxb al + +%define rbxd ebx +%define rbxw bx +%define rbxb bl + +%define rcxd ecx +%define rcxw cx +%define rcxb cl + +%define rdxd edx +%define rdxw dx +%define rdxb dl + +%define rsid esi +%define rsiw si +%define rsib sil + +%define rdid edi +%define rdiw di +%define rdib dil + +%define rbpd ebp +%define rbpw bp +%define rbpb bpl + +%define zmm0x xmm0 +%define zmm1x xmm1 +%define zmm2x xmm2 +%define zmm3x xmm3 +%define zmm4x xmm4 +%define zmm5x xmm5 +%define zmm6x xmm6 +%define zmm7x xmm7 +%define zmm8x xmm8 +%define zmm9x xmm9 +%define zmm10x xmm10 +%define zmm11x xmm11 +%define zmm12x xmm12 +%define zmm13x xmm13 +%define zmm14x xmm14 +%define zmm15x xmm15 +%define zmm16x xmm16 +%define zmm17x xmm17 +%define zmm18x xmm18 +%define zmm19x xmm19 +%define zmm20x xmm20 +%define zmm21x xmm21 +%define zmm22x xmm22 +%define zmm23x xmm23 +%define zmm24x xmm24 +%define zmm25x xmm25 +%define zmm26x xmm26 +%define zmm27x xmm27 +%define zmm28x xmm28 +%define zmm29x xmm29 +%define zmm30x xmm30 +%define zmm31x xmm31 + +%define ymm0x xmm0 +%define ymm1x xmm1 +%define ymm2x xmm2 +%define ymm3x xmm3 +%define ymm4x xmm4 +%define ymm5x xmm5 +%define ymm6x xmm6 +%define ymm7x xmm7 +%define ymm8x xmm8 +%define ymm9x xmm9 +%define ymm10x xmm10 +%define ymm11x xmm11 +%define ymm12x xmm12 +%define ymm13x xmm13 +%define ymm14x xmm14 +%define ymm15x xmm15 +%define ymm16x xmm16 +%define ymm17x xmm17 +%define ymm18x xmm18 +%define ymm19x xmm19 +%define ymm20x xmm20 +%define ymm21x xmm21 +%define ymm22x xmm22 +%define ymm23x xmm23 +%define ymm24x xmm24 +%define ymm25x xmm25 +%define ymm26x xmm26 +%define ymm27x xmm27 +%define ymm28x xmm28 +%define ymm29x xmm29 +%define ymm30x xmm30 +%define ymm31x xmm31 + +%define xmm0x xmm0 +%define xmm1x xmm1 +%define xmm2x xmm2 +%define xmm3x xmm3 +%define xmm4x xmm4 +%define xmm5x xmm5 +%define xmm6x xmm6 +%define xmm7x xmm7 +%define xmm8x xmm8 +%define xmm9x xmm9 +%define xmm10x xmm10 +%define xmm11x xmm11 +%define xmm12x xmm12 +%define xmm13x xmm13 +%define xmm14x xmm14 +%define xmm15x xmm15 +%define xmm16x xmm16 +%define xmm17x xmm17 +%define xmm18x xmm18 +%define xmm19x xmm19 +%define xmm20x xmm20 +%define xmm21x xmm21 +%define xmm22x xmm22 +%define xmm23x xmm23 +%define xmm24x xmm24 +%define xmm25x xmm25 +%define xmm26x xmm26 +%define xmm27x xmm27 +%define xmm28x xmm28 +%define xmm29x xmm29 +%define xmm30x xmm30 +%define xmm31x xmm31 + +%define zmm0y ymm0 +%define zmm1y ymm1 +%define zmm2y ymm2 +%define zmm3y ymm3 +%define zmm4y ymm4 +%define zmm5y ymm5 +%define zmm6y ymm6 +%define zmm7y ymm7 +%define zmm8y ymm8 +%define zmm9y ymm9 +%define zmm10y ymm10 +%define zmm11y ymm11 +%define zmm12y ymm12 +%define zmm13y ymm13 +%define zmm14y ymm14 +%define zmm15y ymm15 +%define zmm16y ymm16 +%define zmm17y ymm17 +%define zmm18y ymm18 +%define zmm19y ymm19 +%define zmm20y ymm20 +%define zmm21y ymm21 +%define zmm22y ymm22 +%define zmm23y ymm23 +%define zmm24y ymm24 +%define zmm25y ymm25 +%define zmm26y ymm26 +%define zmm27y ymm27 +%define zmm28y ymm28 +%define zmm29y ymm29 +%define zmm30y ymm30 +%define zmm31y ymm31 + +%define xmm0y ymm0 +%define xmm1y ymm1 +%define xmm2y ymm2 +%define xmm3y ymm3 +%define xmm4y ymm4 +%define xmm5y ymm5 +%define xmm6y ymm6 +%define xmm7y ymm7 +%define xmm8y ymm8 +%define xmm9y ymm9 +%define xmm10y ymm10 +%define xmm11y ymm11 +%define xmm12y ymm12 +%define xmm13y ymm13 +%define xmm14y ymm14 +%define xmm15y 
ymm15 +%define xmm16y ymm16 +%define xmm17y ymm17 +%define xmm18y ymm18 +%define xmm19y ymm19 +%define xmm20y ymm20 +%define xmm21y ymm21 +%define xmm22y ymm22 +%define xmm23y ymm23 +%define xmm24y ymm24 +%define xmm25y ymm25 +%define xmm26y ymm26 +%define xmm27y ymm27 +%define xmm28y ymm28 +%define xmm29y ymm29 +%define xmm30y ymm30 +%define xmm31y ymm31 + +%define xmm0z zmm0 +%define xmm1z zmm1 +%define xmm2z zmm2 +%define xmm3z zmm3 +%define xmm4z zmm4 +%define xmm5z zmm5 +%define xmm6z zmm6 +%define xmm7z zmm7 +%define xmm8z zmm8 +%define xmm9z zmm9 +%define xmm10z zmm10 +%define xmm11z zmm11 +%define xmm12z zmm12 +%define xmm13z zmm13 +%define xmm14z zmm14 +%define xmm15z zmm15 +%define xmm16z zmm16 +%define xmm17z zmm17 +%define xmm18z zmm18 +%define xmm19z zmm19 +%define xmm20z zmm20 +%define xmm21z zmm21 +%define xmm22z zmm22 +%define xmm23z zmm23 +%define xmm24z zmm24 +%define xmm25z zmm25 +%define xmm26z zmm26 +%define xmm27z zmm27 +%define xmm28z zmm28 +%define xmm29z zmm29 +%define xmm30z zmm30 +%define xmm31z zmm31 + +%define ymm0z zmm0 +%define ymm1z zmm1 +%define ymm2z zmm2 +%define ymm3z zmm3 +%define ymm4z zmm4 +%define ymm5z zmm5 +%define ymm6z zmm6 +%define ymm7z zmm7 +%define ymm8z zmm8 +%define ymm9z zmm9 +%define ymm10z zmm10 +%define ymm11z zmm11 +%define ymm12z zmm12 +%define ymm13z zmm13 +%define ymm14z zmm14 +%define ymm15z zmm15 +%define ymm16z zmm16 +%define ymm17z zmm17 +%define ymm18z zmm18 +%define ymm19z zmm19 +%define ymm20z zmm20 +%define ymm21z zmm21 +%define ymm22z zmm22 +%define ymm23z zmm23 +%define ymm24z zmm24 +%define ymm25z zmm25 +%define ymm26z zmm26 +%define ymm27z zmm27 +%define ymm28z zmm28 +%define ymm29z zmm29 +%define ymm30z zmm30 +%define ymm31z zmm31 + +%define DWORD(reg) reg %+ d +%define WORD(reg) reg %+ w +%define BYTE(reg) reg %+ b + +%define XWORD(reg) reg %+ x +%define YWORD(reg) reg %+ y +%define ZWORD(reg) reg %+ z + +%endif ;; _REG_SIZES_ASM_ diff --git a/src/spdk/intel-ipsec-mb/include/save_xmms.asm b/src/spdk/intel-ipsec-mb/include/save_xmms.asm new file mode 100644 index 000000000..c9fd67eb5 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/save_xmms.asm @@ -0,0 +1,132 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" + +%ifdef LINUX +%define ARG1 rdi +%else +%define ARG1 rcx +%endif + +section .text +; void save_xmms(UINT128 array[10]) +MKGLOBAL(save_xmms,function,internal) +save_xmms: + movdqa [ARG1 + 0*16], xmm6 + movdqa [ARG1 + 1*16], xmm7 + movdqa [ARG1 + 2*16], xmm8 + movdqa [ARG1 + 3*16], xmm9 + movdqa [ARG1 + 4*16], xmm10 + movdqa [ARG1 + 5*16], xmm11 + movdqa [ARG1 + 6*16], xmm12 + movdqa [ARG1 + 7*16], xmm13 + movdqa [ARG1 + 8*16], xmm14 + movdqa [ARG1 + 9*16], xmm15 + ret + + +; void restore_xmms(UINT128 array[10]) +MKGLOBAL(restore_xmms,function,internal) +restore_xmms: + movdqa xmm6, [ARG1 + 0*16] + movdqa xmm7, [ARG1 + 1*16] + movdqa xmm8, [ARG1 + 2*16] + movdqa xmm9, [ARG1 + 3*16] + movdqa xmm10, [ARG1 + 4*16] + movdqa xmm11, [ARG1 + 5*16] + movdqa xmm12, [ARG1 + 6*16] + movdqa xmm13, [ARG1 + 7*16] + movdqa xmm14, [ARG1 + 8*16] + movdqa xmm15, [ARG1 + 9*16] +%ifdef SAFE_DATA + ;; Clear potential sensitive data stored in stack + pxor xmm0, xmm0 + movdqa [ARG1 + 0 * 16], xmm0 + movdqa [ARG1 + 1 * 16], xmm0 + movdqa [ARG1 + 2 * 16], xmm0 + movdqa [ARG1 + 3 * 16], xmm0 + movdqa [ARG1 + 4 * 16], xmm0 + movdqa [ARG1 + 5 * 16], xmm0 + movdqa [ARG1 + 6 * 16], xmm0 + movdqa [ARG1 + 7 * 16], xmm0 + movdqa [ARG1 + 8 * 16], xmm0 + movdqa [ARG1 + 9 * 16], xmm0 +%endif + + ret + + + ; void save_xmms_avx(UINT128 array[10]) +MKGLOBAL(save_xmms_avx,function,internal) +save_xmms_avx: + vmovdqa [ARG1 + 0*16], xmm6 + vmovdqa [ARG1 + 1*16], xmm7 + vmovdqa [ARG1 + 2*16], xmm8 + vmovdqa [ARG1 + 3*16], xmm9 + vmovdqa [ARG1 + 4*16], xmm10 + vmovdqa [ARG1 + 5*16], xmm11 + vmovdqa [ARG1 + 6*16], xmm12 + vmovdqa [ARG1 + 7*16], xmm13 + vmovdqa [ARG1 + 8*16], xmm14 + vmovdqa [ARG1 + 9*16], xmm15 + ret + + +; void restore_xmms_avx(UINT128 array[10]) +MKGLOBAL(restore_xmms_avx,function,internal) +restore_xmms_avx: + vmovdqa xmm6, [ARG1 + 0*16] + vmovdqa xmm7, [ARG1 + 1*16] + vmovdqa xmm8, [ARG1 + 2*16] + vmovdqa xmm9, [ARG1 + 3*16] + vmovdqa xmm10, [ARG1 + 4*16] + vmovdqa xmm11, [ARG1 + 5*16] + vmovdqa xmm12, [ARG1 + 6*16] + vmovdqa xmm13, [ARG1 + 7*16] + vmovdqa xmm14, [ARG1 + 8*16] + vmovdqa xmm15, [ARG1 + 9*16] + +%ifdef SAFE_DATA + ;; Clear potential sensitive data stored in stack + vpxor xmm0, xmm0 + vmovdqa [ARG1 + 0 * 16], xmm0 + vmovdqa [ARG1 + 1 * 16], xmm0 + vmovdqa [ARG1 + 2 * 16], xmm0 + vmovdqa [ARG1 + 3 * 16], xmm0 + vmovdqa [ARG1 + 4 * 16], xmm0 + vmovdqa [ARG1 + 5 * 16], xmm0 + vmovdqa [ARG1 + 6 * 16], xmm0 + vmovdqa [ARG1 + 7 * 16], xmm0 + vmovdqa [ARG1 + 8 * 16], xmm0 + vmovdqa [ARG1 + 9 * 16], xmm0 +%endif + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/include/save_xmms.h b/src/spdk/intel-ipsec-mb/include/save_xmms.h new file mode 100644 index 000000000..e711958da --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/save_xmms.h @@ -0,0 +1,39 @@ +/******************************************************************************* + Copyright (c) 2012-2018, Intel 
Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef SAVE_XMMS_H +#define SAVE_XMMS_H + +#include "intel-ipsec-mb.h" + +void save_xmms(uint128_t array[10]); +void restore_xmms(uint128_t array[10]); + +void save_xmms_avx(uint128_t array[10]); +void restore_xmms_avx(uint128_t array[10]); + +#endif /* SAVE_XMMS_H */ diff --git a/src/spdk/intel-ipsec-mb/include/snow3g.h b/src/spdk/intel-ipsec-mb/include/snow3g.h new file mode 100644 index 000000000..520a4b41f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/snow3g.h @@ -0,0 +1,511 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef _SNOW3G_H_ +#define _SNOW3G_H_ + +/******************************************************************************* + * SSE + ******************************************************************************/ +void +snow3g_f8_1_buffer_bit_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void +snow3g_f8_2_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_sse(const snow3g_key_schedule_t * const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_sse(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_sse(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void 
*pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_sse(void); + +int +snow3g_init_key_sched_sse(const void *pKey, snow3g_key_schedule_t *pCtx); + +/******************************************************************************* + * SSE NO-AESNI + ******************************************************************************/ +void +snow3g_f8_1_buffer_bit_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void +snow3g_f8_2_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_sse_no_aesni(const snow3g_key_schedule_t * const + pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_sse_no_aesni(const snow3g_key_schedule_t * const + pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_sse_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_sse_no_aesni(void); + +int +snow3g_init_key_sched_sse_no_aesni(const void *pKey, + snow3g_key_schedule_t *pCtx); + +/******************************************************************************* + * AVX + 
******************************************************************************/ +void +snow3g_f8_1_buffer_bit_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void +snow3g_f8_2_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_avx(const snow3g_key_schedule_t * const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_avx(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_avx(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_avx(void); + +int +snow3g_init_key_sched_avx(const void *pKey, snow3g_key_schedule_t *pCtx); + +/******************************************************************************* + * AVX2 + ******************************************************************************/ + +void +snow3g_f8_1_buffer_bit_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void 
+snow3g_f8_2_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_avx2(const snow3g_key_schedule_t * const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_avx2(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_avx2(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_avx2(void); + +int +snow3g_init_key_sched_avx2(const void *pKey, snow3g_key_schedule_t *pCtx); + +#endif /* _SNOW3G_H_ */ diff --git a/src/spdk/intel-ipsec-mb/include/snow3g_common.h b/src/spdk/intel-ipsec-mb/include/snow3g_common.h new file mode 100644 index 000000000..d7c7e63c1 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/snow3g_common.h @@ -0,0 +1,2840 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/*----------------------------------------------------------------------- + * + * An implementation of SNOW 3G, the core algorithm for the + * 3GPP Confidentiality and Integrity algorithms. + * + *-----------------------------------------------------------------------*/ + +#ifndef SNOW3G_COMMON_H +#define SNOW3G_COMMON_H + +#include +#include +#include + +#include "intel-ipsec-mb.h" +#include "include/snow3g.h" +#include "include/snow3g_internal.h" +#include "clear_regs_mem.h" + +#define CLEAR_MEM clear_mem +#define CLEAR_VAR clear_var + +/* ------------------------------------------------------------------- + * LFSR array shift by 1 position, 4 packets at a time + * ------------------------------------------------------------------ */ + +#ifdef AVX2 +/* LFSR array shift */ +static inline void ShiftLFSR_8(snow3gKeyState8_t *pCtx) +{ + pCtx->iLFSR_X = (pCtx->iLFSR_X + 1) & 15; +} +#endif /* AVX2 */ + +/* LFSR array shift */ +static inline void ShiftLFSR_4(snow3gKeyState4_t *pCtx) +{ + pCtx->iLFSR_X = (pCtx->iLFSR_X + 1) % 16; +} + +/*--------------------------------------------------------- + * @description + * Gf2 modular multiplication/reduction + * + *---------------------------------------------------------*/ +static inline uint64_t multiply_and_reduce64(uint64_t a, uint64_t b) +{ + uint64_t msk; + uint64_t res = 0; + uint64_t i = 64; + + while (i--) { + msk = ((int64_t)res >> 63) & 0x1b; + res <<= 1; + res ^= msk; + msk = ((int64_t)b >> 63) & a; + b <<= 1; + res ^= msk; + } + return res; +} + +#ifdef AVX2 +/* ------------------------------------------------------------------- + * ClockLFSR sub-function as defined in snow3g standard + * S = LFSR[2] + * ^ table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] & 0xff] + * ------------------------------------------------------------------ */ +static void C0_C11_8(__m256i *S, const __m256i *L0, const __m256i *L11) +{ + __m256i mask, Sx, B11, B0, offset; + + offset = _mm256_set1_epi32(3); + mask = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008, 0xF0F0F00C, + 0xF0F0F000, 0xF0F0F004, 0xF0F0F008, + 0xF0F0F00C); + B11 = _mm256_shuffle_epi8(*L11, mask); + *S = _mm256_i32gather_epi32(snow3g_table_A_div, B11, 4); + + mask = _mm256_add_epi32(mask, offset); + B0 = _mm256_shuffle_epi8(*L0, mask); + Sx = _mm256_i32gather_epi32(snow3g_table_A_mul, B0, 4); + *S = _mm256_xor_si256(*S, Sx); +} +#endif /* AVX2 */ + +/* ------------------------------------------------------------------- + * ClockLFSR 
sub-function as defined in snow3g standard + * S = LFSR[2] + * ^ table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] & 0xff] + * ------------------------------------------------------------------ */ +static inline void C0_C11_4(uint32_t *S, const __m128i *L0, const __m128i *L11) +{ + unsigned B11[4], B0[4]; + + B11[0] = _mm_extract_epi8(*L11, 0); + B11[1] = _mm_extract_epi8(*L11, 4); + B11[2] = _mm_extract_epi8(*L11, 8); + B11[3] = _mm_extract_epi8(*L11, 12); + + S[0] = snow3g_table_A_div[B11[0]]; + S[1] = snow3g_table_A_div[B11[1]]; + S[2] = snow3g_table_A_div[B11[2]]; + S[3] = snow3g_table_A_div[B11[3]]; + + B0[0] = _mm_extract_epi8(*L0, 3); + B0[1] = _mm_extract_epi8(*L0, 7); + B0[2] = _mm_extract_epi8(*L0, 11); + B0[3] = _mm_extract_epi8(*L0, 15); + + S[0] ^= snow3g_table_A_mul[B0[0]]; + S[1] ^= snow3g_table_A_mul[B0[1]]; + S[2] ^= snow3g_table_A_mul[B0[2]]; + S[3] ^= snow3g_table_A_mul[B0[3]]; +} + +#ifdef AVX2 +/* ------------------------------------------------------------------- + * ClockLFSR function as defined in snow3g standard + * S = table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] >> 24] + * ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8 + * ------------------------------------------------------------------ */ +static inline void ClockLFSR_8(snow3gKeyState8_t *pCtx) +{ + __m256i X2; + __m256i S, T, U; + + U = pCtx->LFSR_X[pCtx->iLFSR_X]; + S = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) % 16]; + + C0_C11_8(&X2, &U, &S); + + T = _mm256_slli_epi32(U, 8); + S = _mm256_srli_epi32(S, 8); + U = _mm256_xor_si256(T, pCtx->LFSR_X[(pCtx->iLFSR_X + 2) % 16]); + + ShiftLFSR_8(pCtx); + + S = _mm256_xor_si256(S, U); + S = _mm256_xor_si256(S, X2); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = S; +} +#endif /* AVX2 */ + +/* ------------------------------------------------------------------- + * ClockLFSR function as defined in snow3g standard + * S = table_Alpha_div[LFSR[11] & 0xff] + * ^ table_Alpha_mul[LFSR[0] >> 24] + * ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8 + * ------------------------------------------------------------------ */ +static inline void ClockLFSR_4(snow3gKeyState4_t *pCtx) +{ + uint32_t X2[4]; + __m128i S, T, U; + + U = pCtx->LFSR_X[pCtx->iLFSR_X]; + S = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) % 16]; + C0_C11_4(X2, &U, &S); + + T = _mm_slli_epi32(U, 8); + S = _mm_srli_epi32(S, 8); + U = _mm_xor_si128(T, pCtx->LFSR_X[(pCtx->iLFSR_X + 2) % 16]); + ShiftLFSR_4(pCtx); + + /* (SSE4) */ + T = _mm_insert_epi32(T, X2[0], 0); + T = _mm_insert_epi32(T, X2[1], 1); + T = _mm_insert_epi32(T, X2[2], 2); + T = _mm_insert_epi32(T, X2[3], 3); + S = _mm_xor_si128(S, U); + S = _mm_xor_si128(S, T); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = S; +} + +#ifdef AVX2 +/* ------------------------------------------------------------------- + * ClockFSM function as defined in snow3g standard + * 8 packets at a time + * ------------------------------------------------------------------ */ +static inline void ClockFSM_8(snow3gKeyState8_t *pCtx, __m256i *data) +{ + __m256i F, R, S2T0, S2T1, S2T2, S2T3, S1T0, S1T1, S1T2, S1T3; + __m256i w3, w2, w1, w0, offset, mask; + + F = _mm256_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15)%16], + pCtx->FSM_X[0]); + R = _mm256_xor_si256(pCtx->LFSR_X[(pCtx->iLFSR_X + 5)%16], + pCtx->FSM_X[2]); + *data = _mm256_xor_si256(F, pCtx->FSM_X[1]); + R = _mm256_add_epi32(R, pCtx->FSM_X[1]); + offset = _mm256_set1_epi32(0x1); + + F = pCtx->FSM_X[1]; + w3 = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008, + 0xF0F0F00C, 0xF0F0F000, 0xF0F0F004, + 0xF0F0F008, 0xF0F0F00C); + mask 
= _mm256_shuffle_epi8(F,w3); + S2T0 = _mm256_i32gather_epi32(S2_T0,mask,4); + + w2 = _mm256_add_epi32(w3,offset); + mask = _mm256_shuffle_epi8(F,w2); + S2T1 = _mm256_i32gather_epi32(S2_T1,mask,4); + + w1 = _mm256_add_epi32(w2,offset); + mask = _mm256_shuffle_epi8(pCtx->FSM_X[1],w1); + S2T2 = _mm256_i32gather_epi32(S2_T2,mask,4); + + w0 = _mm256_add_epi32(w1,offset); + mask = _mm256_shuffle_epi8(F,w0); + S2T3 = _mm256_i32gather_epi32(S2_T3,mask,4); + + + F = pCtx->FSM_X[0]; + w3 = _mm256_setr_epi32(0xF0F0F000, 0xF0F0F004, 0xF0F0F008, + 0xF0F0F00C, 0xF0F0F010, 0xF0F0F014, + 0xF0F0F018, 0xF0F0F01C); + mask = _mm256_shuffle_epi8(F,w3); + S1T0 = _mm256_i32gather_epi32(S1_T0,mask,4); + + w2 = _mm256_add_epi32(w3,offset); + mask = _mm256_shuffle_epi8(F,w2); + S1T1 = _mm256_i32gather_epi32(S1_T1,mask,4); + + w1 = _mm256_add_epi32(w2,offset); + mask = _mm256_shuffle_epi8(F,w1); + S1T2 = _mm256_i32gather_epi32(S1_T2,mask,4); + + w0 = _mm256_add_epi32(w1,offset); + mask = _mm256_shuffle_epi8(F,w0); + S1T3 = _mm256_i32gather_epi32(S1_T3,mask,4); + + S2T0 = _mm256_xor_si256(S2T0, S2T1); + S2T2 = _mm256_xor_si256(S2T2, S2T3); + S2T0 = _mm256_xor_si256(S2T0, S2T2); + + S1T0 = _mm256_xor_si256(S1T0, S1T1); + S1T2 = _mm256_xor_si256(S1T2, S1T3); + S1T0 = _mm256_xor_si256(S1T0, S1T2); + + + pCtx->FSM_X[2] = S2T0; + pCtx->FSM_X[1] = S1T0; + pCtx->FSM_X[2] = S2T0; + pCtx->FSM_X[0] = R; +} + +#endif /* AVX2 */ + +/* ------------------------------------------------------------------- + * ClockFSM function as defined in snow3g standard + * 4 packets at a time + * ------------------------------------------------------------------ */ +static inline void ClockFSM_4(snow3gKeyState4_t *pCtx, __m128i *data) +{ + __m128i F, R; +#ifdef _WIN32 +#pragma warning(push) +#pragma warning(disable:4556) +#endif +#if defined (NO_AESNI) || defined (SAFE_LOOKUP) + uint32_t L = 0; +#endif + uint32_t K = 0; + + F = _mm_add_epi32(pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], + pCtx->FSM_X[0]); + R = _mm_xor_si128(pCtx->LFSR_X[(pCtx->iLFSR_X + 5) % 16], + pCtx->FSM_X[2]); + *data = _mm_xor_si128(F, pCtx->FSM_X[1]); + R = _mm_add_epi32(R, pCtx->FSM_X[1]); +#if defined (NO_AESNI) || defined (SAFE_LOOKUP) + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 0); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 1); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 2); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, L, 3); +#else + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 0); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 1); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 2); + S1_S2_4(pCtx->FSM_X[2], pCtx->FSM_X[1], pCtx->FSM_X[0], K, 3); +#endif /* NO_AESNI */ + pCtx->FSM_X[0] = R; + +#ifdef _WIN32 +#pragma warning(pop) +#endif +} + +/** +******************************************************************************* +* @description +* This function generates 4 bytes of keystream 1 buffer at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_1_4(snow3gKeyState1_t *pCtx, + uint32_t *pKeyStream) +{ + uint32_t F; + + ClockFSM_1(pCtx, &F); + *pKeyStream = F ^ pCtx->LFSR_S[0]; + ClockLFSR_1(pCtx); +} + +/** +******************************************************************************* +* @description +* This function generates 8 bytes of keystream 1 
buffer at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_1_8(snow3gKeyState1_t *pCtx, + uint64_t *pKeyStream) +{ + uint64_t F; + uint32_t FSM4; + uint32_t V0, V1; + uint32_t F0, F1; + uint32_t R0, R1; + uint32_t L0, L1, L11, L12; + + /* Merged clock FSM + clock LFSR + clock FSM + clockLFSR + * in order to avoid redundancies in function processing + * and less instruction immediate dependencies + */ + L0 = pCtx->LFSR_S[0]; + V0 = pCtx->LFSR_S[2]; + L1 = pCtx->LFSR_S[1]; + V1 = pCtx->LFSR_S[3]; + R1 = pCtx->FSM_R1; + L11 = pCtx->LFSR_S[11]; + L12 = pCtx->LFSR_S[12]; + V0 ^= snow3g_table_A_mul[L0 >> 24]; + V1 ^= snow3g_table_A_mul[L1 >> 24]; + V0 ^= snow3g_table_A_div[L11 & 0xff]; + V1 ^= snow3g_table_A_div[L12 & 0xff]; + V0 ^= L0 << 8; + V1 ^= L1 << 8; + V0 ^= L11 >> 8; + V1 ^= L12 >> 8; + F0 = pCtx->LFSR_S[15] + R1; + F0 ^= L0; + F0 ^= pCtx->FSM_R2; + R0 = pCtx->FSM_R3 ^ pCtx->LFSR_S[5]; + R0 += pCtx->FSM_R2; + S1_S2_S3_1(pCtx->FSM_R3, pCtx->FSM_R2, R1, FSM4, R0); + R1 = pCtx->FSM_R3 ^ pCtx->LFSR_S[6]; + F1 = V0 + R0; + F1 ^= L1; + F1 ^= pCtx->FSM_R2; + R1 += pCtx->FSM_R2; + pCtx->FSM_R3 = Snow3g_S2(pCtx->FSM_R2); + pCtx->FSM_R2 = FSM4; + pCtx->FSM_R1 = R1; + + /* Shift LFSR twice */ + ShiftTwiceLFSR_1(pCtx); + + /* keystream mode LFSR update */ + pCtx->LFSR_S[14] = V0; + pCtx->LFSR_S[15] = V1; + + F = F0; + F <<= 32; + F |= (uint64_t)F1; + + *pKeyStream = F; +} + +#ifdef AVX2 +/** +******************************************************************************* +* @description +* This function generates 8 bytes of keystream 8 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_8_8(snow3gKeyState8_t *pCtx, + __m256i *pKeyStreamLo, + __m256i *pKeyStreamHi) +{ + __m256i H, L; + + /* first set of 4 bytes */ + ClockFSM_8(pCtx, &L); + L = _mm256_xor_si256(L, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_8(pCtx); + + /* second set of 4 bytes */ + ClockFSM_8(pCtx, &H); + H = _mm256_xor_si256(H, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_8(pCtx); + + /* merge the 2 sets */ + *pKeyStreamLo = _mm256_unpacklo_epi32(H, L); + *pKeyStreamHi = _mm256_unpackhi_epi32(H, L); +} + +/** +******************************************************************************* +* @description +* This function generates 4 bytes of keystream 8 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_8_4(snow3gKeyState8_t *pCtx, + __m256i *pKeyStream) +{ + __m256i F; + + ClockFSM_8(pCtx, &F); + *pKeyStream = _mm256_xor_si256(F, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_8(pCtx); +} + +/** +***************************************************************************** +* @description +* This function generates 32 bytes of keystream 8 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Array of generated keystreams +* +******************************************************************************/ +static inline void 
snow3g_keystream_8_32(snow3gKeyState8_t *pCtx, + __m256i *pKeyStream) +{ + + __m256i temp[8]; + + /** produces the next 4 bytes for each buffer */ + int i; + + /** Byte reversal on each KS */ + __m256i mask1 = {0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL, + 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL}; + /** Reversal, shifted 4 bytes right */ + __m256i mask2 = {0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL, + 0x0405060708090a0bULL, 0x0c0d0e0f00010203ULL}; + /** Reversal, shifted 8 bytes right */ + __m256i mask3 = {0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL, + 0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL}; + /** Reversal, shifted 12 bytes right */ + __m256i mask4 = {0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL, + 0x0c0d0e0f00010203ULL, 0x0405060708090a0bULL}; + + snow3g_keystream_8_4(pCtx, &temp[0]); + snow3g_keystream_8_4(pCtx, &temp[1]); + snow3g_keystream_8_4(pCtx, &temp[2]); + snow3g_keystream_8_4(pCtx, &temp[3]); + snow3g_keystream_8_4(pCtx, &temp[4]); + snow3g_keystream_8_4(pCtx, &temp[5]); + snow3g_keystream_8_4(pCtx, &temp[6]); + snow3g_keystream_8_4(pCtx, &temp[7]); + + temp[0] = _mm256_shuffle_epi8(temp[0], mask1); + temp[1] = _mm256_shuffle_epi8(temp[1], mask2); + temp[2] = _mm256_shuffle_epi8(temp[2], mask3); + temp[3] = _mm256_shuffle_epi8(temp[3], mask4); + temp[4] = _mm256_shuffle_epi8(temp[4], mask1); + temp[5] = _mm256_shuffle_epi8(temp[5], mask2); + temp[6] = _mm256_shuffle_epi8(temp[6], mask3); + temp[7] = _mm256_shuffle_epi8(temp[7], mask4); + + __m256i blended[8]; + /* blends KS together: 128bit slice consists + of 4 32-bit words for one packet */ + blended[0] = _mm256_blend_epi32(temp[0], temp[1], 0xaa); + blended[1] = _mm256_blend_epi32(temp[0], temp[1], 0x55); + blended[2] = _mm256_blend_epi32(temp[2], temp[3], 0xaa); + blended[3] = _mm256_blend_epi32(temp[2], temp[3], 0x55); + blended[4] = _mm256_blend_epi32(temp[4], temp[5], 0xaa); + blended[5] = _mm256_blend_epi32(temp[4], temp[5], 0x55); + blended[6] = _mm256_blend_epi32(temp[6], temp[7], 0xaa); + blended[7] = _mm256_blend_epi32(temp[6], temp[7], 0x55); + + temp[0] = _mm256_blend_epi32(blended[0], blended[2], 0xcc); + temp[1] = _mm256_blend_epi32(blended[1], blended[3], 0x99); + temp[2] = _mm256_blend_epi32(blended[0], blended[2], 0x33); + temp[3] = _mm256_blend_epi32(blended[1], blended[3], 0x66); + temp[4] = _mm256_blend_epi32(blended[4], blended[6], 0xcc); + temp[5] = _mm256_blend_epi32(blended[5], blended[7], 0x99); + temp[6] = _mm256_blend_epi32(blended[4], blended[6], 0x33); + temp[7] = _mm256_blend_epi32(blended[5], blended[7], 0x66); + + /** sorts 32 bit words back into order */ + blended[0] = temp[0]; + blended[1] = _mm256_shuffle_epi32(temp[1], 0x39); + blended[2] = _mm256_shuffle_epi32(temp[2], 0x4e); + blended[3] = _mm256_shuffle_epi32(temp[3], 0x93); + blended[4] = temp[4]; + blended[5] = _mm256_shuffle_epi32(temp[5], 0x39); + blended[6] = _mm256_shuffle_epi32(temp[6], 0x4e); + blended[7] = _mm256_shuffle_epi32(temp[7], 0x93); + + for (i = 0; i < 4; i++) { + pKeyStream[i] = _mm256_permute2x128_si256(blended[i], + blended[i + 4], 0x20); + pKeyStream[i + 4] = _mm256_permute2x128_si256( + blended[i], blended[i + 4], 0x31); + } +} + +#endif /* AVX2 */ + +/** +******************************************************************************* +* @description +* This function generates 4 bytes of keystream 4 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStream Pointer to generated keystream +* 
+*******************************************************************************/ +static inline void snow3g_keystream_4_4(snow3gKeyState4_t *pCtx, + __m128i *pKeyStream) +{ + __m128i F; + + ClockFSM_4(pCtx, &F); + *pKeyStream = _mm_xor_si128(F, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_4(pCtx); +} + +/** +******************************************************************************* +* @description +* This function generates 8 bytes of keystream 4 buffers at a time +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in/out] pKeyStreamLo Pointer to lower end of generated keystream +* @param[in/out] pKeyStreamHi Pointer to higer end of generated keystream +* +*******************************************************************************/ +static inline void snow3g_keystream_4_8(snow3gKeyState4_t *pCtx, + __m128i *pKeyStreamLo, + __m128i *pKeyStreamHi) +{ + __m128i H, L; + + /* first set of 4 bytes */ + ClockFSM_4(pCtx, &L); + L = _mm_xor_si128(L, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_4(pCtx); + + /* second set of 4 bytes */ + ClockFSM_4(pCtx, &H); + H = _mm_xor_si128(H, pCtx->LFSR_X[pCtx->iLFSR_X]); + ClockLFSR_4(pCtx); + + /* merge the 2 sets */ + *pKeyStreamLo = _mm_unpacklo_epi32(H, L); + *pKeyStreamHi = _mm_unpackhi_epi32(H, L); +} + +/** +******************************************************************************* +* @description +* This function initializes the key schedule for 4 buffers for snow3g f8/f9. +* +* @param [in] pCtx Context where the scheduled keys are stored +* @param [in] pKeySched Key schedule +* @param [in] pIV1 IV for buffer 1 +* @param [in] pIV2 IV for buffer 2 +* @param [in] pIV3 IV for buffer 3 +* @param [in] pIV4 IV for buffer 4 +* +*******************************************************************************/ +static inline void +snow3gStateInitialize_4(snow3gKeyState4_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV1, const void *pIV2, + const void *pIV3, const void *pIV4) +{ + uint32_t K, L; + int i; + __m128i R, S, T, U; + __m128i V0, V1, T0, T1; + + /* Initialize the LFSR table from constants, Keys, and IV */ + + /* Load complete 128b IV into register (SSE2)*/ + uint64_t sm[2] = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL}; + __m128i *swapMask = (__m128i *) sm; + + R = _mm_loadu_si128((const __m128i *)pIV1); + S = _mm_loadu_si128((const __m128i *)pIV2); + T = _mm_loadu_si128((const __m128i *)pIV3); + U = _mm_loadu_si128((const __m128i *)pIV4); + + /* initialize the array block (SSE4) */ + for (i = 0; i < 4; i++) { + K = pKeySched->k[i]; + L = ~K; + V0 = _mm_set1_epi32(K); + V1 = _mm_set1_epi32(L); + pCtx->LFSR_X[i + 4] = V0; + pCtx->LFSR_X[i + 12] = V0; + pCtx->LFSR_X[i + 0] = V1; + pCtx->LFSR_X[i + 8] = V1; + } + /* Update the schedule structure with IVs */ + /* Store the 4 IVs in LFSR by a column/row matrix swap + * after endianness correction */ + + /* endianness swap (SSSE3) */ + R = _mm_shuffle_epi8(R, *swapMask); + S = _mm_shuffle_epi8(S, *swapMask); + T = _mm_shuffle_epi8(T, *swapMask); + U = _mm_shuffle_epi8(U, *swapMask); + + /* row/column dword inversion (SSE2) */ + T0 = _mm_unpacklo_epi32(R, S); + R = _mm_unpackhi_epi32(R, S); + T1 = _mm_unpacklo_epi32(T, U); + T = _mm_unpackhi_epi32(T, U); + + /* row/column qword inversion (SSE2) */ + U = _mm_unpackhi_epi64(R, T); + T = _mm_unpacklo_epi64(R, T); + S = _mm_unpackhi_epi64(T0, T1); + R = _mm_unpacklo_epi64(T0, T1); + + /*IV ^ LFSR (SSE2) */ + pCtx->LFSR_X[15] = _mm_xor_si128(pCtx->LFSR_X[15], U); + pCtx->LFSR_X[12] = 
_mm_xor_si128(pCtx->LFSR_X[12], T); + pCtx->LFSR_X[10] = _mm_xor_si128(pCtx->LFSR_X[10], S); + pCtx->LFSR_X[9] = _mm_xor_si128(pCtx->LFSR_X[9], R); + pCtx->iLFSR_X = 0; + /* FSM initialization (SSE2) */ + S = _mm_setzero_si128(); + for (i = 0; i < 3; i++) + pCtx->FSM_X[i] = S; + + /* Initialisation rounds */ + for (i = 0; i < 32; i++) { + ClockFSM_4(pCtx, &S); + ClockLFSR_4(pCtx); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm_xor_si128( + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], S); + } +} + +#ifdef AVX2 +/** +******************************************************************************* +* @description +* This function intializes the key schedule for 8 buffers with +* individual keys, for snow3g f8/f9. +* +* @param [in] pCtx Context where scheduled keys are stored +* @param [in] pKeySched Key schedule +* @param [in] pIV1 IV for buffer 1 +* @param [in] pIV2 IV for buffer 2 +* @param [in] pIV3 IV for buffer 3 +* @param [in] pIV4 IV for buffer 4 +* @param [in] pIV5 IV for buffer 5 +* @param [in] pIV6 IV for buffer 6 +* @param [in] pIV7 IV for buffer 7 +* @param [in] pIV8 IV for buffer 8 +* +*******************************************************************************/ +static inline void +snow3gStateInitialize_8_multiKey(snow3gKeyState8_t *pCtx, + const snow3g_key_schedule_t * const KeySched[], + const void * const pIV[]) +{ + DECLARE_ALIGNED(uint32_t k[8], 32); + DECLARE_ALIGNED(uint32_t l[8], 32); + __m256i *K = (__m256i *)k; + __m256i *L = (__m256i *)l; + + int i, j; + __m256i mR, mS, mT, mU, T0, T1; + + /* Initialize the LFSR table from constants, Keys, and IV */ + + /* Load complete 256b IV into register (SSE2)*/ + __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL, + 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL}; + mR = _mm256_loadu2_m128i((const __m128i *)pIV[4], + (const __m128i *)pIV[0]); + mS = _mm256_loadu2_m128i((const __m128i *)pIV[5], + (const __m128i *)pIV[1]); + mT = _mm256_loadu2_m128i((const __m128i *)pIV[6], + (const __m128i *)pIV[2]); + mU = _mm256_loadu2_m128i((const __m128i *)pIV[7], + (const __m128i *)pIV[3]); + + /* initialize the array block (SSE4) */ + for (i = 0; i < 4; i++) { + for (j = 0; j < 8; j++) { + k[j] = KeySched[j]->k[i]; + l[j] = ~k[j]; + } + + pCtx->LFSR_X[i + 4] = *K; + pCtx->LFSR_X[i + 12] = *K; + pCtx->LFSR_X[i + 0] = *L; + pCtx->LFSR_X[i + 8] = *L; + } + + /* Update the schedule structure with IVs */ + /* Store the 4 IVs in LFSR by a column/row matrix swap + * after endianness correction */ + + /* endianness swap (SSSE3) */ + mR = _mm256_shuffle_epi8(mR, swapMask); + mS = _mm256_shuffle_epi8(mS, swapMask); + mT = _mm256_shuffle_epi8(mT, swapMask); + mU = _mm256_shuffle_epi8(mU, swapMask); + + /* row/column dword inversion (SSE2) */ + T0 = _mm256_unpacklo_epi32(mR, mS); + mR = _mm256_unpackhi_epi32(mR, mS); + T1 = _mm256_unpacklo_epi32(mT, mU); + mT = _mm256_unpackhi_epi32(mT, mU); + + /* row/column qword inversion (SSE2) */ + mU = _mm256_unpackhi_epi64(mR, mT); + mT = _mm256_unpacklo_epi64(mR, mT); + mS = _mm256_unpackhi_epi64(T0, T1); + mR = _mm256_unpacklo_epi64(T0, T1); + + /*IV ^ LFSR (SSE2) */ + pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU); + pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT); + pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS); + pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR); + pCtx->iLFSR_X = 0; + /* FSM initialization (SSE2) */ + mS = _mm256_setzero_si256(); + for (i = 0; i < 3; i++) + pCtx->FSM_X[i] = mS; + + /* Initialisation rounds */ + for (i = 0; i < 32; i++) { + 
ClockFSM_8(pCtx, &mS); + ClockLFSR_8(pCtx); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256( + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS); + } +} + +/** +******************************************************************************* +* @description +* This function initializes the key schedule for 8 buffers for snow3g f8/f9. +* +* @param [in] pCtx Context where the scheduled keys are stored +* @param [in] pKeySched Key schedule +* @param [in] pIV1 IV for buffer 1 +* @param [in] pIV2 IV for buffer 2 +* @param [in] pIV3 IV for buffer 3 +* @param [in] pIV4 IV for buffer 4 +* @param [in] pIV5 IV for buffer 5 +* @param [in] pIV6 IV for buffer 6 +* @param [in] pIV7 IV for buffer 7 +* @param [in] pIV8 IV for buffer 8 +* +*******************************************************************************/ +static inline void +snow3gStateInitialize_8(snow3gKeyState8_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV1, const void *pIV2, + const void *pIV3, const void *pIV4, + const void *pIV5, const void *pIV6, + const void *pIV7, const void *pIV8) +{ + uint32_t K, L; + int i; + __m256i mR, mS, mT, mU, V0, V1, T0, T1; + + /* Initialize the LFSR table from constants, Keys, and IV */ + + /* Load complete 256b IV into register (SSE2)*/ + __m256i swapMask = {0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL, + 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL}; + mR = _mm256_loadu2_m128i((const __m128i *)pIV5, (const __m128i *)pIV1); + mS = _mm256_loadu2_m128i((const __m128i *)pIV6, (const __m128i *)pIV2); + mT = _mm256_loadu2_m128i((const __m128i *)pIV7, (const __m128i *)pIV3); + mU = _mm256_loadu2_m128i((const __m128i *)pIV8, (const __m128i *)pIV4); + + /* initialize the array block (SSE4) */ + for (i = 0; i < 4; i++) { + K = pKeySched->k[i]; + L = ~K; + V0 = _mm256_set1_epi32(K); + V1 = _mm256_set1_epi32(L); + pCtx->LFSR_X[i + 4] = V0; + pCtx->LFSR_X[i + 12] = V0; + pCtx->LFSR_X[i + 0] = V1; + pCtx->LFSR_X[i + 8] = V1; + } + + /* Update the schedule structure with IVs */ + /* Store the 4 IVs in LFSR by a column/row matrix swap + * after endianness correction */ + + /* endianness swap (SSSE3) */ + mR = _mm256_shuffle_epi8(mR, swapMask); + mS = _mm256_shuffle_epi8(mS, swapMask); + mT = _mm256_shuffle_epi8(mT, swapMask); + mU = _mm256_shuffle_epi8(mU, swapMask); + + /* row/column dword inversion (SSE2) */ + T0 = _mm256_unpacklo_epi32(mR, mS); + mR = _mm256_unpackhi_epi32(mR, mS); + T1 = _mm256_unpacklo_epi32(mT, mU); + mT = _mm256_unpackhi_epi32(mT, mU); + + /* row/column qword inversion (SSE2) */ + mU = _mm256_unpackhi_epi64(mR, mT); + mT = _mm256_unpacklo_epi64(mR, mT); + mS = _mm256_unpackhi_epi64(T0, T1); + mR = _mm256_unpacklo_epi64(T0, T1); + + /*IV ^ LFSR (SSE2) */ + pCtx->LFSR_X[15] = _mm256_xor_si256(pCtx->LFSR_X[15], mU); + pCtx->LFSR_X[12] = _mm256_xor_si256(pCtx->LFSR_X[12], mT); + pCtx->LFSR_X[10] = _mm256_xor_si256(pCtx->LFSR_X[10], mS); + pCtx->LFSR_X[9] = _mm256_xor_si256(pCtx->LFSR_X[9], mR); + pCtx->iLFSR_X = 0; + /* FSM initialization (SSE2) */ + mS = _mm256_setzero_si256(); + for (i = 0; i < 3; i++) + pCtx->FSM_X[i] = mS; + + /* Initialisation rounds */ + for (i = 0; i < 32; i++) { + ClockFSM_8(pCtx, &mS); + ClockLFSR_8(pCtx); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16] = _mm256_xor_si256( + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) % 16], mS); + } +} +#endif /* AVX2 */ + +static inline void +preserve_bits(uint64_t *KS, + const uint8_t *pcBufferOut, const uint8_t *pcBufferIn, + SafeBuf *safeOutBuf, SafeBuf *safeInBuf, + const uint8_t bit_len, const uint8_t byte_len) +{ + 
const uint64_t mask = UINT64_MAX << (SNOW3G_BLOCK_SIZE * 8 - bit_len); + + /* Clear the last bits of the keystream and the input + * (input only in out-of-place case) */ + *KS &= mask; + if (pcBufferIn != pcBufferOut) { + const uint64_t swapMask = BSWAP64(mask); + + safeInBuf->b64 &= swapMask; + + /* + * Merge the last bits from the output, to be preserved, + * in the keystream, to be XOR'd with the input + * (which last bits are 0, maintaining the output bits) + */ + memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len); + *KS |= BSWAP64(safeOutBuf->b64 & ~swapMask); + } +} + +/** +******************************************************************************* +* @description +* This function is the core snow3g bit algorithm +* for the 3GPP confidentiality algorithm +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in] pBufferIn Input buffer +* @param[out] pBufferOut Output buffer +* @param[in] cipherLengthInBits length in bits of the data to be encrypted +* @param[in] bitOffset offset in input buffer, where data are valid +* +*******************************************************************************/ +static inline void f8_snow3g_bit(snow3gKeyState1_t *pCtx, + const void *pIn, + void *pOut, + const uint32_t lengthInBits, + const uint32_t offsetInBits) +{ + const uint8_t *pBufferIn = pIn; + uint8_t *pBufferOut = pOut; + uint32_t cipherLengthInBits = lengthInBits; + uint64_t shiftrem = 0; + uint64_t KS8, KS8bit; /* 8 bytes of keystream */ + const uint8_t *pcBufferIn = pBufferIn + (offsetInBits / 8); + uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8); + /* Offset into the first byte (0 - 7 bits) */ + uint32_t remainOffset = offsetInBits % 8; + uint32_t byteLength = (cipherLengthInBits + 7) / 8; + SafeBuf safeInBuf = {0}; + SafeBuf safeOutBuf = {0}; + + /* Now run the block cipher */ + + /* Start with potential partial block (due to offset and length) */ + snow3g_keystream_1_8(pCtx, &KS8); + KS8bit = KS8 >> remainOffset; + /* Only one block to encrypt */ + if (cipherLengthInBits < (64 - remainOffset)) { + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength); + /* + * If operation is Out-of-place and there is offset + * to be applied, "remainOffset" bits from the output buffer + * need to be preserved (only applicable to first byte, + * since remainOffset is up to 7 bits) + */ + if ((pIn != pOut) && remainOffset) { + const uint8_t mask8 = (uint8_t) + (1 << (8 - remainOffset)) - 1; + + safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) | + (pcBufferOut[0] & ~mask8); + } + /* If last byte is a partial byte, the last bits of the output + * need to be preserved */ + const uint8_t bitlen_with_off = remainOffset + + cipherLengthInBits; + + if ((bitlen_with_off & 0x7) != 0) + preserve_bits(&KS8bit, pcBufferOut, pcBufferIn, + &safeOutBuf, &safeInBuf, + bitlen_with_off, byteLength); + + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + return; + } + /* + * If operation is Out-of-place and there is offset + * to be applied, "remainOffset" bits from the output buffer + * need to be preserved (only applicable to first byte, + * since remainOffset is up to 7 bits) + */ + if ((pIn != pOut) && remainOffset) { + const uint8_t mask8 = (uint8_t)(1 << (8 - remainOffset)) - 1; + + memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8); + safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) | + (pcBufferOut[0] & ~mask8); + xor_keystrm_rev(pcBufferOut, safeInBuf.b8, KS8bit); + pcBufferIn += 
SNOW3G_BLOCK_SIZE; + } else { + /* At least 64 bits to produce (including offset) */ + pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, KS8bit); + } + + if (remainOffset != 0) + shiftrem = KS8 << (64 - remainOffset); + cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8 - remainOffset; + pcBufferOut += SNOW3G_BLOCK_SIZE; + + while (cipherLengthInBits) { + /* produce the next block of keystream */ + snow3g_keystream_1_8(pCtx, &KS8); + KS8bit = (KS8 >> remainOffset) | shiftrem; + if (remainOffset != 0) + shiftrem = KS8 << (64 - remainOffset); + if (cipherLengthInBits >= SNOW3G_BLOCK_SIZE * 8) { + pcBufferIn = xor_keystrm_rev(pcBufferOut, + pcBufferIn, KS8bit); + cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8; + pcBufferOut += SNOW3G_BLOCK_SIZE; + /* loop variant */ + } else { + /* end of the loop, handle the last bytes */ + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, + byteLength); + + /* If last byte is a partial byte, the last bits + * of the output need to be preserved */ + if ((cipherLengthInBits & 0x7) != 0) + preserve_bits(&KS8bit, pcBufferOut, pcBufferIn, + &safeOutBuf, &safeInBuf, + cipherLengthInBits, byteLength); + + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + cipherLengthInBits = 0; + } + } +#ifdef SAFE_DATA + CLEAR_VAR(&KS8, sizeof(KS8)); + CLEAR_VAR(&KS8bit, sizeof(KS8bit)); + CLEAR_MEM(&safeInBuf, sizeof(safeInBuf)); + CLEAR_MEM(&safeOutBuf, sizeof(safeOutBuf)); +#endif +} + +/** +******************************************************************************* +* @description +* This function is the core snow3g algorithm for +* the 3GPP confidentiality and integrity algorithm. +* +* @param[in] pCtx Context where the scheduled keys are stored +* @param[in] pBufferIn Input buffer +* @param[out] pBufferOut Output buffer +* @param[in] lengthInBytes length in bytes of the data to be encrypted +* +*******************************************************************************/ +static inline void f8_snow3g(snow3gKeyState1_t *pCtx, + const void *pIn, + void *pOut, + const uint32_t lengthInBytes) +{ + uint32_t qwords = lengthInBytes / SNOW3G_8_BYTES; /* number of qwords */ + uint32_t words = lengthInBytes & 4; /* remaining word if not 0 */ + uint32_t bytes = lengthInBytes & 3; /* remaining bytes */ + uint32_t KS4; /* 4 bytes of keystream */ + uint64_t KS8; /* 8 bytes of keystream */ + const uint8_t *pBufferIn = pIn; + uint8_t *pBufferOut = pOut; + + /* process 64 bits at a time */ + while (qwords--) { + /* generate keystream 8 bytes at a time */ + snow3g_keystream_1_8(pCtx, &KS8); + + /* xor keystream 8 bytes at a time */ + pBufferIn = xor_keystrm_rev(pBufferOut, pBufferIn, KS8); + pBufferOut += SNOW3G_8_BYTES; + } + + /* check for remaining 0 to 7 bytes */ + if (0 != words) { + if (bytes) { + /* 5 to 7 last bytes, process 8 bytes */ + uint8_t buftemp[8]; + uint8_t safeBuff[8]; + + memset(safeBuff, 0, SNOW3G_8_BYTES); + snow3g_keystream_1_8(pCtx, &KS8); + memcpy_keystrm(safeBuff, pBufferIn, 4 + bytes); + xor_keystrm_rev(buftemp, safeBuff, KS8); + memcpy_keystrm(pBufferOut, buftemp, 4 + bytes); +#ifdef SAFE_DATA + CLEAR_MEM(&safeBuff, sizeof(safeBuff)); + CLEAR_MEM(&buftemp, sizeof(buftemp)); +#endif + } else { + /* exactly 4 last bytes */ + snow3g_keystream_1_4(pCtx, &KS4); + xor_keystream_reverse_32(pBufferOut, pBufferIn, KS4); + } + } else if (0 != bytes) { + /* 1 to 3 last bytes */ + uint8_t buftemp[4]; + uint8_t safeBuff[4]; + + memset(safeBuff, 0, SNOW3G_4_BYTES); + 
snow3g_keystream_1_4(pCtx, &KS4); + memcpy_keystream_32(safeBuff, pBufferIn, bytes); + xor_keystream_reverse_32(buftemp, safeBuff, KS4); + memcpy_keystream_32(pBufferOut, buftemp, bytes); +#ifdef SAFE_DATA + CLEAR_MEM(&safeBuff, sizeof(safeBuff)); + CLEAR_MEM(&buftemp, sizeof(buftemp)); +#endif + } + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_VAR(&KS8, sizeof(KS8)); +#endif +} + +#ifdef AVX2 +/** +******************************************************************************* +* @description +* This function converts the state from a 4 buffer state structure to 1 +* buffer state structure. +* +* @param[in] pSrcState Pointer to the source state +* @param[in] pDstState Pointer to the destination state +* @param[in] NumBuffers Number of buffers +* +*******************************************************************************/ +static inline void snow3gStateConvert_8(snow3gKeyState8_t *pSrcState, + snow3gKeyState1_t *pDstState, + uint32_t NumBuffers) +{ + uint32_t T = 0, iLFSR_X = pSrcState->iLFSR_X; + __m256i *LFSR_X = pSrcState->LFSR_X; + int i; + + for (i = 0; i < 16; i++) { + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 0); + break; + case 1: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 1); + break; + case 2: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 2); + break; + case 3: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 3); + break; + case 4: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 4); + break; + case 5: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 5); + break; + case 6: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 6); + break; + case 7: + T = _mm256_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 7); + break; + } + pDstState->LFSR_S[i] = T; + } + i = 0; + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3); + break; + case 4: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4); + break; + case 5: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5); + break; + case 6: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 6); + break; + case 7: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7); + break; + } + pDstState->FSM_R1 = T; + + i = 1; + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3); + break; + case 4: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4); + break; + case 5: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5); + break; + case 6: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 6); + break; + case 7: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7); + break; + } + pDstState->FSM_R2 = T; + + i = 2; + switch (NumBuffers) { + case 0: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 3); + break; + case 4: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 4); + break; + case 5: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 5); + break; + case 6: + T = 
_mm256_extract_epi32(pSrcState->FSM_X[i], 6); + break; + case 7: + T = _mm256_extract_epi32(pSrcState->FSM_X[i], 7); + break; + } + pDstState->FSM_R3 = T; +} +#endif /* AVX2 */ + +/** +******************************************************************************* +* @description +* This function converts the state from a 4 buffer state structure to 1 +* buffer state structure. +* +* @param[in] pSrcState Pointer to the source state +* @param[in] pDstState Pointer to the destination state +* @param[in] NumBuffers Number of buffers +* +*******************************************************************************/ +static inline void snow3gStateConvert_4(snow3gKeyState4_t *pSrcState, + snow3gKeyState1_t *pDstState, + uint32_t NumBuffers) +{ + uint32_t i; + uint32_t T = 0, iLFSR_X = pSrcState->iLFSR_X; + __m128i *LFSR_X = pSrcState->LFSR_X; + + for (i = 0; i < 16; i++) { + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 0); + break; + case 1: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 1); + break; + case 2: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 2); + break; + case 3: + T = _mm_extract_epi32(LFSR_X[(i + iLFSR_X) % 16], 3); + break; + } + pDstState->LFSR_S[i] = T; + } + + i = 0; + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 3); + break; + } + pDstState->FSM_R1 = T; + + i = 1; + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 3); + break; + } + pDstState->FSM_R2 = T; + + i = 2; + switch (NumBuffers) { + case 0: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 0); + break; + case 1: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 1); + break; + case 2: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 2); + break; + case 3: + T = _mm_extract_epi32(pSrcState->FSM_X[i], 3); + break; + } + pDstState->FSM_R3 = T; +} + +/*--------------------------------------------------------- + * f8() + * Initializations and Context size definitions + *---------------------------------------------------------*/ +size_t SNOW3G_KEY_SCHED_SIZE(void) { return sizeof(snow3g_key_schedule_t); } + +int SNOW3G_INIT_KEY_SCHED(const void *pKey, snow3g_key_schedule_t *pCtx) +{ +#ifdef SAFE_PARAM + if ((pKey == NULL) || (pCtx == NULL)) + return -1; +#endif + + const uint32_t *pKey32 = pKey; + + pCtx->k[3] = BSWAP32(pKey32[0]); + pCtx->k[2] = BSWAP32(pKey32[1]); + pCtx->k[1] = BSWAP32(pKey32[2]); + pCtx->k[0] = BSWAP32(pKey32[3]); + + return 0; +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 1 buffer: + * Single buffer enc/dec with IV and precomputed key schedule + *---------------------------------------------------------*/ +void SNOW3G_F8_1_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV == NULL) || + (pBufferIn == NULL) || (pBufferOut == NULL) || + (lengthInBytes == 0) || (lengthInBytes > SNOW3G_MAX_BYTELEN)) + return; +#endif + snow3gKeyState1_t ctx; + uint32_t KS4; /* 4 bytes of keystream */ + + /* Initialize the schedule 
from the IV */ + snow3gStateInitialize_1(&ctx, pHandle, pIV); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx, &KS4); + + f8_snow3g(&ctx, pBufferIn, pBufferOut, lengthInBytes); + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 bit 1 buffer: + * Single buffer enc/dec with IV and precomputed key schedule + *---------------------------------------------------------*/ +void SNOW3G_F8_1_BUFFER_BIT(const snow3g_key_schedule_t *pHandle, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBits, + const uint32_t offsetInBits) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV == NULL) || + (pBufferIn == NULL) || (pBufferOut == NULL) || + (lengthInBits == 0)) + return; +#endif + + snow3gKeyState1_t ctx; + uint32_t KS4; /* 4 bytes of keystream */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx, pHandle, pIV); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx, &KS4); + + f8_snow3g_bit(&ctx, pBufferIn, pBufferOut, lengthInBits, offsetInBits); + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 2 buffer: + * Two buffers enc/dec with the same key schedule. + * The 3 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. + *---------------------------------------------------------*/ +void SNOW3G_F8_2_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pBufIn1, + void *pBufOut1, + const uint32_t lenInBytes1, + const void *pBufIn2, + void *pBufOut2, + const uint32_t lenInBytes2) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV1 == NULL) || (pIV2 == NULL) || + (pBufIn1 == NULL) || (pBufOut1 == NULL) || + (pBufIn2 == NULL) || (pBufOut2 == NULL) || + (lenInBytes1 == 0) || (lenInBytes1 > SNOW3G_MAX_BYTELEN) || + (lenInBytes2 == 0) || (lenInBytes2 > SNOW3G_MAX_BYTELEN)) + return; +#endif + + snow3gKeyState1_t ctx1, ctx2; + uint32_t KS4; /* 4 bytes of keystream */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx1, pHandle, pIV1); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx1, &KS4); + + /* data processing for packet 1 */ + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx2, pHandle, pIV2); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_1_4(&ctx2, &KS4); + + /* data processing for packet 2 */ + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_MEM(&ctx1, sizeof(ctx1)); + CLEAR_MEM(&ctx2, sizeof(ctx2)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 4 buffer: + * Four packets enc/dec with the same key schedule. + * The 4 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. 
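+ * All four streams are advanced together over the shortest buffer
+ * length (rounded down to 8-byte blocks); the remaining bytes of the
+ * longer buffers are finished with the single-buffer path.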
+ *---------------------------------------------------------*/ +void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || + (pIV1 == NULL) || (pIV2 == NULL) || + (pIV3 == NULL) || (pIV4 == NULL) || + (pBufferIn1 == NULL) || (pBufferOut1 == NULL) || + (pBufferIn2 == NULL) || (pBufferOut2 == NULL) || + (pBufferIn3 == NULL) || (pBufferOut3 == NULL) || + (pBufferIn4 == NULL) || (pBufferOut4 == NULL) || + (lengthInBytes1 == 0) || (lengthInBytes1 > SNOW3G_MAX_BYTELEN) || + (lengthInBytes2 == 0) || (lengthInBytes2 > SNOW3G_MAX_BYTELEN) || + (lengthInBytes3 == 0) || (lengthInBytes3 > SNOW3G_MAX_BYTELEN) || + (lengthInBytes4 == 0) || (lengthInBytes4 > SNOW3G_MAX_BYTELEN)) + return; +#endif + + snow3gKeyState4_t ctx; + __m128i H, L; /* 4 bytes of keystream */ + uint32_t lenInBytes1 = lengthInBytes1; + uint32_t lenInBytes2 = lengthInBytes2; + uint32_t lenInBytes3 = lengthInBytes3; + uint32_t lenInBytes4 = lengthInBytes4; + uint32_t bytes1 = + (lenInBytes1 < lenInBytes2 ? lenInBytes1 + : lenInBytes2); /* number of bytes */ + uint32_t bytes2 = + (lenInBytes3 < lenInBytes4 ? lenInBytes3 + : lenInBytes4); /* number of bytes */ + /* min num of bytes */ + uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2; + uint32_t qwords = bytes / SNOW3G_8_BYTES; + uint8_t *pBufOut1 = pBufferOut1; + uint8_t *pBufOut2 = pBufferOut2; + uint8_t *pBufOut3 = pBufferOut3; + uint8_t *pBufOut4 = pBufferOut4; + const uint8_t *pBufIn1 = pBufferIn1; + const uint8_t *pBufIn2 = pBufferIn2; + const uint8_t *pBufIn3 = pBufferIn3; + const uint8_t *pBufIn4 = pBufferIn4; + + bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_4(&ctx, pHandle, pIV1, pIV2, pIV3, pIV4); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_4_4(&ctx, &L); + + lenInBytes1 -= bytes; + lenInBytes2 -= bytes; + lenInBytes3 -= bytes; + lenInBytes4 -= bytes; + + /* generates 4 bytes at a time on all streams */ + while (qwords--) { + snow3g_keystream_4_8(&ctx, &L, &H); + pBufIn1 = xor_keystrm_rev(pBufOut1, pBufIn1, + _mm_extract_epi64(L, 0)); + pBufIn2 = xor_keystrm_rev(pBufOut2, pBufIn2, + _mm_extract_epi64(L, 1)); + pBufIn3 = xor_keystrm_rev(pBufOut3, pBufIn3, + _mm_extract_epi64(H, 0)); + pBufIn4 = xor_keystrm_rev(pBufOut4, pBufIn4, + _mm_extract_epi64(H, 1)); + + pBufOut1 += SNOW3G_8_BYTES; + pBufOut2 += SNOW3G_8_BYTES; + pBufOut3 += SNOW3G_8_BYTES; + pBufOut4 += SNOW3G_8_BYTES; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (lenInBytes1) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_4(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + } + + if (lenInBytes2) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_4(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + } + + if (lenInBytes3) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_4(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3); + } + + if (lenInBytes4) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_4(&ctx, 
&ctx4, 3); + f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4); + } + +#ifdef SAFE_DATA + H = _mm_setzero_si128(); + L = _mm_setzero_si128(); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + +} + +#ifdef AVX2 +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 8 multi: + * Processes 8 packets 8 bytes at a time. + * Uses individual key schedule for each buffer. + *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_8_multi(uint32_t bytes, + const snow3g_key_schedule_t * const pKey[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], const uint32_t *lengthInBytes) +{ + uint32_t qwords = bytes / SNOW3G_8_BYTES; + __m256i H, L; /* 8 bytes of keystream */ + snow3gKeyState8_t ctx; + int i; + const uint8_t *tBufferIn[8]; + uint8_t *tBufferOut[8]; + uint32_t tLenInBytes[8]; + + bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */ + + for (i = 0; i < 8; i++) { + tBufferIn[i] = pBufferIn[i]; + tBufferOut[i] = pBufferOut[i]; + tLenInBytes[i] = lengthInBytes[i]; + } + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8_multiKey(&ctx, pKey, IV); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_8_4(&ctx, &L); + + for (i = 0; i < 8; i++) + tLenInBytes[i] -= bytes; + + /* generates 8 sets at a time on all streams */ + for (i = qwords; i != 0; i--) { + int j; + + snow3g_keystream_8_8(&ctx, &L, &H); + + tBufferIn[0] = xor_keystrm_rev(tBufferOut[0], tBufferIn[0], + _mm256_extract_epi64(L, 0)); + tBufferIn[1] = xor_keystrm_rev(tBufferOut[1], tBufferIn[1], + _mm256_extract_epi64(L, 1)); + tBufferIn[2] = xor_keystrm_rev(tBufferOut[2], tBufferIn[2], + _mm256_extract_epi64(H, 0)); + tBufferIn[3] = xor_keystrm_rev(tBufferOut[3], tBufferIn[3], + _mm256_extract_epi64(H, 1)); + tBufferIn[4] = xor_keystrm_rev(tBufferOut[4], tBufferIn[4], + _mm256_extract_epi64(L, 2)); + tBufferIn[5] = xor_keystrm_rev(tBufferOut[5], tBufferIn[5], + _mm256_extract_epi64(L, 3)); + tBufferIn[6] = xor_keystrm_rev(tBufferOut[6], tBufferIn[6], + _mm256_extract_epi64(H, 2)); + tBufferIn[7] = xor_keystrm_rev(tBufferOut[7], tBufferIn[7], + _mm256_extract_epi64(H, 3)); + + for (j = 0; j < 8; j++) + tBufferOut[j] += SNOW3G_8_BYTES; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (tLenInBytes[0]) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, tBufferIn[0], tBufferOut[0], tLenInBytes[0]); + } + if (tLenInBytes[1]) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, tBufferIn[1], tBufferOut[1], tLenInBytes[1]); + } + if (tLenInBytes[2]) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, tBufferIn[2], tBufferOut[2], tLenInBytes[2]); + } + if (tLenInBytes[3]) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, tBufferIn[3], tBufferOut[3], tLenInBytes[3]); + } + if (tLenInBytes[4]) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, tBufferIn[4], tBufferOut[4], tLenInBytes[4]); + } + if (tLenInBytes[5]) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, tBufferIn[5], tBufferOut[5], tLenInBytes[5]); + } + if (tLenInBytes[6]) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + 
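+                /* lane 6 now lives in a scalar single-buffer state;
+                 * drain its remaining bytes with the scalar routine */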
f8_snow3g(&ctx7, tBufferIn[6], tBufferOut[6], tLenInBytes[6]); + } + if (tLenInBytes[7]) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, tBufferIn[7], tBufferOut[7], tLenInBytes[7]); + } + +#ifdef SAFE_DATA + H = _mm256_setzero_si256(); + L = _mm256_setzero_si256(); + CLEAR_MEM(&ctx, sizeof(ctx)); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 32 multi: + * Processes 8 packets 32 bytes at a time. + * Uses individual key schedule for each buffer. + *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_32_multi(uint32_t bytes, + const snow3g_key_schedule_t * const pKey[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], const uint32_t *lengthInBytes) +{ + + snow3gKeyState8_t ctx; + uint32_t i; + + const uint8_t *tBufferIn[8]; + uint8_t *tBufferOut[8]; + uint32_t tLenInBytes[8]; + + for (i = 0; i < 8; i++) { + tBufferIn[i] = pBufferIn[i]; + tBufferOut[i] = pBufferOut[i]; + tLenInBytes[i] = lengthInBytes[i]; + } + + uint32_t blocks = bytes / 32; + + bytes = blocks * 32; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8_multiKey(&ctx, pKey, IV); + + /* Clock FSM and LFSR once, ignore the keystream */ + __m256i ks[8]; + + snow3g_keystream_8_4(&ctx, ks); + + for (i = 0; i < 8; i++) + tLenInBytes[i] -= bytes; + + __m256i in[8]; + + /* generates 8 sets at a time on all streams */ + for (i = 0; i < blocks; i++) { + int j; + + in[0] = _mm256_loadu_si256((const __m256i *)tBufferIn[0]); + in[1] = _mm256_loadu_si256((const __m256i *)tBufferIn[1]); + in[2] = _mm256_loadu_si256((const __m256i *)tBufferIn[2]); + in[3] = _mm256_loadu_si256((const __m256i *)tBufferIn[3]); + in[4] = _mm256_loadu_si256((const __m256i *)tBufferIn[4]); + in[5] = _mm256_loadu_si256((const __m256i *)tBufferIn[5]); + in[6] = _mm256_loadu_si256((const __m256i *)tBufferIn[6]); + in[7] = _mm256_loadu_si256((const __m256i *)tBufferIn[7]); + + snow3g_keystream_8_32(&ctx, ks); + + _mm256_storeu_si256((__m256i *)tBufferOut[0], + _mm256_xor_si256(in[0], ks[0])); + _mm256_storeu_si256((__m256i *)tBufferOut[1], + _mm256_xor_si256(in[1], ks[1])); + _mm256_storeu_si256((__m256i *)tBufferOut[2], + _mm256_xor_si256(in[2], ks[2])); + _mm256_storeu_si256((__m256i *)tBufferOut[3], + _mm256_xor_si256(in[3], ks[3])); + _mm256_storeu_si256((__m256i *)tBufferOut[4], + _mm256_xor_si256(in[4], ks[4])); + _mm256_storeu_si256((__m256i *)tBufferOut[5], + _mm256_xor_si256(in[5], ks[5])); + _mm256_storeu_si256((__m256i *)tBufferOut[6], + _mm256_xor_si256(in[6], ks[6])); + _mm256_storeu_si256((__m256i *)tBufferOut[7], + _mm256_xor_si256(in[7], ks[7])); + + for (j = 0; j < 8; j++) { + tBufferIn[i] += 32; + tBufferOut[i] += 32; + } + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (tLenInBytes[0]) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, tBufferIn[0], tBufferOut[0], tLenInBytes[0]); + } + if (tLenInBytes[1]) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, tBufferIn[1], tBufferOut[1], tLenInBytes[1]); + } + if (tLenInBytes[2]) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, tBufferIn[2], tBufferOut[2], tLenInBytes[2]); + } + if (tLenInBytes[3]) { + snow3gKeyState1_t ctx4; + + 
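+                /* extract lane 3 of the 8-way state and finish the
+                 * remaining bytes with the single-buffer path */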
snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, tBufferIn[3], tBufferOut[3], tLenInBytes[3]); + } + if (tLenInBytes[4]) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, tBufferIn[4], tBufferOut[4], tLenInBytes[4]); + } + if (tLenInBytes[5]) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, tBufferIn[5], tBufferOut[5], tLenInBytes[5]); + } + if (tLenInBytes[6]) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + f8_snow3g(&ctx7, tBufferIn[6], tBufferOut[6], tLenInBytes[6]); + } + if (tLenInBytes[7]) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, tBufferIn[7], tBufferOut[7], tLenInBytes[7]); + } + +#ifdef SAFE_DATA + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_MEM(&ks, sizeof(ks)); + CLEAR_MEM(&in, sizeof(in)); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 8 multi: + * Processes 8 packets 8 bytes at a time. + * Uses same key schedule for each buffer. + *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_8(uint32_t bytes, + const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, void *pBufferOut8, + const uint32_t lengthInBytes8) +{ + + uint32_t qwords = bytes / SNOW3G_8_BYTES; + __m256i H, L; /* 8 bytes of keystream */ + snow3gKeyState8_t ctx; + int i; + uint32_t lenInBytes1 = lengthInBytes1; + uint32_t lenInBytes2 = lengthInBytes2; + uint32_t lenInBytes3 = lengthInBytes3; + uint32_t lenInBytes4 = lengthInBytes4; + uint32_t lenInBytes5 = lengthInBytes5; + uint32_t lenInBytes6 = lengthInBytes6; + uint32_t lenInBytes7 = lengthInBytes7; + uint32_t lenInBytes8 = lengthInBytes8; + uint8_t *pBufOut1 = pBufferOut1; + uint8_t *pBufOut2 = pBufferOut2; + uint8_t *pBufOut3 = pBufferOut3; + uint8_t *pBufOut4 = pBufferOut4; + uint8_t *pBufOut5 = pBufferOut5; + uint8_t *pBufOut6 = pBufferOut6; + uint8_t *pBufOut7 = pBufferOut7; + uint8_t *pBufOut8 = pBufferOut8; + const uint8_t *pBufIn1 = pBufferIn1; + const uint8_t *pBufIn2 = pBufferIn2; + const uint8_t *pBufIn3 = pBufferIn3; + const uint8_t *pBufIn4 = pBufferIn4; + const uint8_t *pBufIn5 = pBufferIn5; + const uint8_t *pBufIn6 = pBufferIn6; + const uint8_t *pBufIn7 = pBufferIn7; + const uint8_t *pBufIn8 = pBufferIn8; + + bytes = qwords * SNOW3G_8_BYTES; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8(&ctx, pHandle, pIV1, pIV2, pIV3, + pIV4, pIV5, pIV6, pIV7, pIV8); + + /* Clock FSM and LFSR once, ignore the keystream */ + snow3g_keystream_8_4(&ctx, &L); + + lenInBytes1 -= bytes; + lenInBytes2 -= bytes; + lenInBytes3 -= bytes; + lenInBytes4 -= bytes; + lenInBytes5 -= bytes; + lenInBytes6 -= bytes; + lenInBytes7 -= bytes; + 
lenInBytes8 -= bytes; + + /* generates 8 sets at a time on all streams */ + for (i = qwords; i != 0; i--) { + snow3g_keystream_8_8(&ctx, &L, &H); + + pBufIn1 = xor_keystrm_rev(pBufOut1, pBufIn1, + _mm256_extract_epi64(L, 0)); + pBufIn2 = xor_keystrm_rev(pBufOut2, pBufIn2, + _mm256_extract_epi64(L, 1)); + pBufIn3 = xor_keystrm_rev(pBufOut3, pBufIn3, + _mm256_extract_epi64(H, 0)); + pBufIn4 = xor_keystrm_rev(pBufOut4, pBufIn4, + _mm256_extract_epi64(H, 1)); + pBufIn5 = xor_keystrm_rev(pBufOut5, pBufIn5, + _mm256_extract_epi64(L, 2)); + pBufIn6 = xor_keystrm_rev(pBufOut6, pBufIn6, + _mm256_extract_epi64(L, 3)); + pBufIn7 = xor_keystrm_rev(pBufOut7, pBufIn7, + _mm256_extract_epi64(H, 2)); + pBufIn8 = xor_keystrm_rev(pBufOut8, pBufIn8, + _mm256_extract_epi64(H, 3)); + + pBufOut1 += SNOW3G_8_BYTES; + pBufOut2 += SNOW3G_8_BYTES; + pBufOut3 += SNOW3G_8_BYTES; + pBufOut4 += SNOW3G_8_BYTES; + pBufOut5 += SNOW3G_8_BYTES; + pBufOut6 += SNOW3G_8_BYTES; + pBufOut7 += SNOW3G_8_BYTES; + pBufOut8 += SNOW3G_8_BYTES; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (lenInBytes1) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + } + + if (lenInBytes2) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + } + + if (lenInBytes3) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3); + } + + if (lenInBytes4) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4); + } + + if (lenInBytes5) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, pBufIn5, pBufOut5, lenInBytes5); + } + + if (lenInBytes6) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, pBufIn6, pBufOut6, lenInBytes6); + } + + if (lenInBytes7) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + f8_snow3g(&ctx7, pBufIn7, pBufOut7, lenInBytes7); + } + + if (lenInBytes8) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, pBufIn8, pBufOut8, lenInBytes8); + } + +#ifdef SAFE_DATA + H = _mm256_setzero_si256(); + L = _mm256_setzero_si256(); + CLEAR_MEM(&ctx, sizeof(ctx)); +#endif /* SAFE_DATA */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G 8 buffer ks 32 multi: + * Processes 8 packets 32 bytes at a time. + * Uses same key schedule for each buffer. 
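+ * Requires AVX2: each iteration XORs a full 32-byte keystream block
+ * into every stream; shorter tails are finished with the
+ * single-buffer path.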
+ *---------------------------------------------------------*/ +static inline void +snow3g_8_buffer_ks_32(uint32_t bytes, + const snow3g_key_schedule_t *pKey, + const void *pIV1, const void *pIV2, + const void *pIV3, const void *pIV4, + const void *pIV5, const void *pIV6, + const void *pIV7, const void *pIV8, + const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, void *pBufferOut8, + const uint32_t lengthInBytes8) +{ + snow3gKeyState8_t ctx; + uint32_t i; + uint32_t lenInBytes1 = lengthInBytes1; + uint32_t lenInBytes2 = lengthInBytes2; + uint32_t lenInBytes3 = lengthInBytes3; + uint32_t lenInBytes4 = lengthInBytes4; + uint32_t lenInBytes5 = lengthInBytes5; + uint32_t lenInBytes6 = lengthInBytes6; + uint32_t lenInBytes7 = lengthInBytes7; + uint32_t lenInBytes8 = lengthInBytes8; + uint8_t *pBufOut1 = pBufferOut1; + uint8_t *pBufOut2 = pBufferOut2; + uint8_t *pBufOut3 = pBufferOut3; + uint8_t *pBufOut4 = pBufferOut4; + uint8_t *pBufOut5 = pBufferOut5; + uint8_t *pBufOut6 = pBufferOut6; + uint8_t *pBufOut7 = pBufferOut7; + uint8_t *pBufOut8 = pBufferOut8; + const uint8_t *pBufIn1 = pBufferIn1; + const uint8_t *pBufIn2 = pBufferIn2; + const uint8_t *pBufIn3 = pBufferIn3; + const uint8_t *pBufIn4 = pBufferIn4; + const uint8_t *pBufIn5 = pBufferIn5; + const uint8_t *pBufIn6 = pBufferIn6; + const uint8_t *pBufIn7 = pBufferIn7; + const uint8_t *pBufIn8 = pBufferIn8; + + uint32_t blocks = bytes / 32; + + bytes = blocks * 32; /* rounded down minimum length */ + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_8(&ctx, pKey, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6, + pIV7, pIV8); + + /* Clock FSM and LFSR once, ignore the keystream */ + __m256i ks[8]; + + snow3g_keystream_8_4(&ctx, ks); + + lenInBytes1 -= bytes; + lenInBytes2 -= bytes; + lenInBytes3 -= bytes; + lenInBytes4 -= bytes; + lenInBytes5 -= bytes; + lenInBytes6 -= bytes; + lenInBytes7 -= bytes; + lenInBytes8 -= bytes; + + __m256i in[8]; + + /* generates 8 sets at a time on all streams */ + for (i = 0; i < blocks; i++) { + + in[0] = _mm256_loadu_si256((const __m256i *)pBufIn1); + in[1] = _mm256_loadu_si256((const __m256i *)pBufIn2); + in[2] = _mm256_loadu_si256((const __m256i *)pBufIn3); + in[3] = _mm256_loadu_si256((const __m256i *)pBufIn4); + in[4] = _mm256_loadu_si256((const __m256i *)pBufIn5); + in[5] = _mm256_loadu_si256((const __m256i *)pBufIn6); + in[6] = _mm256_loadu_si256((const __m256i *)pBufIn7); + in[7] = _mm256_loadu_si256((const __m256i *)pBufIn8); + + snow3g_keystream_8_32(&ctx, ks); + + _mm256_storeu_si256((__m256i *)pBufOut1, + _mm256_xor_si256(in[0], ks[0])); + _mm256_storeu_si256((__m256i *)pBufOut2, + _mm256_xor_si256(in[1], ks[1])); + _mm256_storeu_si256((__m256i *)pBufOut3, + _mm256_xor_si256(in[2], ks[2])); + _mm256_storeu_si256((__m256i *)pBufOut4, + _mm256_xor_si256(in[3], ks[3])); + _mm256_storeu_si256((__m256i *)pBufOut5, + _mm256_xor_si256(in[4], ks[4])); + _mm256_storeu_si256((__m256i *)pBufOut6, + _mm256_xor_si256(in[5], ks[5])); + _mm256_storeu_si256((__m256i *)pBufOut7, + 
_mm256_xor_si256(in[6], ks[6])); + _mm256_storeu_si256((__m256i *)pBufOut8, + _mm256_xor_si256(in[7], ks[7])); + + pBufIn1 += 32; + pBufIn2 += 32; + pBufIn3 += 32; + pBufIn4 += 32; + pBufIn5 += 32; + pBufIn6 += 32; + pBufIn7 += 32; + pBufIn8 += 32; + + pBufOut1 += 32; + pBufOut2 += 32; + pBufOut3 += 32; + pBufOut4 += 32; + pBufOut5 += 32; + pBufOut6 += 32; + pBufOut7 += 32; + pBufOut8 += 32; + } + + /* process the remaining of each buffer + * - extract the LFSR and FSM structures + * - Continue process 1 buffer + */ + if (lenInBytes1) { + snow3gKeyState1_t ctx1; + + snow3gStateConvert_8(&ctx, &ctx1, 0); + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + } + + if (lenInBytes2) { + snow3gKeyState1_t ctx2; + + snow3gStateConvert_8(&ctx, &ctx2, 1); + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + } + + if (lenInBytes3) { + snow3gKeyState1_t ctx3; + + snow3gStateConvert_8(&ctx, &ctx3, 2); + f8_snow3g(&ctx3, pBufIn3, pBufOut3, lenInBytes3); + } + + if (lenInBytes4) { + snow3gKeyState1_t ctx4; + + snow3gStateConvert_8(&ctx, &ctx4, 3); + f8_snow3g(&ctx4, pBufIn4, pBufOut4, lenInBytes4); + } + + if (lenInBytes5) { + snow3gKeyState1_t ctx5; + + snow3gStateConvert_8(&ctx, &ctx5, 4); + f8_snow3g(&ctx5, pBufIn5, pBufOut5, lenInBytes5); + } + + if (lenInBytes6) { + snow3gKeyState1_t ctx6; + + snow3gStateConvert_8(&ctx, &ctx6, 5); + f8_snow3g(&ctx6, pBufIn6, pBufOut6, lenInBytes6); + } + + if (lenInBytes7) { + snow3gKeyState1_t ctx7; + + snow3gStateConvert_8(&ctx, &ctx7, 6); + f8_snow3g(&ctx7, pBufIn7, pBufOut7, lenInBytes7); + } + + if (lenInBytes8) { + snow3gKeyState1_t ctx8; + + snow3gStateConvert_8(&ctx, &ctx8, 7); + f8_snow3g(&ctx8, pBufIn8, pBufOut8, lenInBytes8); + } + +#ifdef SAFE_DATA + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_MEM(&ks, sizeof(ks)); + CLEAR_MEM(&in, sizeof(in)); +#endif /* SAFE_DATA */ +} +#endif /* AVX2 */ + +/*--------------------------------------------------------- + * @description + * Snow3G F8 8 buffer, multi-key: + * Eight packets enc/dec with eight respective key schedules. + * The 8 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. 
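+ * Without AVX2 support the call falls back to eight independent
+ * single-buffer operations.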
+ *---------------------------------------------------------*/ +void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[], + const void * const IV[], + const void * const BufferIn[], + void *BufferOut[], + const uint32_t lengthInBytes[]) +{ + int i; + +#ifdef SAFE_PARAM + if ((pKey == NULL) || (IV == NULL) || (BufferIn == NULL) || + (BufferOut == NULL) || (lengthInBytes == NULL)) + return; + + for (i = 0; i < 8; i++) + if ((pKey[i] == NULL) || (IV[i] == NULL) || + (BufferIn[i] == NULL) || (BufferOut[i] == NULL) || + (lengthInBytes[i] == 0) || + (lengthInBytes[i] > SNOW3G_MAX_BYTELEN)) + return; +#endif + +#ifndef AVX2 + /* basic C workaround for lack of non AVX2 implementation */ + for (i = 0; i < 8; i++) + SNOW3G_F8_1_BUFFER(pKey[i], IV[i], BufferIn[i], BufferOut[i], + lengthInBytes[i]); +#else + uint32_t bytes = lengthInBytes[0]; + + /* find min byte lenght */ + for (i = 1; i < 8; i++) + if (lengthInBytes[i] < bytes) + bytes = lengthInBytes[i]; + + if (bytes % 32) { + snow3g_8_buffer_ks_8_multi(bytes, pKey, IV, BufferIn, BufferOut, + lengthInBytes); + } else { + snow3g_8_buffer_ks_32_multi(bytes, pKey, IV, BufferIn, + BufferOut, lengthInBytes); + } +#ifdef SAFE_DATA + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#endif /* AVX2 */ +} + +/*--------------------------------------------------------- + * @description + * Snow3G F8 8 buffer: + * Eight packets enc/dec with the same key schedule. + * The 8 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. + * Uses AVX instructions. + *---------------------------------------------------------*/ +void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufIn1, + void *pBufOut1, + const uint32_t lenInBytes1, + const void *pBufIn2, + void *pBufOut2, + const uint32_t lenInBytes2, + const void *pBufIn3, + void *pBufOut3, + const uint32_t lenInBytes3, + const void *pBufIn4, + void *pBufOut4, + const uint32_t lenInBytes4, + const void *pBufIn5, + void *pBufOut5, + const uint32_t lenInBytes5, + const void *pBufIn6, + void *pBufOut6, + const uint32_t lenInBytes6, + const void *pBufIn7, + void *pBufOut7, + const uint32_t lenInBytes7, + const void *pBufIn8, + void *pBufOut8, + const uint32_t lenInBytes8) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || + (pIV1 == NULL) || (pIV2 == NULL) || + (pIV3 == NULL) || (pIV4 == NULL) || + (pIV5 == NULL) || (pIV6 == NULL) || + (pIV7 == NULL) || (pIV8 == NULL) || + (pBufIn1 == NULL) || (pBufOut1 == NULL) || + (pBufIn2 == NULL) || (pBufOut2 == NULL) || + (pBufIn3 == NULL) || (pBufOut3 == NULL) || + (pBufIn4 == NULL) || (pBufOut4 == NULL) || + (pBufIn5 == NULL) || (pBufOut5 == NULL) || + (pBufIn6 == NULL) || (pBufOut6 == NULL) || + (pBufIn7 == NULL) || (pBufOut7 == NULL) || + (pBufIn8 == NULL) || (pBufOut8 == NULL) || + (lenInBytes1 == 0) || (lenInBytes1 > SNOW3G_MAX_BYTELEN) || + (lenInBytes2 == 0) || (lenInBytes2 > SNOW3G_MAX_BYTELEN) || + (lenInBytes3 == 0) || (lenInBytes3 > SNOW3G_MAX_BYTELEN) || + (lenInBytes4 == 0) || (lenInBytes4 > SNOW3G_MAX_BYTELEN) || + (lenInBytes5 == 0) || (lenInBytes5 > SNOW3G_MAX_BYTELEN) || + (lenInBytes6 == 0) || (lenInBytes6 > SNOW3G_MAX_BYTELEN) || + (lenInBytes7 == 0) || (lenInBytes7 > SNOW3G_MAX_BYTELEN) || + (lenInBytes8 == 0) || (lenInBytes8 > SNOW3G_MAX_BYTELEN)) + return; +#endif + +#ifdef AVX2 + uint32_t bytes1 = + 
(lenInBytes1 < lenInBytes2 ? lenInBytes1 + : lenInBytes2); /* number of bytes */ + uint32_t bytes2 = + (lenInBytes3 < lenInBytes4 ? lenInBytes3 + : lenInBytes4); /* number of bytes */ + uint32_t bytes3 = + (lenInBytes5 < lenInBytes6 ? lenInBytes5 + : lenInBytes6); /* number of bytes */ + uint32_t bytes4 = + (lenInBytes7 < lenInBytes8 ? lenInBytes7 + : lenInBytes8); /* number of bytes */ + uint32_t bytesq1 = + (bytes1 < bytes2) ? bytes1 : bytes2; /* min number of bytes */ + uint32_t bytesq2 = (bytes3 < bytes4) ? bytes3 : bytes4; + uint32_t bytes = (bytesq1 < bytesq2) ? bytesq1 : bytesq2; + + if (bytes % 32) { + snow3g_8_buffer_ks_8( + bytes, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6, + pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2, + pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3, + pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5, + lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7, + pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8); + } else { + snow3g_8_buffer_ks_32( + bytes, pHandle, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6, + pIV7, pIV8, pBufIn1, pBufOut1, lenInBytes1, pBufIn2, + pBufOut2, lenInBytes2, pBufIn3, pBufOut3, lenInBytes3, + pBufIn4, pBufOut4, lenInBytes4, pBufIn5, pBufOut5, + lenInBytes5, pBufIn6, pBufOut6, lenInBytes6, pBufIn7, + pBufOut7, lenInBytes7, pBufIn8, pBufOut8, lenInBytes8); + } +#ifdef SAFE_DATA + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#else /* ~AVX2 */ + SNOW3G_F8_2_BUFFER(pHandle, pIV1, pIV2, pBufIn1, pBufOut1, lenInBytes1, + pBufIn2, pBufOut2, lenInBytes2); + + SNOW3G_F8_2_BUFFER(pHandle, pIV3, pIV4, pBufIn3, pBufOut3, lenInBytes3, + pBufIn4, pBufOut4, lenInBytes4); + + SNOW3G_F8_2_BUFFER(pHandle, pIV5, pIV6, pBufIn5, pBufOut5, lenInBytes5, + pBufIn6, pBufOut6, lenInBytes6); + + SNOW3G_F8_2_BUFFER(pHandle, pIV7, pIV8, pBufIn7, pBufOut7, lenInBytes7, + pBufIn8, pBufOut8, lenInBytes8); +#endif /* AVX */ +} + +/****************************************************************************** + * @description + * Snow3G F8 multi packet: + * Performs F8 enc/dec on [n] packets. The operation is performed in-place. + * The input IV's are passed in Little Endian format. + * The KeySchedule is in Little Endian format. 
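+ * Up to 16 packets are supported; buffers are sorted by decreasing
+ * length and processed in multi-buffer groups before the remainder
+ * is handled one packet at a time.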
+ ******************************************************************************/ +void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufLenInBytes[], + const uint32_t packetCount) +{ +#ifdef SAFE_PARAM + uint32_t i; + + if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) || + (pBufferOut == NULL) || (bufLenInBytes == NULL)) + return; + + for (i = 0; i < packetCount; i++) + if ((IV[i] == NULL) || (pBufferIn[i] == NULL) || + (pBufferOut[i] == NULL) || (bufLenInBytes[i] == 0) || + (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN)) + return; +#endif + if (packetCount > 16) { + pBufferOut[0] = NULL; + printf("packetCount too high (%d)\n", packetCount); + return; + } + + uint32_t packet_index, inner_index, pktCnt = packetCount; + int sortNeeded = 0, tempLen = 0; + uint8_t *srctempbuff; + uint8_t *dsttempbuff; + uint8_t *ivtempbuff; + uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_16] = {0}; + + memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); + memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *)); + memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); + memcpy((void *)pIV, IV, packetCount * sizeof(void *)); + + packet_index = packetCount; + + while (packet_index--) { + + /* check if all packets are sorted by decreasing length */ + if (packet_index > 0 && lensBuf[packet_index - 1] < + lensBuf[packet_index]) { + /* this packet array is not correctly sorted */ + sortNeeded = 1; + } + } + + if (sortNeeded) { + + /* sort packets in decreasing buffer size from [0] to + [n]th packet, ** where buffer[0] will contain longest + buffer and buffer[n] will contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths */ + packet_index = packetCount; + while (packet_index--) { + + inner_index = packet_index; + while (inner_index--) { + + if (lensBuf[packet_index] > + lensBuf[inner_index]) { + + /* swap buffers to arrange in + descending order from [0]. 
*/ + srctempbuff = pSrcBuf[packet_index]; + dsttempbuff = pDstBuf[packet_index]; + ivtempbuff = pIV[packet_index]; + tempLen = lensBuf[packet_index]; + + pSrcBuf[packet_index] = + pSrcBuf[inner_index]; + pDstBuf[packet_index] = + pDstBuf[inner_index]; + pIV[packet_index] = pIV[inner_index]; + lensBuf[packet_index] = + lensBuf[inner_index]; + + pSrcBuf[inner_index] = srctempbuff; + pDstBuf[inner_index] = dsttempbuff; + pIV[inner_index] = ivtempbuff; + lensBuf[inner_index] = tempLen; + } + } /* for inner packet index (inner bubble-sort) */ + } /* for outer packet index (outer bubble-sort) */ + } /* if sortNeeded */ + + packet_index = 0; + /* process 8 buffers at-a-time */ +#ifdef AVX2 + while (pktCnt >= 8) { + pktCnt -= 8; + SNOW3G_F8_8_BUFFER(pCtx, pIV[packet_index], + pIV[packet_index + 1], + pIV[packet_index + 2], + pIV[packet_index + 3], + pIV[packet_index + 4], + pIV[packet_index + 5], + pIV[packet_index + 6], + pIV[packet_index + 7], + pSrcBuf[packet_index], + pDstBuf[packet_index], + lensBuf[packet_index], + pSrcBuf[packet_index + 1], + pDstBuf[packet_index + 1], + lensBuf[packet_index + 1], + pSrcBuf[packet_index + 2], + pDstBuf[packet_index + 2], + lensBuf[packet_index + 2], + pSrcBuf[packet_index + 3], + pDstBuf[packet_index + 3], + lensBuf[packet_index + 3], + pSrcBuf[packet_index + 4], + pDstBuf[packet_index + 4], + lensBuf[packet_index + 4], + pSrcBuf[packet_index + 5], + pDstBuf[packet_index + 5], + lensBuf[packet_index + 5], + pSrcBuf[packet_index + 6], + pDstBuf[packet_index + 6], + lensBuf[packet_index + 6], + pSrcBuf[packet_index + 7], + pDstBuf[packet_index + 7], + lensBuf[packet_index + 7]); + packet_index += 8; + } +#endif + /* process 4 buffers at-a-time */ + while (pktCnt >= 4) { + pktCnt -= 4; + SNOW3G_F8_4_BUFFER(pCtx, pIV[packet_index + 0], + pIV[packet_index + 1], + pIV[packet_index + 2], + pIV[packet_index + 3], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0], + pSrcBuf[packet_index + 1], + pDstBuf[packet_index + 1], + lensBuf[packet_index + 1], + pSrcBuf[packet_index + 2], + pDstBuf[packet_index + 2], + lensBuf[packet_index + 2], + pSrcBuf[packet_index + 3], + pDstBuf[packet_index + 3], + lensBuf[packet_index + 3]); + packet_index += 4; + } + + /* process 2 packets at-a-time */ + while (pktCnt >= 2) { + pktCnt -= 2; + SNOW3G_F8_2_BUFFER(pCtx, pIV[packet_index + 0], + pIV[packet_index + 1], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0], + pSrcBuf[packet_index + 1], + pDstBuf[packet_index + 1], + lensBuf[packet_index + 1]); + packet_index += 2; + } + + /* remaining packets are processed 1 at a time */ + while (pktCnt--) { + SNOW3G_F8_1_BUFFER(pCtx, pIV[packet_index + 0], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0]); + packet_index++; + } +} + +void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufLenInBytes[], + const uint32_t packetCount) +{ +#ifdef SAFE_PARAM + uint32_t i; + + if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) || + (pBufferOut == NULL) || (bufLenInBytes == NULL)) + return; + + for (i = 0; i < packetCount; i++) + if ((pCtx[i] == NULL) || (IV[i] == NULL) || + (pBufferIn[i] == NULL) || (pBufferOut[i] == NULL) || + (bufLenInBytes[i] == 0) || + (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN)) + return; +#endif + if (packetCount > 16) { + pBufferOut[0] = NULL; + printf("packetCount too high (%d)\n", 
packetCount); + return; + } + + uint32_t packet_index, inner_index, pktCnt = packetCount; + int sortNeeded = 0, tempLen = 0; + uint8_t *srctempbuff; + uint8_t *dsttempbuff; + uint8_t *ivtempbuff; + snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_16] = {0}; + snow3g_key_schedule_t *tempCtx; + + memcpy((void *)pCtxBuf, pCtx, packetCount * sizeof(void *)); + memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); + memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *)); + memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); + memcpy((void *)pIV, IV, packetCount * sizeof(void *)); + + packet_index = packetCount; + + while (packet_index--) { + + /* check if all packets are sorted by decreasing length */ + if (packet_index > 0 && lensBuf[packet_index - 1] < + lensBuf[packet_index]) { + /* this packet array is not correctly sorted */ + sortNeeded = 1; + } + } + + if (sortNeeded) { + /* sort packets in decreasing buffer size from [0] to [n]th + packet, where buffer[0] will contain longest buffer and + buffer[n] will contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths */ + packet_index = packetCount; + while (packet_index--) { + inner_index = packet_index; + while (inner_index--) { + if (lensBuf[packet_index] > + lensBuf[inner_index]) { + /* swap buffers to arrange in + descending order from [0]. */ + srctempbuff = pSrcBuf[packet_index]; + dsttempbuff = pDstBuf[packet_index]; + ivtempbuff = pIV[packet_index]; + tempLen = lensBuf[packet_index]; + tempCtx = pCtxBuf[packet_index]; + + pSrcBuf[packet_index] = + pSrcBuf[inner_index]; + pDstBuf[packet_index] = + pDstBuf[inner_index]; + pIV[packet_index] = pIV[inner_index]; + lensBuf[packet_index] = + lensBuf[inner_index]; + pCtxBuf[packet_index] = + pCtxBuf[inner_index]; + + pSrcBuf[inner_index] = srctempbuff; + pDstBuf[inner_index] = dsttempbuff; + pIV[inner_index] = ivtempbuff; + lensBuf[inner_index] = tempLen; + pCtxBuf[inner_index] = tempCtx; + } + } /* for inner packet index (inner bubble-sort) */ + } /* for outer packet index (outer bubble-sort) */ + } /* if sortNeeded */ + + packet_index = 0; + /* process 8 buffers at-a-time */ +#ifdef AVX2 + while (pktCnt >= 8) { + pktCnt -= 8; + SNOW3G_F8_8_BUFFER_MULTIKEY( + (const snow3g_key_schedule_t * const *) + &pCtxBuf[packet_index], + (const void * const *)&pIV[packet_index], + (const void * const *)&pSrcBuf[packet_index], + (void **)&pDstBuf[packet_index], + &lensBuf[packet_index]); + packet_index += 8; + } +#endif + /* TODO process 4 buffers at-a-time */ + /* TODO process 2 packets at-a-time */ + /* remaining packets are processed 1 at a time */ + while (pktCnt--) { + SNOW3G_F8_1_BUFFER(pCtxBuf[packet_index + 0], + pIV[packet_index + 0], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0]); + packet_index++; + } +} + +/*--------------------------------------------------------- + * @description + * Snow3G F9 1 buffer + * Single buffer digest with IV and precomputed key schedule + *---------------------------------------------------------*/ +void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV == 
NULL) || + (pBufferIn == NULL) || (pDigest == NULL) || + (lengthInBits == 0) || (lengthInBits > SNOW3G_MAX_BITLEN)) + return; +#endif + snow3gKeyState1_t ctx; + uint32_t z[5]; + uint64_t lengthInQwords, E, V, P; + uint64_t i, rem_bits; + const uint64_t *inputBuffer; + + inputBuffer = (const uint64_t *)pBufferIn; + + /* Initialize the snow3g key schedule */ + snow3gStateInitialize_1(&ctx, pHandle, pIV); + + /*Generate 5 keystream words*/ + snow3g_f9_keystream_words(&ctx, &z[0]); + + P = ((uint64_t)z[0] << 32) | ((uint64_t)z[1]); + + lengthInQwords = lengthInBits / 64; + + E = 0; + /* all blocks except the last one */ + for (i = 0; i < lengthInQwords; i++) { + V = BSWAP64(inputBuffer[i]); + E = multiply_and_reduce64(E ^ V, P); + } + + /* last bits of last block if any left */ + rem_bits = lengthInBits % 64; + if (rem_bits) { + /* last bytes, do not go past end of buffer */ + memcpy(&V, &inputBuffer[i], (rem_bits + 7) / 8); + V = BSWAP64(V); + V &= (((uint64_t)-1) << (64 - rem_bits)); /* mask extra bits */ + E = multiply_and_reduce64(E ^ V, P); + } + + /* Multiply by Q */ + E = multiply_and_reduce64(E ^ lengthInBits, + (((uint64_t)z[2] << 32) | ((uint64_t)z[3]))); + + /* Final MAC */ + *(uint32_t *)pDigest = + (uint32_t)BSWAP64(E ^ ((uint64_t)z[4] << 32)); +#ifdef SAFE_DATA + CLEAR_VAR(&E, sizeof(E)); + CLEAR_VAR(&V, sizeof(V)); + CLEAR_VAR(&P, sizeof(P)); + CLEAR_MEM(&z, sizeof(z)); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} + +#endif /* SNOW3G_COMMON_H */ diff --git a/src/spdk/intel-ipsec-mb/include/snow3g_internal.h b/src/spdk/intel-ipsec-mb/include/snow3g_internal.h new file mode 100644 index 000000000..287d60be1 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/snow3g_internal.h @@ -0,0 +1,638 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#ifndef _SNOW3G_INTERNAL_H_ +#define _SNOW3G_INTERNAL_H_ + +#include "intel-ipsec-mb.h" +#include "wireless_common.h" +#include "constant_lookup.h" + +#define MAX_KEY_LEN (16) +#define SNOW3G_4_BYTES (4) +#define SNOW3G_8_BYTES (8) +#define SNOW3G_8_BITS (8) +#define SNOW3G_16_BYTES (16) +#define SNOW3G_16_BITS (16) + +#define SNOW3G_BLOCK_SIZE (8) + +#define SNOW3G_KEY_LEN_IN_BYTES (16) /* 128b */ +#define SNOW3G_IV_LEN_IN_BYTES (16) /* 128b */ + +#define SNOW3GCONSTANT (0x1b) + +/* Range of input data for SNOW3G is from 1 to 2^32 bits */ +#define SNOW3G_MIN_LEN 1 +#define SNOW3G_MAX_BITLEN (UINT32_MAX) +#define SNOW3G_MAX_BYTELEN (UINT32_MAX / 8) + +#define ComplementaryMask64(x) ((~(x) % 64) + 1) +#define ComplementaryMask32(x) ((~(x) % 32) + 1) + +#ifndef SAFE_LOOKUP +/*standard lookup */ +#define SNOW3G_LOOKUP_W0(table, idx, size) \ + table[idx].w0.v +#define SNOW3G_LOOKUP_W1(table, idx, size) \ + table[idx].w1.v +#define SNOW3G_LOOKUP_W2(table, idx, size) \ + table[idx].w2.v +#define SNOW3G_LOOKUP_W3(table, idx, size) \ + table[idx].w3.v +#else +/* contant time lookup */ +#if defined (AVX) || defined (AVX2) +#define SNOW3G_LOOKUP_W0(table, idx, size) \ + ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 0)) +#define SNOW3G_LOOKUP_W1(table, idx, size) \ + ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 8)) +#define SNOW3G_LOOKUP_W2(table, idx, size) \ + ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 16)) +#define SNOW3G_LOOKUP_W3(table, idx, size) \ + ((uint32_t)(LOOKUP64_AVX(table, idx, size) >> 24)) +#else +#define SNOW3G_LOOKUP_W0(table, idx, size) \ + ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 0)) +#define SNOW3G_LOOKUP_W1(table, idx, size) \ + ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 8)) +#define SNOW3G_LOOKUP_W2(table, idx, size) \ + ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 16)) +#define SNOW3G_LOOKUP_W3(table, idx, size) \ + ((uint32_t)(LOOKUP64_SSE(table, idx, size) >> 24)) +#endif /* AVX || AVX2 */ +#endif /* SAFE_LOOKUP */ + +typedef union SafeBuffer { + uint64_t b64; + uint32_t b32[2]; + uint8_t b8[SNOW3G_8_BYTES]; +} SafeBuf; + +typedef struct snow3gKeyState1_s { + /* 16 LFSR stages */ + uint32_t LFSR_S[16]; + /* 3 FSM states */ + uint32_t FSM_R3; + uint32_t FSM_R2; + uint32_t FSM_R1; +} DECLARE_ALIGNED(snow3gKeyState1_t, 16); + +typedef struct snow3gKeyState4_s { + /* 16 LFSR stages */ + __m128i LFSR_X[16]; + /* 3 FSM states */ + __m128i FSM_X[3]; + uint32_t iLFSR_X; + +} snow3gKeyState4_t; + + +#ifdef _WIN32 +#pragma pack(push,1) +#define DECLARE_PACKED_UINT32(x) uint32_t x +#else +#define DECLARE_PACKED_UINT32(x) uint32_t x __attribute__((__packed__)) +#endif + +typedef union snow3gTableEntry_u { + uint64_t v; + struct { + uint8_t shift[3]; + DECLARE_PACKED_UINT32(v); + } w3; + struct { + uint8_t shift[2]; + DECLARE_PACKED_UINT32(v); + } w2; + struct { + uint8_t shift[1]; + DECLARE_PACKED_UINT32(v); + } w1; + struct { + uint8_t shift[4]; + DECLARE_PACKED_UINT32(v); + } w0; +} snow3gTableEntry_t; +#ifdef _WIN32 +#pragma pack(pop) +#endif + +#define rotl32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) + +#define rotr32(x, n) (((x) << (32 - (n))) | ((x) >> (n))) + +#define rotl8(x, n) (((x) << (n)) | ((x) >> (8 - (n)))) + +#define rotr8(x, n) (((x) << (8 - (n))) | ((x) >> (n))) + +/************************************************************************* + * @description - snow3g internal tables + *************************************************************************/ + +extern const 
int snow3g_table_A_mul[256]; +extern const int snow3g_table_A_div[256]; +extern snow3gTableEntry_t snow3g_table_S1[256]; +extern snow3gTableEntry_t snow3g_table_S2[256]; +extern const int S1_T0[256]; +extern const int S1_T1[256]; +extern const int S1_T2[256]; +extern const int S1_T3[256]; +extern const int S2_T0[256]; +extern const int S2_T1[256]; +extern const int S2_T2[256]; +extern const int S2_T3[256]; + +/* ------------------------------------------------------------------- + * combined S-Box processing for reduced instruction dependencies + * + * S1_S2_1 : 2 S-Box , 1 packet at a time + * S1_S2_S3_1 : 3 S-Box at the same time + * + * S1_S2_4 : 2 S-Box , 4 packets at a time + * + * ------------------------------------------------------------------ */ +#ifdef AVX2 +#define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \ + _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1) + +#ifndef _mm256_loadu2_m128i +#define _mm256_loadu2_m128i(hi, lo) \ + _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo)), \ + _mm_loadu_si128((const __m128i *)hi), 1) +#endif /* _mm256_loadu2_m128i */ + +typedef struct snow3gKeyState8_s { + /* 16 LFSR stages */ + __m256i LFSR_X[16]; + /* 3 FSM states */ + __m256i FSM_X[3]; + uint32_t iLFSR_X; + +} snow3gKeyState8_t; + +/* Sbox Snow3g_S1 and Snow3g_S2 with dependency unrolling + * for n in [0..3] + * w[n-1] = k; y[n] = Snow3g_S2(w[n]); k = Snow3g_S1(x[n]) + * + * + */ +#define S1_S2_8(y, w, x, k, l, n) \ + do { \ + uint8_t w0, w1, w2, w3; \ + uint8_t x0, x1, x2, x3; \ + uint32_t ty = l; \ + w3 = _mm256_extract_epi8(w, (4 * n + 0)); \ + w2 = _mm256_extract_epi8(w, (4 * n + 1)); \ + w1 = _mm256_extract_epi8(w, (4 * n + 2)); \ + w0 = _mm256_extract_epi8(w, (4 * n + 3)); \ + l = snow3g_table_S2[w3].w3.v ^ snow3g_table_S2[w2].w2.v ^ \ + snow3g_table_S2[w1].w1.v ^ snow3g_table_S2[w0].w0.v; \ + if (n != 0) \ + w = _mm256_insert_epi32(w, k, (n - 1)); \ + if (n != 0) \ + y = _mm256_insert_epi32(y, ty, (n - 1)); \ + x3 = _mm256_extract_epi8(x, (4 * n + 0)); \ + x2 = _mm256_extract_epi8(x, (4 * n + 1)); \ + x1 = _mm256_extract_epi8(x, (4 * n + 2)); \ + x0 = _mm256_extract_epi8(x, (4 * n + 3)); \ + k = snow3g_table_S1[x3].w3.v ^ snow3g_table_S1[x2].w2.v ^ \ + snow3g_table_S1[x1].w1.v ^ snow3g_table_S1[x0].w0.v; \ + if (n == 7) \ + w = _mm256_insert_epi32(w, k, n); \ + if (n == 7) \ + y = _mm256_insert_epi32(y, l, n); \ + } while (0) +#endif /* AVX2 */ + + +#if defined (NO_AESNI) || defined (SAFE_LOOKUP) +/* help compilers to interleave the + * operations and table access latencies + */ + +/* Sbox Snow3g_S1 and Snow3g_S2, simple C code + * y = Snow3g_S2(w); w = Snow3g_S1(x); + */ +#define S1_S2_1(y, w, x) \ + do { \ + uint32_t w0, w1, w2, w3; \ + uint32_t x0, x1, x2, x3; \ + uint32_t tw, tx; \ + w3 = w & 0xff; \ + x3 = x & 0xff; \ + tw = SNOW3G_LOOKUP_W3(snow3g_table_S2, w3, \ + sizeof(snow3g_table_S2)); \ + tx = SNOW3G_LOOKUP_W3(snow3g_table_S1, x3, \ + sizeof(snow3g_table_S1)); \ + w0 = w >> 24; \ + x0 = x >> 24; \ + tw ^= SNOW3G_LOOKUP_W0(snow3g_table_S2, w0, \ + sizeof(snow3g_table_S2)); \ + tx ^= SNOW3G_LOOKUP_W0(snow3g_table_S1, x0, \ + sizeof(snow3g_table_S1)); \ + w1 = (w >> 16) & 0xff; \ + x1 = (x >> 16) & 0xff; \ + tw ^= SNOW3G_LOOKUP_W1(snow3g_table_S2, w1, \ + sizeof(snow3g_table_S2)); \ + tx ^= SNOW3G_LOOKUP_W1(snow3g_table_S1, x1, \ + sizeof(snow3g_table_S1)); \ + w2 = (w >> 8) & 0xff; \ + x2 = (x >> 8) & 0xff; \ + y = tw ^ SNOW3G_LOOKUP_W2(snow3g_table_S2, w2, \ + sizeof(snow3g_table_S2)); \ + w = tx ^ 
SNOW3G_LOOKUP_W2(snow3g_table_S1, x2, \ + sizeof(snow3g_table_S1)); \ + } while (0) + +/* Sbox Snow3g_S1 and Snow3g_S2, simple C code + * y = Snow3g_S2(w); w = Snow3g_S1(x); u = Snow3g_S1(z); + */ +#define S1_S2_S3_1(y, w, x, u, z) \ + do { \ + unsigned w0, w1, w2, w3; \ + unsigned x0, x1, x2, x3; \ + unsigned z0, z1, z2, z3; \ + uint32_t tw, tx, tz; \ + w3 = w & 0xff; \ + x3 = x & 0xff; \ + z3 = z & 0xff; \ + tw = SNOW3G_LOOKUP_W3(snow3g_table_S2, w3, \ + sizeof(snow3g_table_S2)); \ + tx = SNOW3G_LOOKUP_W3(snow3g_table_S1, x3, \ + sizeof(snow3g_table_S1)); \ + tz = SNOW3G_LOOKUP_W3(snow3g_table_S1, z3, \ + sizeof(snow3g_table_S1)); \ + w0 = w >> 24; \ + x0 = x >> 24; \ + z0 = z >> 24; \ + tw ^= SNOW3G_LOOKUP_W0(snow3g_table_S2, w0, \ + sizeof(snow3g_table_S2)); \ + tx ^= SNOW3G_LOOKUP_W0(snow3g_table_S1, x0, \ + sizeof(snow3g_table_S1)); \ + tz ^= SNOW3G_LOOKUP_W0(snow3g_table_S1, z0, \ + sizeof(snow3g_table_S1)); \ + w1 = (w >> 16) & 0xff; \ + x1 = (x >> 16) & 0xff; \ + z1 = (z >> 16) & 0xff; \ + tw ^= SNOW3G_LOOKUP_W1(snow3g_table_S2, w1, \ + sizeof(snow3g_table_S2)); \ + tx ^= SNOW3G_LOOKUP_W1(snow3g_table_S1, x1, \ + sizeof(snow3g_table_S1)); \ + tz ^= SNOW3G_LOOKUP_W1(snow3g_table_S1, z1, \ + sizeof(snow3g_table_S1)); \ + w2 = (w >> 8) & 0xff; \ + x2 = (x >> 8) & 0xff; \ + z2 = (z >> 8) & 0xff; \ + y = tw ^ SNOW3G_LOOKUP_W2(snow3g_table_S2, w2, \ + sizeof(snow3g_table_S2)); \ + w = tx ^ SNOW3G_LOOKUP_W2(snow3g_table_S1, x2, \ + sizeof(snow3g_table_S1)); \ + u = tz ^ SNOW3G_LOOKUP_W2(snow3g_table_S1, z2, \ + sizeof(snow3g_table_S1)); \ + } while (0) + +/* Sbox Snow3g_S1 and Snow3g_S2 with dependency unrolling + * for n in [0..3] + * w[n-1] = k; y[n] = Snow3g_S2(w[n]); k = Snow3g_S1(x[n]) + * + * + */ +#define S1_S2_4(y, w, x, k, l, n) \ + do { \ + unsigned w0, w1, w2, w3; \ + unsigned x0, x1, x2, x3; \ + uint32_t ty = l; \ + w3 = _mm_extract_epi8(w, (4 * n + 0)); \ + w2 = _mm_extract_epi8(w, (4 * n + 1)); \ + w1 = _mm_extract_epi8(w, (4 * n + 2)); \ + w0 = _mm_extract_epi8(w, (4 * n + 3)); \ + l = SNOW3G_LOOKUP_W3(snow3g_table_S2, w3, \ + sizeof(snow3g_table_S2)) ^ \ + SNOW3G_LOOKUP_W2(snow3g_table_S2, w2, \ + sizeof(snow3g_table_S2)) ^ \ + SNOW3G_LOOKUP_W1(snow3g_table_S2, w1, \ + sizeof(snow3g_table_S2)) ^ \ + SNOW3G_LOOKUP_W0(snow3g_table_S2, w0, \ + sizeof(snow3g_table_S2)); \ + if (n != 0) \ + w = _mm_insert_epi32(w, k, (n - 1)); \ + if (n != 0) \ + y = _mm_insert_epi32(y, ty, (n - 1)); \ + x3 = _mm_extract_epi8(x, (4 * n + 0)); \ + x2 = _mm_extract_epi8(x, (4 * n + 1)); \ + x1 = _mm_extract_epi8(x, (4 * n + 2)); \ + x0 = _mm_extract_epi8(x, (4 * n + 3)); \ + k = SNOW3G_LOOKUP_W3(snow3g_table_S1, x3, \ + sizeof(snow3g_table_S1)) ^ \ + SNOW3G_LOOKUP_W2(snow3g_table_S1, x2, \ + sizeof(snow3g_table_S1)) ^ \ + SNOW3G_LOOKUP_W1(snow3g_table_S1, x1, \ + sizeof(snow3g_table_S1)) ^ \ + SNOW3G_LOOKUP_W0(snow3g_table_S1, x0, \ + sizeof(snow3g_table_S1)); \ + if (n == 3) \ + w = _mm_insert_epi32(w, k, n); \ + if (n == 3) \ + y = _mm_insert_epi32(y, l, n); \ + } while (0) + +#else /* SSE/AVX */ + +/* use AES-NI Rijndael for Snow3G Sbox, overlap the latency + * of AESENC with Snow3g_S2 sbox calculations + */ + +/* Sbox Snow3g_S1 and Snow3g_S2, simple C code + * y = Snow3g_S2(w); w = rijndael Snow3g_S1(x); + */ +#define S1_S2_1(y, w, x) \ + do { \ + __m128i m10, m11; \ + m11 = _mm_cvtsi32_si128(x); \ + m10 = _mm_setzero_si128(); \ + m11 = _mm_shuffle_epi32(m11, 0x0); \ + m11 = _mm_aesenc_si128(m11, m10); \ + y = Snow3g_S2(w); \ + w = _mm_cvtsi128_si32(m11); \ + } while (0) + +/* Sbox Snow3g_S1 
and Snow3g_S2 + * y = Snow3g_S2(w); w = rijndael Snow3g_S1(x); u = rijndael Snow3g_S1(z); + */ +#define S1_S2_S3_1(y, w, x, v, z) \ + do { \ + __m128i m10, m11, m12; \ + m11 = _mm_cvtsi32_si128(x); \ + m10 = _mm_setzero_si128(); \ + m11 = _mm_shuffle_epi32(m11, 0x0); \ + m11 = _mm_aesenc_si128(m11, m10); \ + m12 = _mm_cvtsi32_si128(z); \ + m12 = _mm_shuffle_epi32(m12, 0x0); \ + m12 = _mm_aesenc_si128(m12, m10); \ + y = Snow3g_S2(w); \ + w = _mm_cvtsi128_si32(m11); \ + v = _mm_cvtsi128_si32(m12); \ + } while (0) +/* Sbox Snow3g_S1 and Snow3g_S2 + * for n in [0..3] + * extract packet data + * y = Snow3g_S2(w); w = rijndael Snow3g_S1(x) + * insert the result data + */ +#define S1_S2_4(y, w, x, k, n) \ + do { \ + uint32_t ty; \ + unsigned w0, w1, w2, w3; \ + __m128i m10, m11; \ + m10 = _mm_setzero_si128(); \ + m11 = _mm_shuffle_epi32( \ + x, ((n << 6) | (n << 4) | (n << 2) | (n << 0))); \ + m11 = _mm_aesenc_si128(m11, m10); \ + w3 = _mm_extract_epi8(w, (4 * n + 0)); \ + w2 = _mm_extract_epi8(w, (4 * n + 1)); \ + w1 = _mm_extract_epi8(w, (4 * n + 2)); \ + w0 = _mm_extract_epi8(w, (4 * n + 3)); \ + ty = snow3g_table_S2[w3].w3.v ^ snow3g_table_S2[w1].w1.v ^ \ + snow3g_table_S2[w2].w2.v ^ snow3g_table_S2[w0].w0.v; \ + if (n != 0) \ + w = _mm_insert_epi32(w, k, (n - 1)); \ + k = _mm_cvtsi128_si32(m11); \ + if (n == 3) \ + w = _mm_insert_epi32(w, k, n); \ + y = _mm_insert_epi32(y, ty, n); \ + } while (0) + +#endif /* NO_AESNI || SAFE_LOOKUP */ + +/* ------------------------------------------------------------------- + * Sbox Snow3g_S1 maps a 32bit input to a 32bit output + * ------------------------------------------------------------------ */ +static inline uint32_t Snow3g_S1(uint32_t w) +{ + uint32_t w0, w1, w2, w3; + + w3 = w & 0xff; + w1 = (w >> 16) & 0xff; + w2 = (w >> 8) & 0xff; + w0 = w >> 24; + return snow3g_table_S1[w3].w3.v ^ snow3g_table_S1[w1].w1.v ^ + snow3g_table_S1[w2].w2.v ^ snow3g_table_S1[w0].w0.v; +} + +/* ------------------------------------------------------------------- + * Sbox Snow3g_S2 maps a 32bit input to a 32bit output + * ------------------------------------------------------------------ */ +static inline uint32_t Snow3g_S2(uint32_t w) +{ + uint32_t w0, w1, w2, w3; + + w3 = w & 0xff; + w1 = (w >> 16) & 0xff; + w2 = (w >> 8) & 0xff; + w0 = w >> 24; + + return snow3g_table_S2[w3].w3.v ^ snow3g_table_S2[w1].w1.v ^ + snow3g_table_S2[w2].w2.v ^ snow3g_table_S2[w0].w0.v; +} + +/* ------------------------------------------------------------------- + * LFSR array shift by 1 position + * ------------------------------------------------------------------ */ +static inline void ShiftLFSR_1(snow3gKeyState1_t *pCtx) +{ + uint32_t i; + + for (i = 0; i < 15; i++) + pCtx->LFSR_S[i] = pCtx->LFSR_S[i + 1]; +} + +/* ------------------------------------------------------------------- + * LFSR array shift by 2 positions + * ------------------------------------------------------------------ */ +static inline void ShiftTwiceLFSR_1(snow3gKeyState1_t *pCtx) +{ + int i; + + for (i = 0; i < 14; i++) + pCtx->LFSR_S[i] = pCtx->LFSR_S[i + 2]; +} + +/* ------------------------------------------------------------------- + * ClockFSM function as defined in snow3g standard + * The FSM has 2 input words S5 and S15 from the LFSR + * produces a 32 bit output word F + * ------------------------------------------------------------------ */ +static inline void ClockFSM_1(snow3gKeyState1_t *pCtx, uint32_t *data) +{ + uint32_t F, R; + + F = pCtx->LFSR_S[15] + pCtx->FSM_R1; + R = pCtx->FSM_R3 ^ pCtx->LFSR_S[5]; + 
*data = F ^ pCtx->FSM_R2; + R += pCtx->FSM_R2; + S1_S2_1(pCtx->FSM_R3, pCtx->FSM_R2, pCtx->FSM_R1); + pCtx->FSM_R1 = R; +} + +/* ------------------------------------------------------------------- + * ClockLFSR functin as defined in snow3g standard + * ------------------------------------------------------------------ */ +static inline void ClockLFSR_1(snow3gKeyState1_t *pCtx) +{ + uint32_t V = pCtx->LFSR_S[2]; + uint32_t S0 = pCtx->LFSR_S[0]; + uint32_t S11 = pCtx->LFSR_S[11]; + + V ^= snow3g_table_A_mul[S0 >> 24]; + V ^= snow3g_table_A_div[S11 & 0xff]; + V ^= S0 << 8; + V ^= S11 >> 8; + + ShiftLFSR_1(pCtx); + + pCtx->LFSR_S[15] = V; +} + +/** + ******************************************************************************* + * @description + * This function initializes the key schedule for 1 buffer for snow3g f8/f9. + * + * @param[in] pCtx Context where the scheduled keys are stored + * @param [in] pKeySched Key schedule + * @param [in] pIV IV + * + ******************************************************************************/ +static inline void +snow3gStateInitialize_1(snow3gKeyState1_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV) +{ + uint32_t K, L; + int i; + uint32_t V0, V1; + uint32_t F0, F1; + uint32_t L0, L1, L11, L12; + uint32_t R0, R1; + uint32_t FSM2, FSM3, FSM4; + const uint32_t *pIV32 = pIV; + + /* LFSR initialisation */ + for (i = 0; i < 4; i++) { + K = pKeySched->k[i]; + L = ~K; + pCtx->LFSR_S[i + 4] = K; + pCtx->LFSR_S[i + 12] = K; + pCtx->LFSR_S[i + 0] = L; + pCtx->LFSR_S[i + 8] = L; + } + + pCtx->LFSR_S[15] ^= BSWAP32(pIV32[3]); + pCtx->LFSR_S[12] ^= BSWAP32(pIV32[2]); + pCtx->LFSR_S[10] ^= BSWAP32(pIV32[1]); + pCtx->LFSR_S[9] ^= BSWAP32(pIV32[0]); + + /* FSM initialialization */ + FSM2 = 0x0; + FSM3 = 0x0; + FSM4 = 0x0; + R1 = 0x0; + V1 = pCtx->LFSR_S[15]; + + for (i = 0; i < 16; i++) { + /* clock FSM + clock LFSR + clockFSM + clock LFSR */ + L0 = pCtx->LFSR_S[0]; + L1 = pCtx->LFSR_S[1]; + V0 = pCtx->LFSR_S[2]; + F0 = V1 + R1; /** (s15 + R1) **/ + V1 = pCtx->LFSR_S[3]; + V0 ^= snow3g_table_A_mul[L0 >> 24]; /* MUL(s0,0 ) */ + F0 ^= FSM2; /** (s15 + R1) ^ R2 **/ + V1 ^= snow3g_table_A_mul[L1 >> 24]; + L11 = pCtx->LFSR_S[11]; + L12 = pCtx->LFSR_S[12]; + R0 = FSM3 ^ pCtx->LFSR_S[5]; /*** (R3 ^ s5 ) ***/ + V0 ^= snow3g_table_A_div[L11 & 0xff]; /* DIV(s11,3 )*/ + R0 += FSM2; /*** R2 + (R3 ^ s5 ) ***/ + V1 ^= snow3g_table_A_div[L12 & 0xff]; + V0 ^= L0 << 8; /* (s0,1 || s0,2 || s0,3 || 0x00) */ + V1 ^= L1 << 8; + V0 ^= L11 >> 8; /* (0x00 || s11,0 || s11,1 || s11,2 ) */ + V1 ^= L12 >> 8; + S1_S2_S3_1(FSM3, FSM2, R1, FSM4, R0); + V0 ^= F0; /* ^F */ + R1 = FSM3 ^ pCtx->LFSR_S[6]; + F1 = V0 + R0; + F1 ^= FSM2; + R1 += FSM2; + FSM3 = Snow3g_S2(FSM2); + FSM2 = FSM4; + V1 ^= F1; + + /* shift LFSR twice */ + ShiftTwiceLFSR_1(pCtx); + + pCtx->LFSR_S[14] = V0; + pCtx->LFSR_S[15] = V1; + } + + /* set FSM into scheduling structure */ + pCtx->FSM_R3 = FSM3; + pCtx->FSM_R2 = FSM2; + pCtx->FSM_R1 = R1; +} + +/** + ******************************************************************************* + * @description + * This function generates 5 words of keystream used in the initial stages + * of snow3g F9. 
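+ * The five 32-bit words correspond to the keystream words z1..z5 consumed by
+ * the SNOW 3G f9 (UIA2) integrity algorithm; the first FSM output generated
+ * after state initialization (XX below) is computed only to clock the cipher
+ * and is discarded.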
+ * + * @param[in] pCtx Context where the scheduled + *keys are stored + * @param[in/out] pKeyStream Pointer to the generated keystream + * + ******************************************************************************/ +static inline void snow3g_f9_keystream_words(snow3gKeyState1_t *pCtx, + uint32_t *pKeyStream) +{ + uint32_t F, XX; + int i; + + ClockFSM_1(pCtx, &XX); + ClockLFSR_1(pCtx); + + for (i = 0; i < 5; i++) { + ClockFSM_1(pCtx, &F); + pKeyStream[i] = F ^ pCtx->LFSR_S[0]; + ClockLFSR_1(pCtx); + } +} + +#endif /* _SNOW3G_INTERNAL_H_ */ diff --git a/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm b/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm new file mode 100644 index 000000000..fed12cf4b --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm @@ -0,0 +1,218 @@ +;; +;; Copyright (c) 2012-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%ifndef _TRANSPOSE_AVX2_ASM_ +%define _TRANSPOSE_AVX2_ASM_ + +%include "include/reg_sizes.asm" + +; LOAD ALL 8 LANES FOR 8x8 32-BIT TRANSPOSE +; +; r0-r7 [out] ymm registers which will contain the data to be transposed +; addr0-addr7 [in] pointers to the next 32-byte block of data to be fetch for all 8 lanes +; ptr_offset [in] offset to be applied on all pointers (addr0-addr7) +%macro TRANSPOSE8_U32_LOAD8 17 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%addr0 %9 +%define %%addr1 %10 +%define %%addr2 %11 +%define %%addr3 %12 +%define %%addr4 %13 +%define %%addr5 %14 +%define %%addr6 %15 +%define %%addr7 %16 +%define %%ptr_offset %17 + +; Expected output data +; +; r0 = {e3 e2 e1 e0 a3 a2 a1 a0} +; r1 = {f3 f2 f1 f0 b3 b2 b1 b0} +; r2 = {g3 g2 g1 g0 c3 c2 c1 c0} +; r3 = {h3 h2 h1 h0 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 a7 a6 a5 a4} +; r5 = {f7 f6 f5 f4 b7 b6 b5 b4} +; r6 = {g7 g6 g5 g4 c7 c6 c5 c4} +; r7 = {h7 h6 h5 h4 d7 d6 d5 d4} + + vmovups XWORD(%%r0),[%%addr0+%%ptr_offset] + vmovups XWORD(%%r1),[%%addr1+%%ptr_offset] + vmovups XWORD(%%r2),[%%addr2+%%ptr_offset] + vmovups XWORD(%%r3),[%%addr3+%%ptr_offset] + vmovups XWORD(%%r4),[%%addr0+%%ptr_offset+16] + vmovups XWORD(%%r5),[%%addr1+%%ptr_offset+16] + vmovups XWORD(%%r6),[%%addr2+%%ptr_offset+16] + vmovups XWORD(%%r7),[%%addr3+%%ptr_offset+16] + + vinserti128 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01 + vinserti128 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01 + vinserti128 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01 + vinserti128 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01 + vinserti128 %%r4, %%r4, [%%addr4+%%ptr_offset+16], 0x01 + vinserti128 %%r5, %%r5, [%%addr5+%%ptr_offset+16], 0x01 + vinserti128 %%r6, %%r6, [%%addr6+%%ptr_offset+16], 0x01 + vinserti128 %%r7, %%r7, [%%addr7+%%ptr_offset+16], 0x01 + +%endmacro + +; 8x8 32-BIT TRANSPOSE +; +; Before calling this macro, TRANSPOSE8_U32_LOAD8 must be called. +; +; r0-r3 [in/out] ymm registers containing bytes 0-15 of each 32B block (e.g. ymm0 = [e3-e0 a3-a0]) +; r4-r7 [in/out] ymm registers containing bytes 16-31 of each 32B block (e.g. 
ymm4 = [e4-e7 a4-a7]) +; t0-t1 [clobbered] ymm temporary registers +%macro TRANSPOSE8_U32 10 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 +; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {e3 e2 e1 e0 a3 a2 a1 a0} +; r1 = {f3 f2 f1 f0 b3 b2 b1 b0} +; r2 = {g3 g2 g1 g0 c3 c2 c1 c0} +; r3 = {h3 h2 h1 h0 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 a7 a6 a5 a4} +; r5 = {f7 f6 f5 f4 b7 b6 b5 b4} +; r6 = {g7 g6 g5 g4 c7 c6 c5 c4} +; r7 = {h7 h6 h5 h4 d7 d6 d5 d4} +; +; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} +; + ; process top half (r0..r3) + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {f1 f0 e1 e0 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {f3 f2 e3 e2 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {h1 h0 g1 g0 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {h3 h2 g3 g2 d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} + vshufps %%r2, %%r0, %%r2, 0x88 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} + vshufps %%r0, %%t0, %%t1, 0x88 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} + + ;; process bottom half (r4..r7) + vshufps %%t0, %%r4, %%r5, 0x44 ; t0 = {f5 f4 e5 e4 b5 b4 a5 a4} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 b7 b6 a7 a6} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 d5 d4 c5 c4} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 d7 d6 c7 c6} + + vshufps %%r5, %%t0, %%t1, 0xDD ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} + vshufps %%r7, %%r4, %%r6, 0xDD ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} + vshufps %%r6, %%r4, %%r6, 0x88 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} + vshufps %%r4, %%t0, %%t1, 0x88 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +%endmacro + +; LOAD ALL 4 LANES FOR 4x4 64-BIT TRANSPOSE +; +; r0-r3 [out] ymm registers which will contain the data to be transposed +; addr0-addr3 [in] pointers to the next 32-byte block of data to be fetch for the 4 lanes +; ptr_offset [in] offset to be applied on all pointers (addr0-addr3) +%macro TRANSPOSE4_U64_LOAD4 9 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%addr0 %5 +%define %%addr1 %6 +%define %%addr2 %7 +%define %%addr3 %8 +%define %%ptr_offset %9 + +; Expected output data +; +; r0 = {c1 c0 a1 a0} +; r1 = {d1 d0 b1 b0} +; r2 = {c3 c2 a3 a2} +; r3 = {d3 d2 b3 b2} + + vmovupd XWORD(%%r0),[%%addr0+%%ptr_offset] + vmovupd XWORD(%%r1),[%%addr1+%%ptr_offset] + vmovupd XWORD(%%r2),[%%addr0+%%ptr_offset+16] + vmovupd XWORD(%%r3),[%%addr1+%%ptr_offset+16] + + vinserti128 %%r0, %%r0, [%%addr2+%%ptr_offset], 0x01 + vinserti128 %%r1, %%r1, [%%addr3+%%ptr_offset], 0x01 + vinserti128 %%r2, %%r2, [%%addr2+%%ptr_offset+16], 0x1 + vinserti128 %%r3, %%r3, [%%addr3+%%ptr_offset+16], 0x01 + +%endmacro + +; 4x4 64-BIT TRANSPOSE +; +; Before calling this macro, TRANSPOSE4_U64_LOAD4 must be called. 
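+;
+; Illustrative usage sketch (the register choices and zero offset below are
+; only an example, not taken from any particular caller in this library):
+;      TRANSPOSE4_U64_LOAD4 ymm0, ymm1, ymm2, ymm3, rax, rbx, rcx, rdx, 0
+;      TRANSPOSE4_U64       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
+;      ; transposed row 0 is now in ymm4, row 1 in ymm1, row 2 in ymm5, row 3 in ymm3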
+;
+; This macro takes 4 registers as input (r0-r3)
+; and transposes their content (64-bit elements),
+; outputting the data in registers (o0,r1,o2,r3),
+; using two additional registers
+%macro TRANSPOSE4_U64 6
+%define %%r0 %1 ; [in] ymm register for row 0 input (c0-c1 a1-a0)
+%define %%r1 %2 ; [in/out] ymm register for row 1 input (d0-d1 b1-b0) and output
+%define %%r2 %3 ; [in] ymm register for row 2 input (c3-c2 a3-a2)
+%define %%r3 %4 ; [in/out] ymm register for row 3 input (d3-d2 b3-b2) and output
+%define %%o0 %5 ; [out] ymm register for row 0 output
+%define %%o2 %6 ; [out] ymm register for row 2 output
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {c1 c0 a1 a0}
+; r1 = {d1 d0 b1 b0}
+; r2 = {c3 c2 a3 a2}
+; r3 = {d3 d2 b3 b2}
+;
+; output looks like: {o0 r1 o2 r3}
+; o0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; o2 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+        ; vshufps does not cross the mid-way boundary and hence is cheaper
+        vshufps %%o0, %%r0, %%r1, 0x44 ; o0 = {d0 c0 b0 a0}
+        vshufps %%r1, %%r0, %%r1, 0xEE ; r1 = {d1 c1 b1 a1}
+
+        vshufps %%o2, %%r2, %%r3, 0x44 ; o2 = {d2 c2 b2 a2}
+        vshufps %%r3, %%r2, %%r3, 0xEE ; r3 = {d3 c3 b3 a3}
+%endmacro
+
+%endif ;; _TRANSPOSE_AVX2_ASM_
diff --git a/src/spdk/intel-ipsec-mb/include/transpose_avx512.asm b/src/spdk/intel-ipsec-mb/include/transpose_avx512.asm
new file mode 100644
index 000000000..6937ceb00
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/transpose_avx512.asm
@@ -0,0 +1,497 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;; + +%ifndef _TRANSPOSE_AVX512_ASM_ +%define _TRANSPOSE_AVX512_ASM_ + +%include "include/reg_sizes.asm" + +section .data +default rel +align 64 +PSHUFFLE_TRANSPOSE_MASK1: dq 0x0000000000000000 + dq 0x0000000000000001 + dq 0x0000000000000008 + dq 0x0000000000000009 + dq 0x0000000000000004 + dq 0x0000000000000005 + dq 0x000000000000000C + dq 0x000000000000000D + +align 64 +PSHUFFLE_TRANSPOSE_MASK2: dq 0x0000000000000002 + dq 0x0000000000000003 + dq 0x000000000000000A + dq 0x000000000000000B + dq 0x0000000000000006 + dq 0x0000000000000007 + dq 0x000000000000000E + dq 0x000000000000000F + + +; LOAD FIRST 8 LANES FOR 16x16 32-BIT TRANSPOSE +; +; r0-r15 [out] zmm registers which will contain the data to be transposed +; addr0-addr7 [in] pointers to the next 64-byte block of data to be fetch for the first 8 lanes +; ptr_offset [in] offset to be applied on all pointers (addr0-addr7) +%macro TRANSPOSE16_U32_LOAD_FIRST8 25 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define %%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%addr0 %17 +%define %%addr1 %18 +%define %%addr2 %19 +%define %%addr3 %20 +%define %%addr4 %21 +%define %%addr5 %22 +%define %%addr6 %23 +%define %%addr7 %24 +%define %%ptr_offset %25 + +; Expected output data +; +; r0 = {X X X X X X X X a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {X X X X X X X X b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {X X X X X X X X c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {X X X X X X X X d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {X X X X X X X X e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {X X X X X X X X f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {X X X X X X X X g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {X X X X X X X X h7 h6 h5 h4 h3 h2 h1 h0} +; r8 = {X X X X X X X X a15 a14 a13 a12 a11 a10 a9 a8} +; r9 = {X X X X X X X X b15 b14 b13 b12 b11 b10 b9 b8} +; r10 = {X X X X X X X X c15 c14 c13 c12 c11 c10 c9 c8} +; r11 = {X X X X X X X X d15 d14 d13 d12 d11 d10 d9 d8} +; r12 = {X X X X X X X X e15 e14 e13 e12 e11 e10 e9 e8} +; r13 = {X X X X X X X X f15 f14 f13 f12 f11 f10 f9 f8} +; r14 = {X X X X X X X X g15 g14 g13 g12 g11 g10 g9 g8} +; r15 = {X X X X X X X X h15 h14 h13 h12 h11 h10 h9 h8} + vmovups YWORD(%%r0),[%%addr0+%%ptr_offset] + vmovups YWORD(%%r1),[%%addr1+%%ptr_offset] + vmovups YWORD(%%r2),[%%addr2+%%ptr_offset] + vmovups YWORD(%%r3),[%%addr3+%%ptr_offset] + vmovups YWORD(%%r4),[%%addr4+%%ptr_offset] + vmovups YWORD(%%r5),[%%addr5+%%ptr_offset] + vmovups YWORD(%%r6),[%%addr6+%%ptr_offset] + vmovups YWORD(%%r7),[%%addr7+%%ptr_offset] + vmovups YWORD(%%r8),[%%addr0+%%ptr_offset+32] + vmovups YWORD(%%r9),[%%addr1+%%ptr_offset+32] + vmovups YWORD(%%r10),[%%addr2+%%ptr_offset+32] + vmovups YWORD(%%r11),[%%addr3+%%ptr_offset+32] + vmovups YWORD(%%r12),[%%addr4+%%ptr_offset+32] + vmovups YWORD(%%r13),[%%addr5+%%ptr_offset+32] + vmovups YWORD(%%r14),[%%addr6+%%ptr_offset+32] + vmovups YWORD(%%r15),[%%addr7+%%ptr_offset+32] + +%endmacro + +; LOAD LAST 8 LANES FOR 16x16 32-BIT TRANSPOSE +; +; r0-r15 [in/out] zmm registers which will contain the data to be transposed +; addr0-addr7 [in] pointers to the next 64-byte block of data to be fetch for the last 8 lanes +; ptr_offset [in] offset to be applied on all pointers (addr0-addr7) +%macro TRANSPOSE16_U32_LOAD_LAST8 25 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define 
%%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%addr0 %17 +%define %%addr1 %18 +%define %%addr2 %19 +%define %%addr3 %20 +%define %%addr4 %21 +%define %%addr5 %22 +%define %%addr6 %23 +%define %%addr7 %24 +%define %%ptr_offset %25 + +; Expected output data +; +; r0 = {i7 i6 i5 i4 i3 i2 i1 i0 a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {j7 j6 j5 j4 j3 j2 j1 j0 b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {k7 k6 k5 k4 k3 k2 k1 k0 c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {l7 l6 l5 l4 l3 l2 l1 l0 d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {m7 m6 m5 m4 m3 m2 m1 m0 e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {n7 n6 n5 n4 n3 n2 n1 n0 f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {o7 o6 o5 o4 o3 o2 o1 o0 g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {p7 p6 p5 p4 p3 p2 p1 p0 h7 h6 h5 h4 h3 h2 h1 h0} +; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 a15 a14 a13 a12 a11 a10 a9 a8} +; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 b15 b14 b13 b12 b11 b10 b9 b8} +; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 c15 c14 c13 c12 c11 c10 c9 c8} +; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 d15 d14 d13 d12 d11 d10 d9 d8} +; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 e15 e14 e13 e12 e11 e10 e9 e8} +; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 f15 f14 f13 f12 f11 f10 f9 f8} +; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 g15 g14 g13 g12 g11 g10 g9 g8} +; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 h15 h14 h13 h12 h11 h10 h9 h8} + + vinserti64x4 %%r0, %%r0, [%%addr0+%%ptr_offset], 0x01 + vinserti64x4 %%r1, %%r1, [%%addr1+%%ptr_offset], 0x01 + vinserti64x4 %%r2, %%r2, [%%addr2+%%ptr_offset], 0x01 + vinserti64x4 %%r3, %%r3, [%%addr3+%%ptr_offset], 0x01 + vinserti64x4 %%r4, %%r4, [%%addr4+%%ptr_offset], 0x01 + vinserti64x4 %%r5, %%r5, [%%addr5+%%ptr_offset], 0x01 + vinserti64x4 %%r6, %%r6, [%%addr6+%%ptr_offset], 0x01 + vinserti64x4 %%r7, %%r7, [%%addr7+%%ptr_offset], 0x01 + vinserti64x4 %%r8, %%r8, [%%addr0+%%ptr_offset+32], 0x01 + vinserti64x4 %%r9, %%r9, [%%addr1+%%ptr_offset+32], 0x01 + vinserti64x4 %%r10, %%r10, [%%addr2+%%ptr_offset+32], 0x01 + vinserti64x4 %%r11, %%r11, [%%addr3+%%ptr_offset+32], 0x01 + vinserti64x4 %%r12, %%r12, [%%addr4+%%ptr_offset+32], 0x01 + vinserti64x4 %%r13, %%r13, [%%addr5+%%ptr_offset+32], 0x01 + vinserti64x4 %%r14, %%r14, [%%addr6+%%ptr_offset+32], 0x01 + vinserti64x4 %%r15, %%r15, [%%addr7+%%ptr_offset+32], 0x01 + +%endmacro + +; 16x16 32-BIT TRANSPOSE +; +; Before calling this macro, TRANSPOSE16_U32_LOAD_FIRST8 and TRANSPOSE16_U32_LOAD_LAST8 +; must be called. +; +; r0-r7 [in/out] zmm registers containing bytes 0-31 of each 64B block (e.g. zmm0 = [i7-i0 a7-a0]) +; r8-r15 [in/out] zmm registers containing bytes 32-63 of each 64B block (e.g. 
zmm8 = [i15-i8 a15-a8]) +; t0-t1 [clobbered] zmm temporary registers +; m0-m1 [clobbered] zmm registers for shuffle mask storing +%macro TRANSPOSE16_U32 20 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%r8 %9 +%define %%r9 %10 +%define %%r10 %11 +%define %%r11 %12 +%define %%r12 %13 +%define %%r13 %14 +%define %%r14 %15 +%define %%r15 %16 +%define %%t0 %17 +%define %%t1 %18 +%define %%m0 %19 +%define %%m1 %20 + +; Input data +; +; r0 = {i7 i6 i5 i4 i3 i2 i1 i0 a7 a6 a5 a4 a3 a2 a1 a0} +; r1 = {j7 j6 j5 j4 j3 j2 j1 j0 b7 b6 b5 b4 b3 b2 b1 b0} +; r2 = {k7 k6 k5 k4 k3 k2 k1 k0 c7 c6 c5 c4 c3 c2 c1 c0} +; r3 = {l7 l6 l5 l4 l3 l2 l1 l0 d7 d6 d5 d4 d3 d2 d1 d0} +; r4 = {m7 m6 m5 m4 m3 m2 m1 m0 e7 e6 e5 e4 e3 e2 e1 e0} +; r5 = {n7 n6 n5 n4 n3 n2 n1 n0 f7 f6 f5 f4 f3 f2 f1 f0} +; r6 = {o7 o6 o5 o4 o3 o2 o1 o0 g7 g6 g5 g4 g3 g2 g1 g0} +; r7 = {p7 p6 p5 p4 p3 p2 p1 p0 h7 h6 h5 h4 h3 h2 h1 h0} +; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 a15 a14 a13 a12 a11 a10 a9 a8} +; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 b15 b14 b13 b12 b11 b10 b9 b8} +; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 c15 c14 c13 c12 c11 c10 c9 c8} +; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 d15 d14 d13 d12 d11 d10 d9 d8} +; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 e15 e14 e13 e12 e11 e10 e9 e8} +; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 f15 f14 f13 f12 f11 f10 f9 f8} +; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 g15 g14 g13 g12 g11 g10 g9 g8} +; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 h15 h14 h13 h12 h11 h10 h9 h8} + +; Expected output data +; +; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} +; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} +; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} +; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} +; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} +; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} +; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} +; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} +; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + + + ; process first 4 rows (r0..r3) + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {j5 j4 i5 i4 j1 j0 i1 i0 b5 b4 a5 a4 b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {j7 j6 i7 i6 j3 j2 i3 i2 b7 b6 a7 a6 b3 b2 a3 a2} + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {l5 l4 k5 k4 l1 l0 k1 k0 d5 d4 c5 c4 d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {l7 l6 k7 k6 l3 l2 k3 k2 d7 d6 c7 c6 d3 d2 c3 c2} + + vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {l5 k5 j5 i5 l1 k1 j1 i1 d5 c5 b5 a5 d1 c1 b1 a1} + vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {l6 k6 j6 i6 l2 k2 j2 i2 d6 c6 b6 a6 d2 c2 b2 a2} + vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {l7 k7 j7 i7 l3 k3 j3 i3 d7 c7 b7 a7 d3 c3 b3 a3} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {l4 k4 j4 i4 l0 k0 j0 i0 d4 c4 b4 a4 d0 c0 b0 a0} + + ; Load permute masks + vmovdqa64 %%m0, [PSHUFFLE_TRANSPOSE_MASK1] + vmovdqa64 %%m1, [PSHUFFLE_TRANSPOSE_MASK2] + + ; process second 4 rows (r4..r7) + vshufps %%r2, %%r4, 
%%r5, 0x44 ; r2 = {n5 n4 m5 m4 n1 n0 m1 m0 f5 f4 e5 e4 f1 f0 e1 e0} + vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {n7 n6 m7 m6 n3 n2 m3 m2 f7 f6 e7 e6 f3 f2 e3 e2} + vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {p5 p4 o5 o4 p1 p0 o1 o0 h5 h4 g5 g4 h1 h0 g1 g0} + vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {p7 p6 o7 o6 p3 p2 o3 o2 h7 h6 g7 g6 h3 h2 g3 g2} + + vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {p5 o5 n5 m5 p1 o1 n1 m1 h5 g5 f5 e5 h1 g1 f1 e1} + vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {p6 o6 n6 m6 p2 o2 n2 m2 h6 g6 f6 e6 h2 g2 f2 e2} + vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {p7 o7 n7 m7 p3 o3 n3 m3 h7 g7 f7 e7 h3 g3 f3 e3} + vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {p4 o4 n4 m4 p0 o0 n0 m0 h4 g4 f4 e4 h0 g0 f0 e0} + + ; process third 4 rows (r8..r11) + vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 b13 b12 a13 a12 b9 b8 a9 a8 } + vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 b15 b14 a15 a14 b11 b10 a11 a10} + vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 d13 d12 c13 c12 d9 d8 c9 c8 } + vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 d15 d14 c15 c14 d11 d10 c11 c10} + + vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 d13 c13 b13 a13 d9 c9 b9 a9 } + vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 d14 c14 b14 a14 d10 c10 b10 a10} + vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 d15 c15 b15 a15 d11 c11 b11 a11} + vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 d12 c12 b12 a12 d8 c8 b8 a8 } + + ; process fourth 4 rows (r12..r15) + vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 f13 f12 e13 e12 f9 f8 e9 e8 } + vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 f15 f14 e15 e14 f11 f10 e11 e10} + vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 h13 h12 g13 g12 h9 h8 g9 g8 } + vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 h15 h14 g15 g14 h11 h10 g11 g10} + + vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 h13 g13 f13 e13 h9 g9 f9 e9 } + vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 h14 g14 f14 e14 h10 g10 f10 e10} + vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 h15 g15 f15 e15 h11 g11 f11 e11} + vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 h12 g12 f12 e12 h8 g8 f8 e8 } + + ; perform final shuffles on bottom half, producing r8-r15 + vmovdqu32 %%t1, %%m0 + vpermi2q %%t1, %%r9, %%r13 ; t1 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} + vmovdqu32 %%r14, %%m1 + vpermi2q %%r14, %%r9, %%r13 ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} + + vmovdqu32 %%r9, %%m0 + vpermi2q %%r9, %%r11, %%r15 ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} + vmovdqu32 %%r13, %%m1 + vpermi2q %%r13, %%r11, %%r15 ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} + + vmovdqu32 %%r11, %%m0 + vpermi2q %%r11, %%r8, %%r12 ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} + vmovdqu32 %%r15, %%m1 + vpermi2q %%r15, %%r8, %%r12 ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} + + vmovdqu32 %%r8, %%m0 + vpermi2q %%r8, %%r6, %%r10 ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} + vmovdqu32 %%r12, %%m1 + vpermi2q %%r12, %%r6, %%r10 ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} + + vmovdqu32 %%r10, %%t1 ; r10 = {p10 o10 
n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} + + ; perform final shuffles on top half, producing r0-r7 + vmovdqu32 %%t1, %%m0 + vpermi2q %%t1, %%r1, %%r5 ; t1 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + vmovdqu32 %%r6, %%m1 + vpermi2q %%r6, %%r1, %%r5 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} + + vmovdqu32 %%r1, %%m0 + vpermi2q %%r1, %%r3, %%r7 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} + vmovdqu32 %%r5, %%m1 + vpermi2q %%r5, %%r3, %%r7 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} + + vmovdqu32 %%r3, %%m0 + vpermi2q %%r3, %%r0, %%r4 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} + vmovdqu32 %%r7, %%m1 + vpermi2q %%r7, %%r0, %%r4 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} + + vmovdqu32 %%r0, %%m0 + vpermi2q %%r0, %%t0, %%r2 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} + vmovdqu32 %%r4, %%m1 + vpermi2q %%r4, %%t0, %%r2 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} + + vmovdqu32 %%r2, %%t1 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} + +%endmacro + +; LOAD ALL 8 LANES FOR 8x8 64-BIT TRANSPOSE +; +; r0-r7 [out] zmm registers which will contain the data to be transposed +; addr0-addr7 [in] pointers to the next 64-byte block of data to be fetch for all 8 lanes +; ptr_offset [in] offset to be applied on all pointers (addr0-addr7) +%macro TRANSPOSE8_U64_LOAD8 17 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%addr0 %9 +%define %%addr1 %10 +%define %%addr2 %11 +%define %%addr3 %12 +%define %%addr4 %13 +%define %%addr5 %14 +%define %%addr6 %15 +%define %%addr7 %16 +%define %%ptr_offset %17 + +; Expected output data +; +; r0 = {e3 e2 e1 e0 a3 a2 a1 a0} +; r1 = {f3 f2 f1 f0 b3 b2 b1 b0} +; r2 = {g3 g2 g1 g0 c3 c2 c1 c0} +; r3 = {h3 h2 h1 h0 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 a7 a6 a5 a4} +; r5 = {f7 f6 f5 f4 b7 b6 b5 b4} +; r6 = {g7 g6 g5 g4 c7 c6 c5 c4} +; r7 = {h7 h6 h5 h4 d7 d6 d5 d4} + + vmovups YWORD(%%r0),[%%addr0+%%ptr_offset] + vmovups YWORD(%%r1),[%%addr1+%%ptr_offset] + vmovups YWORD(%%r2),[%%addr2+%%ptr_offset] + vmovups YWORD(%%r3),[%%addr3+%%ptr_offset] + vmovups YWORD(%%r4),[%%addr0+%%ptr_offset+32] + vmovups YWORD(%%r5),[%%addr1+%%ptr_offset+32] + vmovups YWORD(%%r6),[%%addr2+%%ptr_offset+32] + vmovups YWORD(%%r7),[%%addr3+%%ptr_offset+32] + + vinserti64x4 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01 + vinserti64x4 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01 + vinserti64x4 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01 + vinserti64x4 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01 + vinserti64x4 %%r4, %%r4, [%%addr4+%%ptr_offset+32], 0x01 + vinserti64x4 %%r5, %%r5, [%%addr5+%%ptr_offset+32], 0x01 + vinserti64x4 %%r6, %%r6, [%%addr6+%%ptr_offset+32], 0x01 + vinserti64x4 %%r7, %%r7, [%%addr7+%%ptr_offset+32], 0x01 + +%endmacro + +; 8x8 64-BIT TRANSPOSE +; +; Before calling this macro, TRANSPOSE8_U64_LOAD8 must be called. +; +; r0-r3 [in/out] zmm registers containing bytes 0-31 of each 64B block (e.g. zmm0 = [e3-e0 a3-a0]) +; r4-r7 [in/out] zmm registers containing bytes 32-63 of each 64B block (e.g. 
zmm4 = [e4-e7 a4-a7]) +; t0-t1 [clobbered] zmm temporary registers +; PERM_INDEX1-2 [clobbered] zmm registers for shuffle mask storing +%macro TRANSPOSE8_U64 12 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%r4 %5 +%define %%r5 %6 +%define %%r6 %7 +%define %%r7 %8 +%define %%t0 %9 +%define %%t1 %10 +%define %%PERM_INDEX1 %11 +%define %%PERM_INDEX2 %12 + +; each x(i) is 64 bits, 8 * 64 = 512 ==> a full digest length, 64-bit double precision quantities + +; Input data +; +; r0 = {e3 e2 e1 e0 a3 a2 a1 a0} +; r1 = {f3 f2 f1 f0 b3 b2 b1 b0} +; r2 = {g3 g2 g1 g0 c3 c2 c1 c0} +; r3 = {h3 h2 h1 h0 d3 d2 d1 d0} +; r4 = {e7 e6 e5 e4 a7 a6 a5 a4} +; r5 = {f7 f6 f5 f4 b7 b6 b5 b4} +; r6 = {g7 g6 g5 g4 c7 c6 c5 c4} +; r7 = {h7 h6 h5 h4 d7 d6 d5 d4} +; +; Expected output data +; +; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} +; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} +; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} +; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} +; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} +; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} + + ;; ;;; will not get clobbered + vmovdqa32 %%PERM_INDEX1, [PSHUFFLE_TRANSPOSE_MASK1] ; temp + vmovdqa32 %%PERM_INDEX2, [PSHUFFLE_TRANSPOSE_MASK2] ; temp + + ; process top half (r0..r3) + vshufpd %%t0, %%r0, %%r1, 0x00 ; t0 = {f2 e2 f0 e0 b2 a2 b0 a0} + vshufpd %%r1, %%r0, %%r1, 0xFF ; r0 = {f3 e3 f1 e1 b3 a3 b1 a1} + vshufpd %%t1, %%r2, %%r3, 0x00 ; t1 = {h2 g2 h0 g0 d2 c2 d0 c0} + vshufpd %%r2, %%r2, %%r3, 0xFF ; r2 = {h3 g3 h1 g1 d3 c3 d1 c1} + + vmovdqa32 %%r3, %%r1 + vpermt2q %%r1, %%PERM_INDEX1,%%r2 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} + vpermt2q %%r3, %%PERM_INDEX2,%%r2 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} + + vmovdqa32 %%r0, %%t0 + vmovdqa32 %%r2, %%t0 + vpermt2q %%r0, %%PERM_INDEX1,%%t1 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} + vpermt2q %%r2, %%PERM_INDEX2,%%t1 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} + + ; process top bottom (r4..r7) + vshufpd %%t0, %%r4, %%r5, 0x00 ; t0 = {f6 e6 f4 e4 b6 a6 b4 a4} + vshufpd %%r5, %%r4, %%r5, 0xFF ; r0 = {f7 e7 f5 e5 b7 a7 b5 a5} + vshufpd %%t1, %%r6, %%r7, 0x00 ; t1 = {h6 g6 h4 g4 d6 c6 d4 c4} + vshufpd %%r6, %%r6, %%r7, 0xFF ; r2 = {h7 g7 h5 g5 d7 c7 d5 c5} + + vmovdqa32 %%r7, %%r5 + vpermt2q %%r5, %%PERM_INDEX1,%%r6 ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} + vpermt2q %%r7, %%PERM_INDEX2,%%r6 ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} + + vmovdqa32 %%r4, %%t0 + vmovdqa32 %%r6, %%t0 + vpermt2q %%r4, %%PERM_INDEX1,%%t1 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} + vpermt2q %%r6, %%PERM_INDEX2,%%t1 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +%endmacro + +%endif ;; _TRANSPOSE_AVX512_ASM_ diff --git a/src/spdk/intel-ipsec-mb/include/wireless_common.asm b/src/spdk/intel-ipsec-mb/include/wireless_common.asm new file mode 100644 index 000000000..811c2c256 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/wireless_common.asm @@ -0,0 +1,128 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" + +section .data +default rel +align 16 +swap_mask: +db 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04 +db 0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c + +section .text + +; Function which XOR's 64 bytes of the input buffer with 64 bytes of the +; KeyStream, placing the result in the output buffer. +; KeyStream bytes must be swapped on 32 bit boundary before this operation +%macro xor_keystream 1 +%define %%SIMDTYPE %1 ; "SSE" or "AVX" + +%ifidn %%SIMDTYPE, AVX + %define %%MOVDQU vmovdqu + %define %%MOVDQA vmovdqa + %define %%PXOR vpxor + %define %%PSHUFB vpshufb +%else + %define %%MOVDQU movdqu + %define %%MOVDQA movdqa + %define %%PXOR pxor + %define %%PSHUFB pshufb +%endif +%ifdef LINUX + %define %%pIn rdi + %define %%pOut rsi + %define %%pKS rdx +%else + %define %%pIn rcx + %define %%pOut rdx + %define %%pKS r8 + + mov rax, rsp + sub rsp, 48 + and rsp, ~15 + %%MOVDQA [rsp], xmm6 + %%MOVDQA [rsp + 16], xmm7 + %%MOVDQA [rsp + 32], xmm8 +%endif + %define XKEY0 xmm0 + %define XKEY1 xmm1 + %define XKEY2 xmm2 + %define XKEY3 xmm3 + %define XIN0 xmm4 + %define XIN1 xmm5 + %define XIN2 xmm6 + %define XIN3 xmm7 + %define XSHUF xmm8 + + %%MOVDQA XSHUF, [rel swap_mask] + %%MOVDQA XKEY0, [%%pKS] + %%MOVDQA XKEY1, [%%pKS + 16] + %%MOVDQA XKEY2, [%%pKS + 32] + %%MOVDQA XKEY3, [%%pKS + 48] + + %%PSHUFB XKEY0, XSHUF + %%PSHUFB XKEY1, XSHUF + %%PSHUFB XKEY2, XSHUF + %%PSHUFB XKEY3, XSHUF + + %%MOVDQU XIN0, [%%pIn] + %%MOVDQU XIN1, [%%pIn + 16] + %%MOVDQU XIN2, [%%pIn + 32] + %%MOVDQU XIN3, [%%pIn + 48] + + %%PXOR XKEY0, XIN0 + %%PXOR XKEY1, XIN1 + %%PXOR XKEY2, XIN2 + %%PXOR XKEY3, XIN3 + + %%MOVDQU [%%pOut], XKEY0 + %%MOVDQU [%%pOut + 16], XKEY1 + %%MOVDQU [%%pOut + 32], XKEY2 + %%MOVDQU [%%pOut + 48], XKEY3 + +%ifndef LINUX + %%MOVDQA xmm6, [rsp] + %%MOVDQA xmm7, [rsp + 16] + %%MOVDQA xmm8, [rsp + 32] + mov rsp,rax +%endif +%endmacro + +MKGLOBAL(asm_XorKeyStream64B_avx,function,internal) +asm_XorKeyStream64B_avx: + xor_keystream AVX + ret + +MKGLOBAL(asm_XorKeyStream64B_sse,function,internal) +asm_XorKeyStream64B_sse: + xor_keystream SSE + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/include/wireless_common.h b/src/spdk/intel-ipsec-mb/include/wireless_common.h new file mode 100644 index 000000000..a0ba60019 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/wireless_common.h @@ -0,0 +1,216 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel 
Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef _WIRELESS_COMMON_H_ +#define _WIRELESS_COMMON_H_ + +#include +#ifdef LINUX +#include +#else +#include +#endif + +#define NUM_PACKETS_1 1 +#define NUM_PACKETS_2 2 +#define NUM_PACKETS_3 3 +#define NUM_PACKETS_4 4 +#define NUM_PACKETS_8 8 +#define NUM_PACKETS_16 16 + +#ifdef LINUX +#define BSWAP32 __builtin_bswap32 +#define BSWAP64 __builtin_bswap64 +#else +#define BSWAP32 _byteswap_ulong +#define BSWAP64 _byteswap_uint64 +#endif + +typedef union _m128_u { + uint8_t byte[16]; + uint16_t word[8]; + uint32_t dword[4]; + uint64_t qword[2]; + __m128i m; +} m128_t; + +typedef union _m64_u { + uint8_t byte[8]; + uint16_t word[4]; + uint32_t dword[2]; + uint64_t m; +} m64_t; + +static inline uint32_t bswap4(const uint32_t val) +{ + return ((val >> 24) | /**< A*/ + ((val & 0xff0000) >> 8) | /**< B*/ + ((val & 0xff00) << 8) | /**< C*/ + (val << 24)); /**< D*/ +} + +/************************************************************************* +* @description - this function is used to copy the right number of bytes +* from the source to destination buffer +* +* @param pSrc [IN] - pointer to an input Byte array (at least len bytes +* available) +* @param pDst [IN] - pointer to the output buffer (at least len bytes available) +* @param len [IN] - length in bytes to copy (0 to 4) +* +*************************************************************************/ +static inline void memcpy_keystream_32(uint8_t *pDst, + const uint8_t *pSrc, + const uint32_t len) +{ + switch (len) { + case 4: + *(uint32_t *)pDst = *(const uint32_t *)pSrc; + break; + case 3: + pDst[2] = pSrc[2]; + /* fall-through */ + case 2: + pDst[1] = pSrc[1]; + /* fall-through */ + case 1: + pDst[0] = pSrc[0]; + /* fall-through */ + } +} + +/************************************************************************* +* @description - this function is used to XOR the right number of bytes +* from a keystrea and a source into a destination buffer +* +* @param pSrc [IN] - pointer to an input Byte array (at least 
4 bytes available) +* @param pDst [IN] - pointer to the output buffer (at least 4 bytes available) +* @param KS [IN] - 4 bytes of keystream number, must be reversed +* into network byte order before XOR +* +*************************************************************************/ +static inline void xor_keystream_reverse_32(uint8_t *pDst, + const uint8_t *pSrc, + const uint32_t KS) +{ + *(uint32_t *)pDst = (*(const uint32_t *)pSrc) ^ BSWAP32(KS); +} + +/****************************************************************************** + * @description - this function is used to do a keystream operation + * @param pSrc [IN] - pointer to an input Byte array (at least 8 bytes + * available) + * @param pDst [IN] - pointer to the output buffer (at least 8 bytes available) + * @param keyStream [IN] - the Keystream value (8 bytes) + ******************************************************************************/ +static inline const uint8_t * +xor_keystrm_rev(uint8_t *pDst, const uint8_t *pSrc, uint64_t keyStream) +{ + /* default: XOR ONLY, read the input buffer, update the output buffer */ + const uint64_t *pSrc64 = (const uint64_t *)pSrc; + uint64_t *pDst64 = (uint64_t *)pDst; + *pDst64 = *pSrc64 ^ BSWAP64(keyStream); + return (const uint8_t *)(pSrc64 + 1); +} + +/****************************************************************************** + * @description - this function is used to copy the right number of bytes + * from the source to destination buffer + * @param pSrc [IN] - pointer to an input Byte array (at least len bytes + * available) + * @param pDst [IN] - pointer to the output buffer (at least len bytes + * available) + * @param len [IN] - length in bytes to copy + ******************************************************************************/ +static inline void +memcpy_keystrm(uint8_t *pDst, const uint8_t *pSrc, const uint32_t len) +{ + switch (len) { + case 8: + *(uint64_t *)pDst = *(const uint64_t *)pSrc; + break; + case 7: + pDst[6] = pSrc[6]; + /* fall-through */ + case 6: + pDst[5] = pSrc[5]; + /* fall-through */ + case 5: + pDst[4] = pSrc[4]; + /* fall-through */ + case 4: + *(uint32_t *)pDst = *(const uint32_t *)pSrc; + break; + case 3: + pDst[2] = pSrc[2]; + /* fall-through */ + case 2: + pDst[1] = pSrc[1]; + /* fall-through */ + case 1: + pDst[0] = pSrc[0]; + /* fall-through */ + } +} + +/** + ****************************************************************************** + * + * @description + * Definition of the external SSE function that XOR's 64 bytes of input + * with 64 bytes of keystream, swapping keystream bytes every 4 bytes. + * + * @param[in] pIn Pointer to the input buffer + * @param[out] pOut Pointer to the output buffer + * @param[in] pKey Pointer to the new 64 byte keystream + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_XorKeyStream64B_sse(const void *pIn, void *pOut, + const void *pKey); + +/** + ****************************************************************************** + * + * @description + * Definition of the external AVX function that XOR's 64 bytes of input + * with 64 bytes of keystream, swapping keystream bytes every 4 bytes. 
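+ * Internally, each 32-bit word of the keystream is byte-reversed (see the
+ * swap_mask constant in wireless_common.asm) before it is XOR'ed with the
+ * corresponding input word.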
+ * + * @param[in] pIn Pointer to the input buffer + * @param[out] pOut Pointer to the output buffer + * @param[in] pKey Pointer to the new 64 byte keystream + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_XorKeyStream64B_avx(const void *pIn, void *pOut, + const void *pKey); + +#endif /* _WIRELESS_COMMON_H_ */ diff --git a/src/spdk/intel-ipsec-mb/include/zuc_common.asm b/src/spdk/intel-ipsec-mb/include/zuc_common.asm new file mode 100644 index 000000000..4b9cdd3ec --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/zuc_common.asm @@ -0,0 +1,740 @@ +;; +;; Copyright (c) 2009-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "include/os.asm" +%include "include/reg_sizes.asm" + +extern lookup_8bit_sse + + +section .data +default rel +align 64 +S0: +db 0x3e,0x72,0x5b,0x47,0xca,0xe0,0x00,0x33,0x04,0xd1,0x54,0x98,0x09,0xb9,0x6d,0xcb +db 0x7b,0x1b,0xf9,0x32,0xaf,0x9d,0x6a,0xa5,0xb8,0x2d,0xfc,0x1d,0x08,0x53,0x03,0x90 +db 0x4d,0x4e,0x84,0x99,0xe4,0xce,0xd9,0x91,0xdd,0xb6,0x85,0x48,0x8b,0x29,0x6e,0xac +db 0xcd,0xc1,0xf8,0x1e,0x73,0x43,0x69,0xc6,0xb5,0xbd,0xfd,0x39,0x63,0x20,0xd4,0x38 +db 0x76,0x7d,0xb2,0xa7,0xcf,0xed,0x57,0xc5,0xf3,0x2c,0xbb,0x14,0x21,0x06,0x55,0x9b +db 0xe3,0xef,0x5e,0x31,0x4f,0x7f,0x5a,0xa4,0x0d,0x82,0x51,0x49,0x5f,0xba,0x58,0x1c +db 0x4a,0x16,0xd5,0x17,0xa8,0x92,0x24,0x1f,0x8c,0xff,0xd8,0xae,0x2e,0x01,0xd3,0xad +db 0x3b,0x4b,0xda,0x46,0xeb,0xc9,0xde,0x9a,0x8f,0x87,0xd7,0x3a,0x80,0x6f,0x2f,0xc8 +db 0xb1,0xb4,0x37,0xf7,0x0a,0x22,0x13,0x28,0x7c,0xcc,0x3c,0x89,0xc7,0xc3,0x96,0x56 +db 0x07,0xbf,0x7e,0xf0,0x0b,0x2b,0x97,0x52,0x35,0x41,0x79,0x61,0xa6,0x4c,0x10,0xfe +db 0xbc,0x26,0x95,0x88,0x8a,0xb0,0xa3,0xfb,0xc0,0x18,0x94,0xf2,0xe1,0xe5,0xe9,0x5d +db 0xd0,0xdc,0x11,0x66,0x64,0x5c,0xec,0x59,0x42,0x75,0x12,0xf5,0x74,0x9c,0xaa,0x23 +db 0x0e,0x86,0xab,0xbe,0x2a,0x02,0xe7,0x67,0xe6,0x44,0xa2,0x6c,0xc2,0x93,0x9f,0xf1 +db 0xf6,0xfa,0x36,0xd2,0x50,0x68,0x9e,0x62,0x71,0x15,0x3d,0xd6,0x40,0xc4,0xe2,0x0f +db 0x8e,0x83,0x77,0x6b,0x25,0x05,0x3f,0x0c,0x30,0xea,0x70,0xb7,0xa1,0xe8,0xa9,0x65 +db 0x8d,0x27,0x1a,0xdb,0x81,0xb3,0xa0,0xf4,0x45,0x7a,0x19,0xdf,0xee,0x78,0x34,0x60 + +S1: +db 0x55,0xc2,0x63,0x71,0x3b,0xc8,0x47,0x86,0x9f,0x3c,0xda,0x5b,0x29,0xaa,0xfd,0x77 +db 0x8c,0xc5,0x94,0x0c,0xa6,0x1a,0x13,0x00,0xe3,0xa8,0x16,0x72,0x40,0xf9,0xf8,0x42 +db 0x44,0x26,0x68,0x96,0x81,0xd9,0x45,0x3e,0x10,0x76,0xc6,0xa7,0x8b,0x39,0x43,0xe1 +db 0x3a,0xb5,0x56,0x2a,0xc0,0x6d,0xb3,0x05,0x22,0x66,0xbf,0xdc,0x0b,0xfa,0x62,0x48 +db 0xdd,0x20,0x11,0x06,0x36,0xc9,0xc1,0xcf,0xf6,0x27,0x52,0xbb,0x69,0xf5,0xd4,0x87 +db 0x7f,0x84,0x4c,0xd2,0x9c,0x57,0xa4,0xbc,0x4f,0x9a,0xdf,0xfe,0xd6,0x8d,0x7a,0xeb +db 0x2b,0x53,0xd8,0x5c,0xa1,0x14,0x17,0xfb,0x23,0xd5,0x7d,0x30,0x67,0x73,0x08,0x09 +db 0xee,0xb7,0x70,0x3f,0x61,0xb2,0x19,0x8e,0x4e,0xe5,0x4b,0x93,0x8f,0x5d,0xdb,0xa9 +db 0xad,0xf1,0xae,0x2e,0xcb,0x0d,0xfc,0xf4,0x2d,0x46,0x6e,0x1d,0x97,0xe8,0xd1,0xe9 +db 0x4d,0x37,0xa5,0x75,0x5e,0x83,0x9e,0xab,0x82,0x9d,0xb9,0x1c,0xe0,0xcd,0x49,0x89 +db 0x01,0xb6,0xbd,0x58,0x24,0xa2,0x5f,0x38,0x78,0x99,0x15,0x90,0x50,0xb8,0x95,0xe4 +db 0xd0,0x91,0xc7,0xce,0xed,0x0f,0xb4,0x6f,0xa0,0xcc,0xf0,0x02,0x4a,0x79,0xc3,0xde +db 0xa3,0xef,0xea,0x51,0xe6,0x6b,0x18,0xec,0x1b,0x2c,0x80,0xf7,0x74,0xe7,0xff,0x21 +db 0x5a,0x6a,0x54,0x1e,0x41,0x31,0x92,0x35,0xc4,0x33,0x07,0x0a,0xba,0x7e,0x0e,0x34 +db 0x88,0xb1,0x98,0x7c,0xf3,0x3d,0x60,0x6c,0x7b,0xca,0xd3,0x1f,0x32,0x65,0x04,0x28 +db 0x64,0xbe,0x85,0x9b,0x2f,0x59,0x8a,0xd7,0xb0,0x25,0xac,0xaf,0x12,0x03,0xe2,0xf2 + +EK_d: +dw 0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF, +dw 0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +section .text + +%define OFFSET_FR1 (16*4) +%define OFFSET_FR2 (17*4) +%define OFFSET_BRC_X0 (18*4) +%define OFFSET_BRC_X1 (19*4) +%define OFFSET_BRC_X2 (20*4) +%define OFFSET_BRC_X3 (21*4) + +; +; BITS_REORG() +; +; params +; %1 - round number +; uses +; eax, ebx, ecx, edx +; return +; updates r12d, r13d, r14d, r15d +; +%macro BITS_REORG 1 + ; + ; r12d = LFSR_S15 + ; eax = LFSR_S14 + ; r13d = LFSR_S11 + ; ebx = LFSR_S9 + ; r14d = LFSR_S7 + ; ecx = LFSR_S5 + ; r15d = LFSR_S2 + ; edx = 
LFSR_S0 + + mov r12d, [rsi + ((15 + %1) % 16)*4] + mov eax, [rsi + ((14 + %1) % 16)*4] + mov r13d, [rsi + ((11 + %1) % 16)*4] + mov ebx, [rsi + (( 9 + %1) % 16)*4] + mov r14d, [rsi + (( 7 + %1) % 16)*4] + mov ecx, [rsi + (( 5 + %1) % 16)*4] + mov r15d, [rsi + (( 2 + %1) % 16)*4] + mov edx, [rsi + (( 0 + %1) % 16)*4] + + shr r12d, 15 + shl eax, 16 + shl ebx, 1 + shl ecx, 1 + shl edx, 1 + shld r12d, eax, 16 ; BRC_X0 + shld r13d, ebx, 16 ; BRC_X1 + shld r14d, ecx, 16 ; BRC_X2 + shld r15d, edx, 16 ; BRC_X3 +%endmacro + +%macro lookup_single_sbox 3 +%define %%table %1 ; [in] Pointer to table to look up +%define %%idx %2 ; [in] Index to look up +%define %%value %3 ; [out] Returned value from lookup function (rcx, rdx, r8, r9) + +%ifdef SAFE_LOOKUP + ;; Save all registers used in lookup_8bit (xmm0-5, r9,r10) + ;; and registers for param passing and return (4 regs, OS dependent) + ;; (6*16 + 6*8 = 144 bytes) + sub rsp, 144 + + movdqu [rsp], xmm0 + movdqu [rsp + 16], xmm1 + movdqu [rsp + 32], xmm2 + movdqu [rsp + 48], xmm3 + movdqu [rsp + 64], xmm4 + movdqu [rsp + 80], xmm5 + mov [rsp + 96], r9 + mov [rsp + 104], r10 + +%ifdef LINUX + mov [rsp + 112], rdi + mov [rsp + 120], rsi + mov [rsp + 128], rdx + + mov rdi, %%table + mov rsi, %%idx + mov rdx, 256 +%else + mov [rsp + 112], rcx + mov [rsp + 120], rdx + mov [rsp + 128], r8 + mov rcx, %%table + mov rdx, %%idx + mov r8, 256 +%endif + mov [rsp + 136], rax + + call lookup_8bit_sse + + ;; Restore all registers + movdqu xmm0, [rsp] + movdqu xmm1, [rsp + 16] + movdqu xmm2, [rsp + 32] + movdqu xmm3, [rsp + 48] + movdqu xmm4, [rsp + 64] + movdqu xmm5, [rsp + 80] + mov r9, [rsp + 96] + mov r10, [rsp + 104] + +%ifdef LINUX + mov rdi, [rsp + 112] + mov rsi, [rsp + 120] + mov rdx, [rsp + 128] +%else + mov rcx, [rsp + 112] + mov rdx, [rsp + 120] + mov r8, [rsp + 128] +%endif + + ;; Move returned value from lookup function, before restoring rax + mov DWORD(%%value), eax + mov rax, [rsp + 136] + + add rsp, 144 + +%else ;; SAFE_LOOKUP + + movzx DWORD(%%value), BYTE [%%table + %%idx] + +%endif ;; SAFE_LOOKUP +%endmacro + +; +; NONLIN_FUN() +; +; params +; %1 == 1, then calculate W +; uses +; rdi rsi eax rdx edx +; r8d r9d ebx +; return +; eax = W value +; r10d = F_R1 +; r11d = F_R2 +; +%macro NONLIN_FUN 1 + +%if (%1 == 1) + mov eax, r12d + xor eax, r10d + add eax, r11d ; W = (BRC_X0 ^ F_R1) + F_R2 +%endif + lea rdi, [rel S0] + lea rsi, [rel S1] + + add r10d, r13d ; W1= F_R1 + BRC_X1 + xor r11d, r14d ; W2= F_R2 ^ BRC_X2 + + mov rdx, r10 + shld edx, r11d, 16 ; P = (W1 << 16) | (W2 >> 16) + shld r11d, r10d, 16 ; Q = (W2 << 16) | (W1 >> 16) + + mov ebx, edx + mov ecx, edx + mov r8d, edx + mov r9d, edx + + rol ebx, 2 + rol ecx, 10 + rol r8d, 18 + rol r9d, 24 + xor edx, ebx + xor edx, ecx + xor edx, r8d + xor edx, r9d ; U = L1(P) = EDX, hi(RDX)=0 + ; + xor r10, r10 + shld ebx, edx, 24 + shld r8d, edx, 16 + shld r9d, edx, 8 + and rdx, 0xFF + lookup_single_sbox rsi, rdx, rdx + and rbx, 0xFF + lookup_single_sbox rdi, rbx, rbx + and r8, 0xFF + lookup_single_sbox rsi, r8, r8 + and r9, 0xFF + lookup_single_sbox rdi, r9, r9 + shrd r10d, edx, 8 + shrd r10d, ebx, 8 + shrd r10d, r8d, 8 + shrd r10d, r9d, 8 + ; + mov ebx, r11d + mov ecx, r11d + mov r8d, r11d + mov r9d, r11d + rol ebx, 8 + rol ecx, 14 + rol r8d, 22 + rol r9d, 30 + xor r11d, ebx + xor r11d, ecx + xor r11d, r8d + xor r11d, r9d ; V = L2(Q) = ECX, hi(RCX)=0 + ; + shld ebx, r11d, 24 + shld r8d, r11d, 16 + shld r9d, r11d, 8 + and r11, 0xFF + + lookup_single_sbox rsi, r11, r11 + and rbx, 0xFF + lookup_single_sbox rdi, 
rbx, rbx + and r8, 0xFF + lookup_single_sbox rsi, r8, r8 + and r9, 0xFF + lookup_single_sbox rdi, r9, r9 + + shrd r11d, r11d, 8 + + shrd r11d, ebx, 8 + shrd r11d, r8d, 8 + shrd r11d, r9d, 8 +%endmacro + + +; +; LFSR_UPDT() +; +; params +; %1 - round number +; uses +; rax as input (ZERO or W) +; return +; +%macro LFSR_UPDT 1 + ; + ; ebx = LFSR_S0 + ; ecx = LFSR_S4 + ; edx = LFSR_S10 + ; r8d = LFSR_S13 + ; r9d = LFSR_S15 + ;lea rsi, [LFSR_STA] ; moved to calling function + + mov ebx, [rsi + (( 0 + %1) % 16)*4] + mov ecx, [rsi + (( 4 + %1) % 16)*4] + mov edx, [rsi + ((10 + %1) % 16)*4] + mov r8d, [rsi + ((13 + %1) % 16)*4] + mov r9d, [rsi + ((15 + %1) % 16)*4] + + ; Calculate 64-bit LFSR feedback + add rax, rbx + shl rbx, 8 + shl rcx, 20 + shl rdx, 21 + shl r8, 17 + shl r9, 15 + add rax, rbx + add rax, rcx + add rax, rdx + add rax, r8 + add rax, r9 + + ; Reduce it to 31-bit value + mov rbx, rax + and rax, 0x7FFFFFFF + shr rbx, 31 + add rax, rbx + + mov rbx, rax + sub rbx, 0x7FFFFFFF + cmovns rax, rbx + + + ; LFSR_S16 = (LFSR_S15++) = eax + mov [rsi + (( 0 + %1) % 16)*4], eax +%endmacro + + +; +; make_u31() +; +%macro make_u31 4 + +%define %%Rt %1 +%define %%Ke %2 +%define %%Ek %3 +%define %%Iv %4 + xor %%Rt, %%Rt + shrd %%Rt, %%Iv, 8 + shrd %%Rt, %%Ek, 15 + shrd %%Rt, %%Ke, 9 +%endmacro + + +; +; key_expand() +; +%macro key_expand 1 + movzx r8d, byte [pKe + (%1 + 0)] + movzx r9d, word [rbx + ((%1 + 0)*2)] + movzx r10d, byte [pIv + (%1 + 0)] + make_u31 r11d, r8d, r9d, r10d + mov [rax + ((%1 + 0)*4)], r11d + + movzx r12d, byte [pKe + (%1 + 1)] + movzx r13d, word [rbx + ((%1 + 1)*2)] + movzx r14d, byte [pIv + (%1 + 1)] + make_u31 r15d, r12d, r13d, r14d + mov [rax + ((%1 + 1)*4)], r15d +%endmacro + + + +;---------------------------------------------------------------------------------------- +;; +;;extern void Zuc_Initialization(uint8_t* pKey, uint8_t* pIV, uint32_t * pState) +;; +;; WIN64 +;; RCX - pKey +;; RDX - pIV +;; R8 - pState +;; LIN64 +;; RDI - pKey +;; RSI - pIV +;; RDX - pState +;; +align 16 +MKGLOBAL(asm_ZucInitialization,function,internal) +asm_ZucInitialization: + +%ifdef LINUX + %define pKe rdi + %define pIv rsi + %define pState rdx +%else + %define pKe rcx + %define pIv rdx + %define pState r8 +%endif + + ; save the base pointer + push rbp + + ;load stack pointer to rbp and reserve memory in the red zone + mov rbp, rsp + sub rsp, 196 + + ; Save non-volatile registers + mov [rbp - 8], rbx + mov [rbp - 32], r12 + mov [rbp - 40], r13 + mov [rbp - 48], r14 + mov [rbp - 56], r15 +%ifndef LINUX + mov [rbp - 64], rdi + mov [rbp - 72], rsi +%endif + + lea rbx, [rel EK_d] ; load pointer to D + lea rax, [pState] ; load pointer to pState + mov [rbp - 88], pState ; save pointer to pState + + ; Expand key + key_expand 0 + key_expand 2 + key_expand 4 + key_expand 6 + key_expand 8 + key_expand 10 + key_expand 12 + key_expand 14 + + ; Set R1 and R2 to zero + xor r10, r10 + xor r11, r11 + + ; Shift LFSR 32-times, update state variables +%assign N 0 +%rep 32 + mov rdx, [rbp - 88] ; load pointer to pState + lea rsi, [rdx] + + BITS_REORG N + + NONLIN_FUN 1 + shr eax, 1 + + mov rdx, [rbp - 88] ; re-load pointer to pState + lea rsi, [rdx] + + LFSR_UPDT N + +%assign N N+1 +%endrep + + ; And once more, initial round from keygen phase = 33 times + mov rdx, [rbp - 88] ; load pointer to pState + lea rsi, [rdx] + + + BITS_REORG 0 + NONLIN_FUN 0 + xor rax, rax + + mov rdx, [rbp - 88] ; load pointer to pState + lea rsi, [rdx] + + LFSR_UPDT 0 + + mov rdx, [rbp - 88] ; load pointer to pState + lea rsi, [rdx] + + 
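+        ;; Editorial note: each of the 33 rounds above follows the ZUC spec's
+        ;; initialisation procedure. LFSR_UPDT computes
+        ;;   s16 = (2^15*s15 + 2^17*s13 + 2^21*s10 + 2^20*s4 + (1 + 2^8)*s0 + u)
+        ;;         mod (2^31 - 1)
+        ;; with u = W >> 1 during the 32 key-loading rounds, followed by one
+        ;; working-mode round (u = 0) whose output is discarded.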
; Save ZUC's state variables + mov [rsi + (16*4)],r10d ;F_R1 + mov [rsi + (17*4)],r11d ;F_R2 + mov [rsi + (18*4)],r12d ;BRC_X0 + mov [rsi + (19*4)],r13d ;BRC_X1 + mov [rsi + (20*4)],r14d ;BRC_X2 + mov [rsi + (21*4)],r15d ;BRC_X3 + + + ; Restore non-volatile registers + mov rbx, [rbp - 8] + mov r12, [rbp - 32] + mov r13, [rbp - 40] + mov r14, [rbp - 48] + mov r15, [rbp - 56] +%ifndef LINUX + mov rdi, [rbp - 64] + mov rsi, [rbp - 72] +%endif + + ; restore base pointer + mov rsp, rbp + pop rbp + + ret + + +;; +;; void asm_ZucGenKeystream8B(void *pKeystream, ZucState_t *pState); +;; +;; WIN64 +;; RCX - KS (key stream pointer) +;; RDX - STATE (state pointer) +;; LIN64 +;; RDI - KS (key stream pointer) +;; RSI - STATE (state pointer) +;; +align 16 +MKGLOBAL(asm_ZucGenKeystream8B,function,internal) +asm_ZucGenKeystream8B: + +%ifdef LINUX + %define pKS rdi + %define pState rsi +%else + %define pKS rcx + %define pState rdx +%endif + ; save the base pointer + push rbp + + ;load stack pointer to rbp and reserve memory in the red zone + mov rbp, rsp + sub rsp, 196 + + ; Save non-volatile registers + mov [rbp - 8], rbx + mov [rbp - 32], r12 + mov [rbp - 40], r13 + mov [rbp - 48], r14 + mov [rbp - 56], r15 +%ifndef LINUX + mov [rbp - 64], rdi + mov [rbp - 72], rsi +%endif + + + ; Load input keystream pointer parameter in RAX + mov rax, pKS + + ; Restore ZUC's state variables + xor r10, r10 + xor r11, r11 + mov r10d, [pState + OFFSET_FR1] + mov r11d, [pState + OFFSET_FR2] + mov r12d, [pState + OFFSET_BRC_X0] + mov r13d, [pState + OFFSET_BRC_X1] + mov r14d, [pState + OFFSET_BRC_X2] + mov r15d, [pState + OFFSET_BRC_X3] + + ; Store keystream pointer + mov [rbp - 80], rax + + ; Store ZUC State Pointer + mov [rbp - 88], pState + + ; Generate 8B of keystream in 2 rounds +%assign N 1 +%rep 2 + + mov rdx, [rbp - 88] ; load *pState + lea rsi, [rdx] + + BITS_REORG N + NONLIN_FUN 1 + + ;Store the keystream + mov rbx, [rbp - 80] ; load *pkeystream + xor eax, r15d + mov [rbx], eax + add rbx, 4 ; increment the pointer + mov [rbp - 80], rbx ; save pkeystream + + xor rax, rax + + mov rdx, [rbp - 88] ; load *pState + lea rsi, [rdx] + + LFSR_UPDT N + +%assign N N+1 +%endrep + + mov rsi, [rbp - 88] ; load pState + + + ; Save ZUC's state variables + mov [rsi + OFFSET_FR1], r10d + mov [rsi + OFFSET_FR2], r11d + mov [rsi + OFFSET_BRC_X0], r12d + mov [rsi + OFFSET_BRC_X1], r13d + mov [rsi + OFFSET_BRC_X2], r14d + mov [rsi + OFFSET_BRC_X3], r15d + + ; Restore non-volatile registers + mov rbx, [rbp - 8] + mov r12, [rbp - 32] + mov r13, [rbp - 40] + mov r14, [rbp - 48] + mov r15, [rbp - 56] +%ifndef LINUX + mov rdi, [rbp - 64] + mov rsi, [rbp - 72] +%endif + + mov rsp, rbp + pop rbp + + ret + + +;; +;; void asm_ZucGenKeystream64B(uint32_t * pKeystream, uint32_t * pState); +;; +;; WIN64 +;; RCX - KS (key stream pointer) +;; RDX - STATE (state pointer) +;; LIN64 +;; RDI - KS (key stream pointer) +;; RSI - STATE (state pointer) +;; +align 16 +MKGLOBAL(asm_ZucGenKeystream64B,function,internal) +asm_ZucGenKeystream64B: + +%ifdef LINUX + %define pKS rdi + %define pState rsi +%else + %define pKS rcx + %define pState rdx +%endif + ; save the base pointer + push rbp + + ;load stack pointer to rbp and reserve memory in the red zone + mov rbp, rsp + sub rsp, 196 + + ; Save non-volatile registers + mov [rbp - 8], rbx + mov [rbp - 32], r12 + mov [rbp - 40], r13 + mov [rbp - 48], r14 + mov [rbp - 56], r15 +%ifndef LINUX + mov [rbp - 64], rdi + mov [rbp - 72], rsi +%endif + + + ; Load input keystream pointer parameter in RAX + mov rax, pKS + + 
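+        ;; Editorial note: each round below reorganises the LFSR into
+        ;; BRC_X0..BRC_X3 (BITS_REORG), runs the nonlinear function F
+        ;; (NONLIN_FUN) to produce W, and emits one 32-bit keystream word
+        ;;   z = W ^ BRC_X3          (the "xor eax, r15d" in the loop below),
+        ;; so the 16 rounds here yield 64 bytes; asm_ZucGenKeystream8B above
+        ;; uses the same round twice to produce 8 bytes.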
; Restore ZUC's state variables + xor r10, r10 + xor r11, r11 + mov r10d, [pState + OFFSET_FR1] + mov r11d, [pState + OFFSET_FR2] + mov r12d, [pState + OFFSET_BRC_X0] + mov r13d, [pState + OFFSET_BRC_X1] + mov r14d, [pState + OFFSET_BRC_X2] + mov r15d, [pState + OFFSET_BRC_X3] + + ; Store keystream pointer + mov [rbp - 80], rax + + ; Store ZUC State Pointer + mov [rbp - 88], pState + + ; Generate 64B of keystream in 16 rounds +%assign N 1 +%rep 16 + + mov rdx, [rbp - 88] ; load *pState + lea rsi, [rdx] + + BITS_REORG N + NONLIN_FUN 1 + + ;Store the keystream + mov rbx, [rbp - 80] ; load *pkeystream + xor eax, r15d + mov [rbx], eax + add rbx, 4 ; increment the pointer + mov [rbp - 80], rbx ; save pkeystream + + xor rax, rax + + mov rdx, [rbp - 88] ; load *pState + lea rsi, [rdx] + + LFSR_UPDT N + +%assign N N+1 +%endrep + + mov rsi, [rbp - 88] ; load pState + + + ; Save ZUC's state variables + mov [rsi + OFFSET_FR1], r10d + mov [rsi + OFFSET_FR2], r11d + mov [rsi + OFFSET_BRC_X0], r12d + mov [rsi + OFFSET_BRC_X1], r13d + mov [rsi + OFFSET_BRC_X2], r14d + mov [rsi + OFFSET_BRC_X3], r15d + + ; Restore non-volatile registers + mov rbx, [rbp - 8] + mov r12, [rbp - 32] + mov r13, [rbp - 40] + mov r14, [rbp - 48] + mov r15, [rbp - 56] +%ifndef LINUX + mov rdi, [rbp - 64] + mov rsi, [rbp - 72] +%endif + + mov rsp, rbp + pop rbp + + ret + + diff --git a/src/spdk/intel-ipsec-mb/include/zuc_internal.h b/src/spdk/intel-ipsec-mb/include/zuc_internal.h new file mode 100755 index 000000000..525a1604c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/include/zuc_internal.h @@ -0,0 +1,432 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/** + ****************************************************************************** + * @file zuc_internal.h + * + * @description + * This header file defines the internal API's and data types for the + * 3GPP algorithm ZUC. 
+ *
+ *****************************************************************************/
+
+#ifndef ZUC_INTERNAL_H_
+#define ZUC_INTERNAL_H_
+
+#include <stdio.h>
+#include <stdint.h>
+
+#include "intel-ipsec-mb.h"
+#include "immintrin.h"
+#include "include/wireless_common.h"
+
+/* 64 bytes of Keystream will be generated */
+#define ZUC_KEYSTR_LEN (64)
+#define NUM_LFSR_STATES (16)
+#define ZUC_WORD (32)
+
+/* Range of input data for ZUC is from 1 to 65504 bits */
+#define ZUC_MIN_LEN 1
+#define ZUC_MAX_LEN 65504
+
+#ifdef DEBUG
+#ifdef _WIN32
+#define DEBUG_PRINT(_fmt, ...) \
+        fprintf(stderr, "%s()::%d " _fmt , __FUNCTION__, __LINE__, __VA_ARGS__)
+#else
+#define DEBUG_PRINT(_fmt, ...) \
+        fprintf(stderr, "%s()::%d " _fmt , __func__, __LINE__, __VA_ARGS__)
+#endif
+#else
+#define DEBUG_PRINT(_fmt, ...)
+#endif
+
+/**
+ ******************************************************************************
+ * @description
+ *      Macro that loops through a 64-byte keystream and XORs it with the
+ *      input buffer, placing the result in the output buffer.
+ *      Keystream bytes must be swapped on a 32-bit boundary before this
+ *      operation.
+ *
+ *****************************************************************************/
+#define ZUC_XOR_KEYSTREAM(pIn64, pOut64, pKeyStream64) \
+{ \
+        int i = 0; \
+        union SwapBytes_t { \
+                uint64_t l64; \
+                uint32_t w32[2]; \
+        } swapBytes; \
+        /* loop through the key stream and xor 64 bits at a time */ \
+        for (i = 0; i < ZUC_KEYSTR_LEN/8; i++) { \
+                swapBytes.l64 = *pKeyStream64++; \
+                swapBytes.w32[0] = bswap4(swapBytes.w32[0]); \
+                swapBytes.w32[1] = bswap4(swapBytes.w32[1]); \
+                *pOut64++ = *pIn64++ ^ swapBytes.l64; \
+        } \
+}
+
+/**
+ *****************************************************************************
+ * @description
+ *      Packed structure to store the ZUC state for a single packet. *
+ *****************************************************************************/
+typedef struct zuc_state_s {
+        uint32_t lfsrState[16];
+        /**< State registers of the LFSR */
+        uint32_t fR1;
+        /**< register of F */
+        uint32_t fR2;
+        /**< register of F */
+        uint32_t bX0;
+        /**< Output X0 of the bit reorganization */
+        uint32_t bX1;
+        /**< Output X1 of the bit reorganization */
+        uint32_t bX2;
+        /**< Output X2 of the bit reorganization */
+        uint32_t bX3;
+        /**< Output X3 of the bit reorganization */
+} ZucState_t;
+
+/**
+ *****************************************************************************
+ * @description
+ *      Packed structure to store the ZUC state for four packets.
* + *****************************************************************************/ +typedef struct zuc_state_4_s { + uint32_t lfsrState[16][4]; + /**< State registers of the LFSR */ + uint32_t fR1[4]; + /**< register of F */ + uint32_t fR2[4]; + /**< register of F */ + uint32_t bX0[4]; + /**< Output X0 of the bit reorganization for 4 packets */ + uint32_t bX1[4]; + /**< Output X1 of the bit reorganization for 4 packets */ + uint32_t bX2[4]; + /**< Output X2 of the bit reorganization for 4 packets */ + uint32_t bX3[4]; + /**< Output X3 of the bit reorganization for 4 packets */ +} ZucState4_t; + +/** + ***************************************************************************** + * @description + * Structure to store pointers to the 4 keys to be used as input to + * @ref asm_ZucInitialization_4 and @ref asm_ZucGenKeystream64B_4 + *****************************************************************************/ +typedef struct zuc_key_4_s { + const uint8_t *pKey1; + /**< Pointer to 128-bit key for packet 1 */ + const uint8_t *pKey2; + /**< Pointer to 128-bit key for packet 2 */ + const uint8_t *pKey3; + /**< Pointer to 128-bit key for packet 3 */ + const uint8_t *pKey4; + /**< Pointer to 128-bit key for packet 4 */ +} ZucKey4_t; + +/** + ***************************************************************************** + * @description + * Structure to store pointers to the 4 IV's to be used as input to + * @ref asm_ZucInitialization_4 and @ref asm_ZucGenKeystream64B_4 + *****************************************************************************/ +typedef struct zuc_iv_4_s { + const uint8_t *pIv1; + /**< Pointer to 128-bit initialization vector for packet 1 */ + const uint8_t *pIv2; + /**< Pointer to 128-bit initialization vector for packet 2 */ + const uint8_t *pIv3; + /**< Pointer to 128-bit initialization vector for packet 3 */ + const uint8_t *pIv4; + /**< Pointer to 128-bit initialization vector for packet 4 */ +} ZucIv4_t; + +/** + ****************************************************************************** + * + * @description + * Definition of the external function that implements the initialization + * stage of the ZUC algorithm. The function will initialize the state + * for a single packet operation. + * + * @param[in] pKey Pointer to the 128-bit initial key that + * will be used when initializing the ZUC + * state. + * @param[in] pIv Pointer to the 128-bit initial vector that + * will be used when initializing the ZUC + * state. + * @param[in,out] pState Pointer to a ZUC state structure of type + * @ref ZucState_t that will be populated + * with the initialized ZUC state. + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_ZucInitialization(const void *pKey, + const void *pIv, + ZucState_t *pState); + +/** + ****************************************************************************** + * @description + * Definition of the external function that implements the initialization + * stage of the ZUC algorithm for 4 packets. The function will initialize + * the state for 4 individual packets. + * + * @param[in] pKey Pointer to an array of 128-bit initial keys + * that will be used when initializing the ZUC + * state. + * @param[in] pIv Pointer to an array of 128-bit initial + * vectors that will be used when initializing + * the ZUC state. + * @param[in,out] pState Pointer to a ZUC state structure of type + * @ref ZucState4_t that will be populated + * with the initialized ZUC state. 
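+ *
+ * @note Editorial note: initialization loads each 31-bit LFSR cell as
+ *       s[i] = k[i] || d[i] || iv[i] (8-bit key byte, 15-bit constant from
+ *       the EK_d table, 8-bit IV byte) and then runs 32 rounds in which the
+ *       nonlinear output W, shifted right by one bit, is fed back into the
+ *       LFSR (see make_u31/key_expand in zuc_common.asm).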
+ * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_ZucInitialization_4_sse(ZucKey4_t *pKeys, + ZucIv4_t *pIvs, + ZucState4_t *pState); + +IMB_DLL_LOCAL void asm_ZucInitialization_4_avx(ZucKey4_t *pKeys, + ZucIv4_t *pIvs, + ZucState4_t *pState); + +/** + ****************************************************************************** + * + * @description + * Definition of the external function that implements the working + * stage of the ZUC algorithm. The function will generate 64 bytes of + * keystream. + * + * @param[in,out] pKeystream Pointer to an input buffer that will + * contain the generated keystream. + + * @param[in] pState Pointer to a ZUC state structure of type + * @ref ZucState_t + * + * @pre + * A successful call to @ref asm_ZucInitialization to initialize the ZUC + * state. + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_ZucGenKeystream64B(uint32_t *pKeystream, + ZucState_t *pState); + +/** + ****************************************************************************** + * + * @description + * Definition of the external function that implements the working + * stage of the ZUC algorithm. The function will generate 8 bytes of + * keystream. + * + * @param[in,out] pKeystream Pointer to an input buffer that will + * contain the generated keystream. + + * @param[in] pState Pointer to a ZUC state structure of type + * @ref ZucState_t + * + * @pre + * A successful call to @ref asm_ZucInitialization to initialize the ZUC + * state. + * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_ZucGenKeystream8B(void *pKeystream, + ZucState_t *pState); + +/** + ****************************************************************************** + * + * @description + * Definition of the external function that implements the working + * stage of the ZUC algorithm. The function will generate 64 bytes of + * keystream for four packets in parallel. + * + * @param[in] pState Pointer to a ZUC state structure of type + * @ref ZucState4_t + * + * @param[in,out] pKeyStr1 Pointer to an input buffer that will + * contain the generated keystream for packet + * one. + * @param[in,out] pKeyStr2 Pointer to an input buffer that will + * contain the generated keystream for packet + * two. + * @param[in,out] pKeyStr3 Pointer to an input buffer that will + * contain the generated keystream for packet + * three. + * @param[in,out] pKeyStr4 Pointer to an input buffer that will + * contain the generated keystream for packet + * four. + * + * @pre + * A successful call to @ref asm_ZucInitialization_4 to initialize the ZUC + * state. 
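+ *
+ * Illustrative call sequence (editorial sketch; the key, IV and keystream
+ * buffer names below are assumptions, not part of this header):
+ *
+ *      ZucState4_t st4;
+ *      ZucKey4_t keys = { key1, key2, key3, key4 };
+ *      ZucIv4_t ivs = { iv1, iv2, iv3, iv4 };
+ *      uint32_t ks1[16], ks2[16], ks3[16], ks4[16];
+ *
+ *      asm_ZucInitialization_4_sse(&keys, &ivs, &st4);
+ *      asm_ZucGenKeystream64B_4_sse(&st4, ks1, ks2, ks3, ks4);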
+ * + *****************************************************************************/ +IMB_DLL_LOCAL void asm_ZucGenKeystream64B_4_sse(ZucState4_t *pState, + uint32_t *pKeyStr1, + uint32_t *pKeyStr2, + uint32_t *pKeyStr3, + uint32_t *pKeyStr4); + +IMB_DLL_LOCAL void asm_ZucGenKeystream64B_4_avx(ZucState4_t *pState, + uint32_t *pKeyStr1, + uint32_t *pKeyStr2, + uint32_t *pKeyStr3, + uint32_t *pKeyStr4); + +/** + ****************************************************************************** + * @description + * Definition of the external function to update the authentication tag + * based on keystream and data (SSE varient) + * + * @param[in] T Authentication tag + * + * @param[in] ks Pointer to key stream + * + * @param[in] data Pointer to the data + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL uint32_t asm_Eia3Round64BSSE(uint32_t T, const void *ks, + const void *data); + +/** + ****************************************************************************** + * @description + * Definition of the external function to return the authentication + * update value to be XOR'ed with current authentication tag (SSE variant) + * + * @param[in] ks Pointer to key stream + * + * @param[in] data Pointer to the data + * + * @param[in] n_words Number of data bits to be processed + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL uint32_t asm_Eia3RemainderSSE(const void *ks, const void *data, + const uint64_t n_words); + +/** + ****************************************************************************** + * @description + * Definition of the external function to update the authentication tag + * based on keystream and data (AVX variant) + * + * @param[in] T Authentication tag + * + * @param[in] ks Pointer to key stream + * + * @param[in] data Pointer to the data + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL uint32_t asm_Eia3Round64BAVX(uint32_t T, const void *ks, + const void *data); + +/** + ****************************************************************************** + * @description + * Definition of the external function to return the authentication + * update value to be XOR'ed with current authentication tag (AVX variant) + * + * @param[in] ks Pointer to key stream + * + * @param[in] data Pointer to the data + * + * @param[in] n_words Number of data bits to be processed + * + * @pre + * None + * + *****************************************************************************/ +IMB_DLL_LOCAL uint32_t asm_Eia3RemainderAVX(const void *ks, const void *data, + const uint64_t n_words); + + +/* the s-boxes */ +extern const uint8_t S0[256]; +extern const uint8_t S1[256]; + +void zuc_eea3_1_buffer_sse(const void *pKey, const void *pIv, + const void *pBufferIn, void *pBufferOut, + const uint32_t lengthInBytes); + +void zuc_eea3_4_buffer_sse(const void * const pKey[4], + const void * const pIv[4], + const void * const pBufferIn[4], + void *pBufferOut[4], + const uint32_t lengthInBytes[4]); + +void zuc_eea3_n_buffer_sse(const void * const pKey[], const void * const pIv[], + const void * const pBufferIn[], void *pBufferOut[], + const uint32_t lengthInBytes[], + const uint32_t numBuffers); + +void zuc_eia3_1_buffer_sse(const void *pKey, const void *pIv, + const void *pBufferIn, const uint32_t lengthInBits, + uint32_t *pMacI); + +void zuc_eea3_1_buffer_avx(const void *pKey, const void *pIv, + 
const void *pBufferIn, void *pBufferOut, + const uint32_t lengthInBytes); + +void zuc_eea3_4_buffer_avx(const void * const pKey[4], + const void * const pIv[4], + const void * const pBufferIn[4], + void *pBufferOut[4], + const uint32_t lengthInBytes[4]); + +void zuc_eea3_n_buffer_avx(const void * const pKey[], const void * const pIv[], + const void * const pBufferIn[], void *pBufferOut[], + const uint32_t lengthInBytes[], + const uint32_t numBuffers); + +void zuc_eia3_1_buffer_avx(const void *pKey, const void *pIv, + const void *pBufferIn, const uint32_t lengthInBits, + uint32_t *pMacI); + + +#endif /* ZUC_INTERNAL_H_ */ + -- cgit v1.2.3
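Editorial appendix: to help cross-check the assembly in zuc_common.asm against the ZUC specification, the sketch below is a plain-C model of one working-mode keystream round. It is not part of the patch: the helper names (mod31, sbox32, bits_reorg, nonlin_fun, lfsr_update, keystream_word) are invented for illustration, it assumes it is built and linked inside the intel-ipsec-mb tree so that zuc_internal.h and the S0/S1 tables it declares resolve, and it shifts the LFSR physically where the assembly indexes it circularly.

        #include <stdint.h>
        #include <string.h>
        #include "include/zuc_internal.h"      /* ZucState_t, extern S0[], S1[] */

        static uint32_t rotl32(uint32_t x, unsigned n)
        { return (x << n) | (x >> (32 - n)); }

        /* linear transforms L1/L2 (the rol/xor chains in NONLIN_FUN) */
        static uint32_t zuc_L1(uint32_t x)
        { return x ^ rotl32(x, 2) ^ rotl32(x, 10) ^ rotl32(x, 18) ^ rotl32(x, 24); }
        static uint32_t zuc_L2(uint32_t x)
        { return x ^ rotl32(x, 8) ^ rotl32(x, 14) ^ rotl32(x, 22) ^ rotl32(x, 30); }

        /* byte-wise S-box layer S = (S0, S1, S0, S1), most significant byte first */
        static uint32_t sbox32(uint32_t x)
        {
                return ((uint32_t)S0[(x >> 24) & 0xFF] << 24) |
                       ((uint32_t)S1[(x >> 16) & 0xFF] << 16) |
                       ((uint32_t)S0[(x >>  8) & 0xFF] <<  8) |
                        (uint32_t)S1[x & 0xFF];
        }

        /* reduce a sum modulo (2^31 - 1), like the fold + conditional subtract
         * in LFSR_UPDT */
        static uint32_t mod31(uint64_t sum)
        {
                while (sum >> 31)
                        sum = (sum & 0x7FFFFFFFULL) + (sum >> 31);
                return (sum == 0x7FFFFFFFULL) ? 0 : (uint32_t)sum;
        }

        /* BITS_REORG: form the 32-bit words X0..X3 from the 31-bit LFSR cells */
        static void bits_reorg(ZucState_t *st)
        {
                const uint32_t *s = st->lfsrState;

                st->bX0 = ((s[15] >> 15) << 16) | (s[14] & 0xFFFF);
                st->bX1 = ((s[11] & 0xFFFF) << 16) | (s[9] >> 15);
                st->bX2 = ((s[7] & 0xFFFF) << 16) | (s[5] >> 15);
                st->bX3 = ((s[2] & 0xFFFF) << 16) | (s[0] >> 15);
        }

        /* NONLIN_FUN: nonlinear function F, returns W and updates fR1/fR2 */
        static uint32_t nonlin_fun(ZucState_t *st)
        {
                uint32_t w  = (st->bX0 ^ st->fR1) + st->fR2;   /* mod 2^32 */
                uint32_t w1 = st->fR1 + st->bX1;
                uint32_t w2 = st->fR2 ^ st->bX2;

                st->fR1 = sbox32(zuc_L1((w1 << 16) | (w2 >> 16)));
                st->fR2 = sbox32(zuc_L2((w2 << 16) | (w1 >> 16)));
                return w;
        }

        /* LFSR_UPDT: feedback mod (2^31 - 1); u = 0 in working mode */
        static void lfsr_update(ZucState_t *st, uint32_t u)
        {
                uint32_t *s = st->lfsrState;
                uint64_t f = (uint64_t)u + s[0] +
                             ((uint64_t)s[0] << 8)  + ((uint64_t)s[4] << 20) +
                             ((uint64_t)s[10] << 21) + ((uint64_t)s[13] << 17) +
                             ((uint64_t)s[15] << 15);
                uint32_t s16 = mod31(f);

                memmove(&s[0], &s[1], 15 * sizeof(s[0]));      /* shift the register */
                s[15] = s16;
        }

        /* one working-mode round: emits a single 32-bit keystream word */
        static uint32_t keystream_word(ZucState_t *st)
        {
                bits_reorg(st);
                uint32_t z = nonlin_fun(st) ^ st->bX3;
                lfsr_update(st, 0);
                return z;
        }

Calling keystream_word() 16 times on a state produced by asm_ZucInitialization() should reproduce the 64 bytes written by asm_ZucGenKeystream64B(), up to the per-word byte swapping later applied by ZUC_XOR_KEYSTREAM.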